Diffstat (limited to 'test/CodeGen')
-rw-r--r--test/CodeGen/AArch64/adrp-relocation.ll27
-rw-r--r--test/CodeGen/AArch64/alloca.ll39
-rw-r--r--test/CodeGen/AArch64/basic-pic.ll16
-rw-r--r--test/CodeGen/AArch64/cond-sel.ll11
-rw-r--r--test/CodeGen/AArch64/directcond.ll6
-rw-r--r--test/CodeGen/AArch64/elf-extern.ll17
-rw-r--r--test/CodeGen/AArch64/fcvt-int.ll32
-rw-r--r--test/CodeGen/AArch64/fp-dp3.ll7
-rw-r--r--test/CodeGen/AArch64/fp128.ll17
-rw-r--r--test/CodeGen/AArch64/fpimm.ll10
-rw-r--r--test/CodeGen/AArch64/frameaddr.ll20
-rw-r--r--test/CodeGen/AArch64/func-argpassing.ll30
-rw-r--r--test/CodeGen/AArch64/func-calls.ll22
-rw-r--r--test/CodeGen/AArch64/inline-asm-modifiers.ll50
-rw-r--r--test/CodeGen/AArch64/jump-table.ll17
-rw-r--r--test/CodeGen/AArch64/ldst-regoffset.ll19
-rw-r--r--test/CodeGen/AArch64/ldst-unscaledimm.ll5
-rw-r--r--test/CodeGen/AArch64/ldst-unsignedimm.ll5
-rw-r--r--test/CodeGen/AArch64/lit.local.cfg2
-rw-r--r--test/CodeGen/AArch64/literal_pools.ll16
-rw-r--r--test/CodeGen/AArch64/neon-2velem-high.ll331
-rw-r--r--test/CodeGen/AArch64/neon-2velem.ll2550
-rw-r--r--test/CodeGen/AArch64/neon-3vdiff.ll1806
-rw-r--r--test/CodeGen/AArch64/neon-aba-abd.ll12
-rw-r--r--test/CodeGen/AArch64/neon-across.ll476
-rw-r--r--test/CodeGen/AArch64/neon-add-sub.ll121
-rw-r--r--test/CodeGen/AArch64/neon-bsl.ll222
-rw-r--r--test/CodeGen/AArch64/neon-compare-instructions.ll168
-rw-r--r--test/CodeGen/AArch64/neon-copy.ll615
-rw-r--r--test/CodeGen/AArch64/neon-crypto.ll149
-rw-r--r--test/CodeGen/AArch64/neon-diagnostics.ll24
-rw-r--r--test/CodeGen/AArch64/neon-extract.ll190
-rw-r--r--test/CodeGen/AArch64/neon-misc-scalar.ll60
-rw-r--r--test/CodeGen/AArch64/neon-misc.ll1799
-rw-r--r--test/CodeGen/AArch64/neon-mov.ll12
-rw-r--r--test/CodeGen/AArch64/neon-perm.ll1693
-rw-r--r--test/CodeGen/AArch64/neon-rounding-shift.ll17
-rw-r--r--test/CodeGen/AArch64/neon-saturating-add-sub.ll33
-rw-r--r--test/CodeGen/AArch64/neon-saturating-rounding-shift.ll17
-rw-r--r--test/CodeGen/AArch64/neon-saturating-shift.ll17
-rw-r--r--test/CodeGen/AArch64/neon-scalar-abs.ll61
-rw-r--r--test/CodeGen/AArch64/neon-scalar-add-sub.ll50
-rw-r--r--test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll108
-rw-r--r--test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll124
-rw-r--r--test/CodeGen/AArch64/neon-scalar-compare.ll343
-rw-r--r--test/CodeGen/AArch64/neon-scalar-copy.ll88
-rw-r--r--test/CodeGen/AArch64/neon-scalar-cvt.ll137
-rw-r--r--test/CodeGen/AArch64/neon-scalar-extract-narrow.ll104
-rw-r--r--test/CodeGen/AArch64/neon-scalar-fabd.ll26
-rw-r--r--test/CodeGen/AArch64/neon-scalar-fcvt.ll255
-rw-r--r--test/CodeGen/AArch64/neon-scalar-fp-compare.ll328
-rw-r--r--test/CodeGen/AArch64/neon-scalar-mul.ll143
-rw-r--r--test/CodeGen/AArch64/neon-scalar-neg.ll61
-rw-r--r--test/CodeGen/AArch64/neon-scalar-recip.ll116
-rw-r--r--test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll247
-rw-r--r--test/CodeGen/AArch64/neon-scalar-rounding-shift.ll39
-rw-r--r--test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll242
-rw-r--r--test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll94
-rw-r--r--test/CodeGen/AArch64/neon-scalar-saturating-shift.ll88
-rw-r--r--test/CodeGen/AArch64/neon-scalar-shift-imm.ll531
-rw-r--r--test/CodeGen/AArch64/neon-scalar-shift.ll38
-rw-r--r--test/CodeGen/AArch64/neon-shift-left-long.ll193
-rw-r--r--test/CodeGen/AArch64/neon-shift.ll65
-rw-r--r--test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll2314
-rw-r--r--test/CodeGen/AArch64/neon-simd-ldst-one.ll2113
-rw-r--r--test/CodeGen/AArch64/neon-simd-ldst.ll164
-rw-r--r--test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll354
-rw-r--r--test/CodeGen/AArch64/neon-simd-post-ldst-one.ll319
-rw-r--r--test/CodeGen/AArch64/neon-simd-shift.ll1556
-rw-r--r--test/CodeGen/AArch64/neon-simd-tbl.ll828
-rw-r--r--test/CodeGen/AArch64/neon-simd-vget.ll225
-rw-r--r--test/CodeGen/AArch64/regress-fp128-livein.ll17
-rw-r--r--test/CodeGen/AArch64/returnaddr.ll21
-rw-r--r--test/CodeGen/AArch64/tls-dynamics.ll36
-rw-r--r--test/CodeGen/AArch64/variadic.ll66
-rw-r--r--test/CodeGen/ARM/2009-10-16-Scope.ll3
-rw-r--r--test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll4
-rw-r--r--test/CodeGen/ARM/2010-08-04-StackVariable.ll12
-rw-r--r--test/CodeGen/ARM/2010-09-29-mc-asm-header-test.ll267
-rw-r--r--test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll47
-rw-r--r--test/CodeGen/ARM/2010-11-30-reloc-movt.ll27
-rw-r--r--test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll4
-rw-r--r--test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll11
-rw-r--r--test/CodeGen/ARM/2011-04-11-MachineLICMBug.ll7
-rw-r--r--test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll14
-rw-r--r--test/CodeGen/ARM/2011-08-25-ldmia_ret.ll2
-rw-r--r--test/CodeGen/ARM/2011-10-26-memset-inline.ll4
-rw-r--r--test/CodeGen/ARM/2011-10-26-memset-with-neon.ll4
-rw-r--r--test/CodeGen/ARM/2012-08-30-select.ll9
-rw-r--r--test/CodeGen/ARM/2013-02-27-expand-vfma.ll2
-rw-r--r--test/CodeGen/ARM/2013-05-05-IfConvertBug.ll25
-rw-r--r--test/CodeGen/ARM/2013-05-07-ByteLoadSameAddress.ll64
-rw-r--r--test/CodeGen/ARM/2013-10-11-select-stalls.ll16
-rw-r--r--test/CodeGen/ARM/2013-11-08-inline-asm-neon-array.ll16
-rw-r--r--test/CodeGen/ARM/a15-SD-dep.ll4
-rw-r--r--test/CodeGen/ARM/addrspacecast.ll7
-rw-r--r--test/CodeGen/ARM/aliases.ll27
-rw-r--r--test/CodeGen/ARM/arm-and-tst-peephole.ll23
-rw-r--r--test/CodeGen/ARM/atomic-64bit.ll18
-rw-r--r--test/CodeGen/ARM/atomic-load-store.ll2
-rw-r--r--test/CodeGen/ARM/atomic-op.ll25
-rw-r--r--test/CodeGen/ARM/atomic-ops-v8.ll1344
-rw-r--r--test/CodeGen/ARM/build-attributes-encoding.s85
-rw-r--r--test/CodeGen/ARM/byval_load_align.ll27
-rw-r--r--test/CodeGen/ARM/coalesce-dbgvalue.ll24
-rw-r--r--test/CodeGen/ARM/constantfp.ll68
-rw-r--r--test/CodeGen/ARM/dagcombine-concatvector.ll2
-rw-r--r--test/CodeGen/ARM/darwin-eabi.ll24
-rw-r--r--test/CodeGen/ARM/debug-info-arg.ll8
-rw-r--r--test/CodeGen/ARM/debug-info-blocks.ll44
-rw-r--r--test/CodeGen/ARM/debug-info-branch-folding.ll13
-rw-r--r--test/CodeGen/ARM/debug-info-d16-reg.ll6
-rw-r--r--test/CodeGen/ARM/debug-info-qreg.ll18
-rw-r--r--test/CodeGen/ARM/debug-info-s16-reg.ll10
-rw-r--r--test/CodeGen/ARM/debug-info-sreg2.ll6
-rw-r--r--test/CodeGen/ARM/divmod.ll4
-rw-r--r--test/CodeGen/ARM/fast-isel-align.ll38
-rw-r--r--test/CodeGen/ARM/fast-isel-binary.ll6
-rw-r--r--test/CodeGen/ARM/fast-isel-br-phi.ll2
-rw-r--r--test/CodeGen/ARM/fast-isel-call.ll20
-rw-r--r--test/CodeGen/ARM/fast-isel-cmp-imm.ll6
-rw-r--r--test/CodeGen/ARM/fast-isel-conversion.ll6
-rw-r--r--test/CodeGen/ARM/fast-isel-ext.ll14
-rw-r--r--test/CodeGen/ARM/fast-isel-icmp.ll6
-rw-r--r--test/CodeGen/ARM/fast-isel-intrinsic.ll14
-rw-r--r--test/CodeGen/ARM/fast-isel-ldr-str-thumb-neg-index.ll43
-rw-r--r--test/CodeGen/ARM/fast-isel-pic.ll6
-rw-r--r--test/CodeGen/ARM/fast-isel-ret.ll6
-rw-r--r--test/CodeGen/ARM/fast-isel-select.ll14
-rw-r--r--test/CodeGen/ARM/fast-isel-shifter.ll4
-rw-r--r--test/CodeGen/ARM/fast-isel-static.ll8
-rw-r--r--test/CodeGen/ARM/fast-isel.ll6
-rw-r--r--test/CodeGen/ARM/fastisel-gep-promote-before-add.ll18
-rw-r--r--test/CodeGen/ARM/fold-stack-adjust.ll164
-rw-r--r--test/CodeGen/ARM/ifconv-kills.ll30
-rw-r--r--test/CodeGen/ARM/ifconv-regmask.ll35
-rw-r--r--test/CodeGen/ARM/indirectbr.ll4
-rw-r--r--test/CodeGen/ARM/inlineasm-64bit.ll19
-rw-r--r--test/CodeGen/ARM/interrupt-attr.ll130
-rw-r--r--test/CodeGen/ARM/intrinsics-crypto.ll57
-rw-r--r--test/CodeGen/ARM/intrinsics-v8.ll19
-rw-r--r--test/CodeGen/ARM/lit.local.cfg2
-rw-r--r--test/CodeGen/ARM/long_shift.ll8
-rw-r--r--test/CodeGen/ARM/misched-copy-arm.ll6
-rw-r--r--test/CodeGen/ARM/neon-spfp.ll30
-rw-r--r--test/CodeGen/ARM/neon_spill.ll11
-rw-r--r--test/CodeGen/ARM/no-fpu.ll33
-rw-r--r--test/CodeGen/ARM/noreturn.ll50
-rw-r--r--test/CodeGen/ARM/optselect-regclass.ll23
-rw-r--r--test/CodeGen/ARM/pic.ll23
-rw-r--r--test/CodeGen/ARM/prefetch-thumb.ll22
-rw-r--r--test/CodeGen/ARM/reg_sequence.ll5
-rw-r--r--test/CodeGen/ARM/select-imm.ll10
-rw-r--r--test/CodeGen/ARM/select-undef.ll7
-rw-r--r--test/CodeGen/ARM/select.ll10
-rw-r--r--test/CodeGen/ARM/setcc-sentinals.ll14
-rw-r--r--test/CodeGen/ARM/sincos.ll38
-rw-r--r--test/CodeGen/ARM/stack-protector-bmovpcb_call.ll32
-rw-r--r--test/CodeGen/ARM/struct-byval-frame-index.ll52
-rw-r--r--test/CodeGen/ARM/struct_byval.ll47
-rw-r--r--test/CodeGen/ARM/struct_byval_arm_t1_t2.ll1523
-rw-r--r--test/CodeGen/ARM/sub-cmp-peephole.ll60
-rw-r--r--test/CodeGen/ARM/swift-vldm.ll29
-rw-r--r--test/CodeGen/ARM/thumb2-it-block.ll5
-rw-r--r--test/CodeGen/ARM/trap.ll4
-rw-r--r--test/CodeGen/ARM/vadd.ll58
-rw-r--r--test/CodeGen/ARM/vector-DAGCombine.ll32
-rw-r--r--test/CodeGen/ARM/vldm-liveness.ll40
-rw-r--r--test/CodeGen/ARM/vldm-sched-a9.ll71
-rw-r--r--test/CodeGen/ARM/vminmaxnm.ll46
-rw-r--r--test/CodeGen/ARM/vmul.ll29
-rw-r--r--test/CodeGen/ARM/vqdmul.ll104
-rw-r--r--test/CodeGen/ARM/vsel.ll309
-rw-r--r--test/CodeGen/ARM/vstlane.ll2
-rw-r--r--test/CodeGen/ARM/vsub.ll46
-rw-r--r--test/CodeGen/CPP/lit.local.cfg2
-rw-r--r--test/CodeGen/Generic/2009-03-17-LSR-APInt.ll56
-rw-r--r--test/CodeGen/Generic/crash.ll4
-rw-r--r--test/CodeGen/Generic/lit.local.cfg1
-rw-r--r--test/CodeGen/Hexagon/BranchPredict.ll5
-rw-r--r--test/CodeGen/Hexagon/combine_ir.ll15
-rw-r--r--test/CodeGen/Hexagon/hwloop-dbg.ll4
-rw-r--r--test/CodeGen/Hexagon/lit.local.cfg2
-rw-r--r--test/CodeGen/Hexagon/memops.ll509
-rw-r--r--test/CodeGen/Hexagon/union-1.ll8
-rw-r--r--test/CodeGen/Inputs/DbgValueOtherTargets.ll4
-rw-r--r--test/CodeGen/MSP430/cc_args.ll118
-rw-r--r--test/CodeGen/MSP430/cc_ret.ll61
-rw-r--r--test/CodeGen/MSP430/lit.local.cfg2
-rw-r--r--test/CodeGen/MSP430/transient-stack-alignment.ll17
-rw-r--r--test/CodeGen/Mips/2008-07-16-SignExtInReg.ll1
-rw-r--r--test/CodeGen/Mips/2008-08-01-AsmInline.ll18
-rw-r--r--test/CodeGen/Mips/2013-11-18-fp64-const0.ll31
-rw-r--r--test/CodeGen/Mips/beqzc.ll20
-rw-r--r--test/CodeGen/Mips/beqzc1.ll24
-rw-r--r--test/CodeGen/Mips/blockaddr.ll10
-rw-r--r--test/CodeGen/Mips/brdelayslot.ll11
-rw-r--r--test/CodeGen/Mips/brsize3.ll33
-rw-r--r--test/CodeGen/Mips/brsize3a.ll26
-rw-r--r--test/CodeGen/Mips/bswap.ll3
-rw-r--r--test/CodeGen/Mips/buildpairextractelementf64.ll23
-rw-r--r--test/CodeGen/Mips/cmplarge.ll12
-rw-r--r--test/CodeGen/Mips/const1.ll35
-rw-r--r--test/CodeGen/Mips/const4a.ll180
-rw-r--r--test/CodeGen/Mips/const6.ll164
-rw-r--r--test/CodeGen/Mips/const6a.ll29
-rw-r--r--test/CodeGen/Mips/ctlz.ll27
-rw-r--r--test/CodeGen/Mips/disable-tail-merge.ll33
-rw-r--r--test/CodeGen/Mips/divrem.ll14
-rw-r--r--test/CodeGen/Mips/extins.ll9
-rw-r--r--test/CodeGen/Mips/f16abs.ll37
-rw-r--r--test/CodeGen/Mips/fixdfsf.ll18
-rw-r--r--test/CodeGen/Mips/fp16instrinsmc.ll27
-rw-r--r--test/CodeGen/Mips/fp16mix.ll92
-rw-r--r--test/CodeGen/Mips/fpneeded.ll2
-rw-r--r--test/CodeGen/Mips/fpnotneeded.ll2
-rw-r--r--test/CodeGen/Mips/fptr2.ll20
-rw-r--r--test/CodeGen/Mips/helloworld.ll22
-rw-r--r--test/CodeGen/Mips/hf16call32.ll4
-rw-r--r--test/CodeGen/Mips/hf16call32_body.ll26
-rw-r--r--test/CodeGen/Mips/hf1_body.ll21
-rw-r--r--test/CodeGen/Mips/hfptrcall.ll4
-rw-r--r--test/CodeGen/Mips/i32k.ll19
-rw-r--r--test/CodeGen/Mips/i64arg.ll20
-rw-r--r--test/CodeGen/Mips/largeimmprinting.ll4
-rw-r--r--test/CodeGen/Mips/lazy-binding.ll41
-rw-r--r--test/CodeGen/Mips/lit.local.cfg2
-rw-r--r--test/CodeGen/Mips/longbranch.ll8
-rw-r--r--test/CodeGen/Mips/mips16_32_1.ll2
-rw-r--r--test/CodeGen/Mips/mips16_32_10.ll6
-rw-r--r--test/CodeGen/Mips/mips16_32_3.ll6
-rw-r--r--test/CodeGen/Mips/mips16_32_4.ll6
-rw-r--r--test/CodeGen/Mips/mips16_32_5.ll6
-rw-r--r--test/CodeGen/Mips/mips16_32_6.ll6
-rw-r--r--test/CodeGen/Mips/mips16_32_7.ll6
-rw-r--r--test/CodeGen/Mips/mips16_32_8.ll8
-rw-r--r--test/CodeGen/Mips/mips16_32_9.ll6
-rw-r--r--test/CodeGen/Mips/mips64instrs.ll16
-rw-r--r--test/CodeGen/Mips/mno-ldc1-sdc1.ll78
-rw-r--r--test/CodeGen/Mips/msa/2r.ll257
-rw-r--r--test/CodeGen/Mips/msa/2r_vector_scalar.ll87
-rw-r--r--test/CodeGen/Mips/msa/2rf.ll323
-rw-r--r--test/CodeGen/Mips/msa/2rf_exup.ll82
-rw-r--r--test/CodeGen/Mips/msa/2rf_float_int.ll90
-rw-r--r--test/CodeGen/Mips/msa/2rf_fq.ll82
-rw-r--r--test/CodeGen/Mips/msa/2rf_int_float.ll217
-rw-r--r--test/CodeGen/Mips/msa/2rf_tq.ll50
-rw-r--r--test/CodeGen/Mips/msa/3r-a.ll1191
-rw-r--r--test/CodeGen/Mips/msa/3r-b.ll494
-rw-r--r--test/CodeGen/Mips/msa/3r-c.ll446
-rw-r--r--test/CodeGen/Mips/msa/3r-d.ll478
-rw-r--r--test/CodeGen/Mips/msa/3r-i.ll358
-rw-r--r--test/CodeGen/Mips/msa/3r-m.ll862
-rw-r--r--test/CodeGen/Mips/msa/3r-p.ll182
-rw-r--r--test/CodeGen/Mips/msa/3r-s.ll1353
-rw-r--r--test/CodeGen/Mips/msa/3r-v.ll105
-rw-r--r--test/CodeGen/Mips/msa/3r_4r.ll206
-rw-r--r--test/CodeGen/Mips/msa/3r_4r_widen.ll307
-rw-r--r--test/CodeGen/Mips/msa/3r_splat.ll94
-rw-r--r--test/CodeGen/Mips/msa/3rf.ll485
-rw-r--r--test/CodeGen/Mips/msa/3rf_4rf.ll106
-rw-r--r--test/CodeGen/Mips/msa/3rf_4rf_q.ll206
-rw-r--r--test/CodeGen/Mips/msa/3rf_exdo.ll50
-rw-r--r--test/CodeGen/Mips/msa/3rf_float_int.ll50
-rw-r--r--test/CodeGen/Mips/msa/3rf_int_float.ll974
-rw-r--r--test/CodeGen/Mips/msa/3rf_q.ll94
-rw-r--r--test/CodeGen/Mips/msa/arithmetic.ll726
-rw-r--r--test/CodeGen/Mips/msa/arithmetic_float.ll456
-rw-r--r--test/CodeGen/Mips/msa/basic_operations.ll481
-rw-r--r--test/CodeGen/Mips/msa/basic_operations_float.ll207
-rw-r--r--test/CodeGen/Mips/msa/bit.ll537
-rw-r--r--test/CodeGen/Mips/msa/bitcast.ll1210
-rw-r--r--test/CodeGen/Mips/msa/bitwise.ll1639
-rw-r--r--test/CodeGen/Mips/msa/compare.ll2079
-rw-r--r--test/CodeGen/Mips/msa/compare_float.ll663
-rw-r--r--test/CodeGen/Mips/msa/elm_copy.ll162
-rw-r--r--test/CodeGen/Mips/msa/elm_cxcmsa.ll168
-rw-r--r--test/CodeGen/Mips/msa/elm_insv.ll192
-rw-r--r--test/CodeGen/Mips/msa/elm_move.ll25
-rw-r--r--test/CodeGen/Mips/msa/elm_shift_slide.ll158
-rw-r--r--test/CodeGen/Mips/msa/endian.ll107
-rw-r--r--test/CodeGen/Mips/msa/frameindex.ll85
-rw-r--r--test/CodeGen/Mips/msa/i10.ll89
-rw-r--r--test/CodeGen/Mips/msa/i5-a.ll82
-rw-r--r--test/CodeGen/Mips/msa/i5-b.ll439
-rw-r--r--test/CodeGen/Mips/msa/i5-c.ll386
-rw-r--r--test/CodeGen/Mips/msa/i5-m.ll310
-rw-r--r--test/CodeGen/Mips/msa/i5-s.ll82
-rw-r--r--test/CodeGen/Mips/msa/i5_ld_st.ll150
-rw-r--r--test/CodeGen/Mips/msa/i8.ll211
-rw-r--r--test/CodeGen/Mips/msa/inline-asm.ll34
-rw-r--r--test/CodeGen/Mips/msa/llvm-stress-s1704963983.ll134
-rw-r--r--test/CodeGen/Mips/msa/llvm-stress-s1935737938.ll138
-rw-r--r--test/CodeGen/Mips/msa/llvm-stress-s2090927243-simplified.ll31
-rw-r--r--test/CodeGen/Mips/msa/llvm-stress-s2501752154-simplified.ll27
-rw-r--r--test/CodeGen/Mips/msa/llvm-stress-s2704903805.ll141
-rw-r--r--test/CodeGen/Mips/msa/llvm-stress-s3861334421.ll149
-rw-r--r--test/CodeGen/Mips/msa/llvm-stress-s3926023935.ll143
-rw-r--r--test/CodeGen/Mips/msa/llvm-stress-s3997499501.ll152
-rw-r--r--test/CodeGen/Mips/msa/llvm-stress-s449609655-simplified.ll33
-rw-r--r--test/CodeGen/Mips/msa/llvm-stress-s525530439.ll139
-rw-r--r--test/CodeGen/Mips/msa/llvm-stress-s997348632.ll143
-rw-r--r--test/CodeGen/Mips/msa/llvm-stress-sz1-s742806235.ll23
-rw-r--r--test/CodeGen/Mips/msa/shift-dagcombine.ll70
-rw-r--r--test/CodeGen/Mips/msa/shuffle.ll803
-rw-r--r--test/CodeGen/Mips/msa/special.ll26
-rw-r--r--test/CodeGen/Mips/msa/spill.ll601
-rw-r--r--test/CodeGen/Mips/msa/vec.ll946
-rw-r--r--test/CodeGen/Mips/msa/vecs10.ll47
-rw-r--r--test/CodeGen/Mips/nomips16.ll38
-rw-r--r--test/CodeGen/Mips/o32_cc.ll242
-rw-r--r--test/CodeGen/Mips/o32_cc_byval.ll62
-rw-r--r--test/CodeGen/Mips/powif64_16.ll26
-rw-r--r--test/CodeGen/Mips/ra-allocatable.ll367
-rw-r--r--test/CodeGen/Mips/rotate.ll5
-rw-r--r--test/CodeGen/Mips/sel1c.ll21
-rw-r--r--test/CodeGen/Mips/sel2c.ll21
-rw-r--r--test/CodeGen/Mips/simplebr.ll37
-rw-r--r--test/CodeGen/Mips/stack-alignment.ll13
-rw-r--r--test/CodeGen/Mips/tailcall.ll13
-rw-r--r--test/CodeGen/Mips/tnaked.ll4
-rw-r--r--test/CodeGen/NVPTX/bug17709.ll26
-rw-r--r--test/CodeGen/NVPTX/callchain.ll10
-rw-r--r--test/CodeGen/NVPTX/constant-vectors.ll6
-rw-r--r--test/CodeGen/NVPTX/implicit-def.ll9
-rw-r--r--test/CodeGen/NVPTX/inline-asm.ll9
-rw-r--r--test/CodeGen/NVPTX/lit.local.cfg2
-rw-r--r--test/CodeGen/NVPTX/pr17529.ll38
-rw-r--r--test/CodeGen/NVPTX/vec8.ll13
-rw-r--r--test/CodeGen/PowerPC/2010-02-12-saveCR.ll2
-rw-r--r--test/CodeGen/PowerPC/2013-07-01-PHIElimBug.ll2
-rw-r--r--test/CodeGen/PowerPC/Frames-alloca.ll14
-rw-r--r--test/CodeGen/PowerPC/addrfuncstr.ll4
-rw-r--r--test/CodeGen/PowerPC/asym-regclass-copy.ll2
-rw-r--r--test/CodeGen/PowerPC/bdzlr.ll11
-rw-r--r--test/CodeGen/PowerPC/copysignl.ll67
-rw-r--r--test/CodeGen/PowerPC/cr-spills.ll107
-rw-r--r--test/CodeGen/PowerPC/ctr-cleanup.ll2
-rw-r--r--test/CodeGen/PowerPC/ctrloop-cpsgn.ll28
-rw-r--r--test/CodeGen/PowerPC/ctrloop-le.ll13
-rw-r--r--test/CodeGen/PowerPC/ctrloop-lt.ll23
-rw-r--r--test/CodeGen/PowerPC/dbg.ll4
-rw-r--r--test/CodeGen/PowerPC/dyn-alloca-aligned.ll12
-rw-r--r--test/CodeGen/PowerPC/fast-isel-GEP-coalesce.ll48
-rw-r--r--test/CodeGen/PowerPC/fast-isel-binary.ll137
-rw-r--r--test/CodeGen/PowerPC/fast-isel-br-const.ll43
-rw-r--r--test/CodeGen/PowerPC/fast-isel-call.ll132
-rw-r--r--test/CodeGen/PowerPC/fast-isel-cmp-imm.ll289
-rw-r--r--test/CodeGen/PowerPC/fast-isel-conversion.ll305
-rw-r--r--test/CodeGen/PowerPC/fast-isel-crash.ll23
-rw-r--r--test/CodeGen/PowerPC/fast-isel-ext.ll75
-rw-r--r--test/CodeGen/PowerPC/fast-isel-fold.ll129
-rw-r--r--test/CodeGen/PowerPC/fast-isel-indirectbr.ll15
-rw-r--r--test/CodeGen/PowerPC/fast-isel-load-store.ll202
-rw-r--r--test/CodeGen/PowerPC/fast-isel-redefinition.ll10
-rw-r--r--test/CodeGen/PowerPC/fast-isel-ret.ll142
-rw-r--r--test/CodeGen/PowerPC/fast-isel-shifter.ll50
-rw-r--r--test/CodeGen/PowerPC/fastisel-gep-promote-before-add.ll17
-rw-r--r--test/CodeGen/PowerPC/fcpsgn.ll52
-rw-r--r--test/CodeGen/PowerPC/frameaddr.ll4
-rw-r--r--test/CodeGen/PowerPC/glob-comp-aa-crash.ll139
-rw-r--r--test/CodeGen/PowerPC/hello-reloc.s84
-rw-r--r--test/CodeGen/PowerPC/i64_fp_round.ll2
-rw-r--r--test/CodeGen/PowerPC/inlineasm-i64-reg.ll43
-rw-r--r--test/CodeGen/PowerPC/isel-rc-nox0.ll8
-rw-r--r--test/CodeGen/PowerPC/lit.local.cfg2
-rw-r--r--test/CodeGen/PowerPC/mcm-2.ll8
-rw-r--r--test/CodeGen/PowerPC/mcm-3.ll8
-rw-r--r--test/CodeGen/PowerPC/mcm-4.ll4
-rw-r--r--test/CodeGen/PowerPC/mcm-9.ll3
-rw-r--r--test/CodeGen/PowerPC/negctr.ll2
-rw-r--r--test/CodeGen/PowerPC/optcmp.ll39
-rw-r--r--test/CodeGen/PowerPC/pr15031.ll26
-rw-r--r--test/CodeGen/PowerPC/pr17168.ll521
-rw-r--r--test/CodeGen/PowerPC/pr17354.ll39
-rw-r--r--test/CodeGen/PowerPC/recipest.ll5
-rw-r--r--test/CodeGen/PowerPC/reg-names.ll17
-rw-r--r--test/CodeGen/PowerPC/reloc-align.ll2
-rw-r--r--test/CodeGen/PowerPC/rlwimi-and.ll9
-rw-r--r--test/CodeGen/PowerPC/rounding-ops.ll75
-rw-r--r--test/CodeGen/PowerPC/sjlj.ll29
-rw-r--r--test/CodeGen/PowerPC/stack-realign.ll28
-rw-r--r--test/CodeGen/PowerPC/subsumes-pred-regs.ll65
-rw-r--r--test/CodeGen/PowerPC/tls-gd-obj.ll31
-rw-r--r--test/CodeGen/PowerPC/tls-ie-obj.ll29
-rw-r--r--test/CodeGen/PowerPC/tls-ld-obj.ll34
-rw-r--r--test/CodeGen/PowerPC/unal-altivec2.ll166
-rw-r--r--test/CodeGen/PowerPC/unal4-std.ll2
-rw-r--r--test/CodeGen/PowerPC/unwind-dw2-g.ll5
-rw-r--r--test/CodeGen/PowerPC/vec-abi-align.ll60
-rw-r--r--test/CodeGen/PowerPC/vec_extload.ll2
-rw-r--r--test/CodeGen/PowerPC/zero-not-run.ll2
-rw-r--r--test/CodeGen/R600/128bit-kernel-args.ll2
-rw-r--r--test/CodeGen/R600/32-bit-local-address-space.ll88
-rw-r--r--test/CodeGen/R600/64bit-kernel-args.ll6
-rw-r--r--test/CodeGen/R600/add.ll46
-rw-r--r--test/CodeGen/R600/add_i64.ll59
-rw-r--r--test/CodeGen/R600/address-space.ll31
-rw-r--r--test/CodeGen/R600/and.ll22
-rw-r--r--test/CodeGen/R600/array-ptr-calc-i64.ll18
-rw-r--r--test/CodeGen/R600/atomic_load_add.ll23
-rw-r--r--test/CodeGen/R600/atomic_load_sub.ll23
-rw-r--r--test/CodeGen/R600/bfi_int.ll6
-rw-r--r--test/CodeGen/R600/big_alu.ll1174
-rw-r--r--test/CodeGen/R600/bitcast.ll21
-rw-r--r--test/CodeGen/R600/build_vector.ll18
-rw-r--r--test/CodeGen/R600/combine_vloads.ll42
-rw-r--r--test/CodeGen/R600/complex-folding.ll19
-rw-r--r--test/CodeGen/R600/elf.ll4
-rw-r--r--test/CodeGen/R600/extload.ll51
-rw-r--r--test/CodeGen/R600/fabs.ll40
-rw-r--r--test/CodeGen/R600/fadd.ll48
-rw-r--r--test/CodeGen/R600/fadd64.ll4
-rw-r--r--test/CodeGen/R600/fcmp-cnd.ll2
-rw-r--r--test/CodeGen/R600/fcmp.ll2
-rw-r--r--test/CodeGen/R600/fcmp64.ll14
-rw-r--r--test/CodeGen/R600/fconst64.ll6
-rw-r--r--test/CodeGen/R600/fdiv.ll46
-rw-r--r--test/CodeGen/R600/fdiv64.ll6
-rw-r--r--test/CodeGen/R600/floor.ll14
-rw-r--r--test/CodeGen/R600/fma.ll31
-rw-r--r--test/CodeGen/R600/fmad.ll20
-rw-r--r--test/CodeGen/R600/fmax.ll15
-rw-r--r--test/CodeGen/R600/fmin.ll13
-rw-r--r--test/CodeGen/R600/fmul.ll46
-rw-r--r--test/CodeGen/R600/fmul.v4f32.ll15
-rw-r--r--test/CodeGen/R600/fmul64.ll4
-rw-r--r--test/CodeGen/R600/fmuladd.ll31
-rw-r--r--test/CodeGen/R600/fneg.ll47
-rw-r--r--test/CodeGen/R600/fp64_to_sint.ll9
-rw-r--r--test/CodeGen/R600/fp_to_sint.ll14
-rw-r--r--test/CodeGen/R600/fp_to_uint.ll27
-rw-r--r--test/CodeGen/R600/fpext.ll9
-rw-r--r--test/CodeGen/R600/fptrunc.ll9
-rw-r--r--test/CodeGen/R600/fsqrt.ll6
-rw-r--r--test/CodeGen/R600/fsub.ll45
-rw-r--r--test/CodeGen/R600/fsub64.ll4
-rw-r--r--test/CodeGen/R600/gep-address-space.ll40
-rw-r--r--test/CodeGen/R600/icmp-select-sete-reverse-args.ll2
-rw-r--r--test/CodeGen/R600/imm.ll14
-rw-r--r--test/CodeGen/R600/indirect-addressing-si.ll10
-rw-r--r--test/CodeGen/R600/insert_vector_elt.ll16
-rw-r--r--test/CodeGen/R600/jump-address.ll2
-rw-r--r--test/CodeGen/R600/kcache-fold.ll18
-rw-r--r--test/CodeGen/R600/kernel-args.ll455
-rw-r--r--test/CodeGen/R600/lds-output-queue.ll99
-rw-r--r--test/CodeGen/R600/lds-size.ll26
-rw-r--r--test/CodeGen/R600/lit.local.cfg12
-rw-r--r--test/CodeGen/R600/literals.ll35
-rw-r--r--test/CodeGen/R600/llvm.AMDGPU.imax.ll2
-rw-r--r--test/CodeGen/R600/llvm.AMDGPU.imin.ll2
-rw-r--r--test/CodeGen/R600/llvm.AMDGPU.mul.ll16
-rw-r--r--test/CodeGen/R600/llvm.AMDGPU.trunc.ll4
-rw-r--r--test/CodeGen/R600/llvm.AMDGPU.umax.ll2
-rw-r--r--test/CodeGen/R600/llvm.AMDGPU.umin.ll2
-rw-r--r--test/CodeGen/R600/llvm.SI.fs.interp.constant.ll2
-rw-r--r--test/CodeGen/R600/llvm.SI.imageload.ll88
-rw-r--r--test/CodeGen/R600/llvm.SI.resinfo.ll68
-rw-r--r--test/CodeGen/R600/llvm.SI.sample-masked.ll93
-rw-r--r--test/CodeGen/R600/llvm.SI.sample.ll85
-rw-r--r--test/CodeGen/R600/llvm.SI.sampled.ll68
-rw-r--r--test/CodeGen/R600/llvm.SI.tbuffer.store.ll44
-rw-r--r--test/CodeGen/R600/llvm.SI.tid.ll2
-rw-r--r--test/CodeGen/R600/llvm.cos.ll12
-rw-r--r--test/CodeGen/R600/llvm.floor.ll54
-rw-r--r--test/CodeGen/R600/llvm.pow.ll16
-rw-r--r--test/CodeGen/R600/llvm.rint.ll54
-rw-r--r--test/CodeGen/R600/llvm.round.ll41
-rw-r--r--test/CodeGen/R600/llvm.sin.ll12
-rw-r--r--test/CodeGen/R600/llvm.sqrt.ll54
-rw-r--r--test/CodeGen/R600/load-input-fold.ll29
-rw-r--r--test/CodeGen/R600/load.ll525
-rw-r--r--test/CodeGen/R600/load.vec.ll6
-rw-r--r--test/CodeGen/R600/load64.ll6
-rw-r--r--test/CodeGen/R600/local-memory-two-objects.ll35
-rw-r--r--test/CodeGen/R600/local-memory.ll25
-rw-r--r--test/CodeGen/R600/lshl.ll4
-rw-r--r--test/CodeGen/R600/lshr.ll4
-rw-r--r--test/CodeGen/R600/mad_int24.ll2
-rw-r--r--test/CodeGen/R600/mad_uint24.ll26
-rw-r--r--test/CodeGen/R600/max-literals.ll25
-rw-r--r--test/CodeGen/R600/mul.ll14
-rw-r--r--test/CodeGen/R600/mul_int24.ll2
-rw-r--r--test/CodeGen/R600/mul_uint24.ll26
-rw-r--r--test/CodeGen/R600/mulhu.ll8
-rw-r--r--test/CodeGen/R600/or.ll35
-rw-r--r--test/CodeGen/R600/predicate-dp4.ll27
-rw-r--r--test/CodeGen/R600/predicates.ll2
-rw-r--r--test/CodeGen/R600/private-memory.ll (renamed from test/CodeGen/R600/indirect-addressing.ll)72
-rw-r--r--test/CodeGen/R600/pv-packing.ll25
-rw-r--r--test/CodeGen/R600/pv.ll63
-rw-r--r--test/CodeGen/R600/r600-encoding.ll15
-rw-r--r--test/CodeGen/R600/r600-export-fix.ll142
-rw-r--r--test/CodeGen/R600/r600cfg.ll14
-rw-r--r--test/CodeGen/R600/reciprocal.ll13
-rw-r--r--test/CodeGen/R600/rotr.ll20
-rw-r--r--test/CodeGen/R600/rv7x0_count3.ll19
-rw-r--r--test/CodeGen/R600/schedule-fs-loop-nested-if.ll15
-rw-r--r--test/CodeGen/R600/schedule-fs-loop-nested.ll2
-rw-r--r--test/CodeGen/R600/schedule-fs-loop.ll2
-rw-r--r--test/CodeGen/R600/schedule-if-2.ll2
-rw-r--r--test/CodeGen/R600/schedule-if.ll2
-rw-r--r--test/CodeGen/R600/schedule-vs-if-nested-loop.ll14
-rw-r--r--test/CodeGen/R600/select.ll46
-rw-r--r--test/CodeGen/R600/selectcc-cnd.ll4
-rw-r--r--test/CodeGen/R600/selectcc-cnde-int.ll2
-rw-r--r--test/CodeGen/R600/selectcc-opt.ll4
-rw-r--r--test/CodeGen/R600/set-dx10.ll96
-rw-r--r--test/CodeGen/R600/setcc.ll323
-rw-r--r--test/CodeGen/R600/setcc64.ll263
-rw-r--r--test/CodeGen/R600/seto.ll4
-rw-r--r--test/CodeGen/R600/setuo.ll4
-rw-r--r--test/CodeGen/R600/sgpr-copy-duplicate-operand.ll18
-rw-r--r--test/CodeGen/R600/sgpr-copy.ll261
-rw-r--r--test/CodeGen/R600/shared-op-cycle.ll32
-rw-r--r--test/CodeGen/R600/shl.ll14
-rw-r--r--test/CodeGen/R600/short-args.ll69
-rw-r--r--test/CodeGen/R600/si-annotate-cf-assertion.ll23
-rw-r--r--test/CodeGen/R600/si-lod-bias.ll51
-rw-r--r--test/CodeGen/R600/si-sgpr-spill.ll692
-rw-r--r--test/CodeGen/R600/si-vector-hang.ll107
-rw-r--r--test/CodeGen/R600/sign_extend.ll2
-rw-r--r--test/CodeGen/R600/sint_to_fp.ll2
-rw-r--r--test/CodeGen/R600/sint_to_fp64.ll9
-rw-r--r--test/CodeGen/R600/sra.ll28
-rw-r--r--test/CodeGen/R600/srl.ll14
-rw-r--r--test/CodeGen/R600/store-vector-ptrs.ll8
-rw-r--r--test/CodeGen/R600/store.ll272
-rw-r--r--test/CodeGen/R600/store.r600.ll4
-rw-r--r--test/CodeGen/R600/structurize.ll83
-rw-r--r--test/CodeGen/R600/structurize1.ll62
-rw-r--r--test/CodeGen/R600/sub.ll26
-rw-r--r--test/CodeGen/R600/swizzle-export.ll49
-rw-r--r--test/CodeGen/R600/tex-clause-antidep.ll13
-rw-r--r--test/CodeGen/R600/texture-input-merge.ll13
-rw-r--r--test/CodeGen/R600/trunc-vector-store-assertion-failure.ll20
-rw-r--r--test/CodeGen/R600/trunc.ll30
-rw-r--r--test/CodeGen/R600/udiv.ll23
-rw-r--r--test/CodeGen/R600/uint_to_fp.ll25
-rw-r--r--test/CodeGen/R600/unaligned-load-store.ll17
-rw-r--r--test/CodeGen/R600/unsupported-cc.ll68
-rw-r--r--test/CodeGen/R600/urecip.ll2
-rw-r--r--test/CodeGen/R600/urem.ll2
-rw-r--r--test/CodeGen/R600/vertex-fetch-encoding.ll2
-rw-r--r--test/CodeGen/R600/vselect.ll30
-rw-r--r--test/CodeGen/R600/vselect64.ll15
-rw-r--r--test/CodeGen/R600/wait.ll37
-rw-r--r--test/CodeGen/R600/work-item-intrinsics.ll90
-rw-r--r--test/CodeGen/R600/wrong-transalu-pos-fix.ll86
-rw-r--r--test/CodeGen/R600/xor.ll43
-rw-r--r--test/CodeGen/R600/zero_extend.ll10
-rw-r--r--test/CodeGen/SI/sanity.ll37
-rw-r--r--test/CodeGen/SPARC/2011-01-11-CC.ll86
-rw-r--r--test/CodeGen/SPARC/2011-01-11-Call.ll40
-rw-r--r--test/CodeGen/SPARC/2013-05-17-CallFrame.ll22
-rw-r--r--test/CodeGen/SPARC/64abi.ll35
-rw-r--r--test/CodeGen/SPARC/64bit.ll23
-rw-r--r--test/CodeGen/SPARC/64cond.ll14
-rw-r--r--test/CodeGen/SPARC/constpool.ll8
-rw-r--r--test/CodeGen/SPARC/exception.ll112
-rw-r--r--test/CodeGen/SPARC/float.ll204
-rw-r--r--test/CodeGen/SPARC/fp128.ll234
-rw-r--r--test/CodeGen/SPARC/globals.ll10
-rw-r--r--test/CodeGen/SPARC/lit.local.cfg2
-rw-r--r--test/CodeGen/SPARC/rem.ll39
-rw-r--r--test/CodeGen/SPARC/setjmp.ll72
-rw-r--r--test/CodeGen/SPARC/tls.ll73
-rw-r--r--test/CodeGen/SystemZ/Large/branch-range-09.py107
-rw-r--r--test/CodeGen/SystemZ/Large/branch-range-10.py111
-rw-r--r--test/CodeGen/SystemZ/Large/branch-range-11.py127
-rw-r--r--test/CodeGen/SystemZ/Large/branch-range-12.py127
-rw-r--r--test/CodeGen/SystemZ/alias-01.ll19
-rw-r--r--test/CodeGen/SystemZ/alloca-02.ll15
-rw-r--r--test/CodeGen/SystemZ/and-08.ll378
-rw-r--r--test/CodeGen/SystemZ/args-06.ll4
-rw-r--r--test/CodeGen/SystemZ/asm-17.ll23
-rw-r--r--test/CodeGen/SystemZ/asm-18.ll745
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-minmax-01.ll14
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-minmax-02.ll14
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-minmax-03.ll11
-rw-r--r--test/CodeGen/SystemZ/atomicrmw-minmax-04.ll11
-rw-r--r--test/CodeGen/SystemZ/branch-05.ll3
-rw-r--r--test/CodeGen/SystemZ/branch-06.ll101
-rw-r--r--test/CodeGen/SystemZ/branch-07.ll30
-rw-r--r--test/CodeGen/SystemZ/branch-08.ll5
-rw-r--r--test/CodeGen/SystemZ/branch-09.ll62
-rw-r--r--test/CodeGen/SystemZ/branch-10.ll62
-rw-r--r--test/CodeGen/SystemZ/call-03.ll125
-rw-r--r--test/CodeGen/SystemZ/cond-store-01.ll49
-rw-r--r--test/CodeGen/SystemZ/cond-store-02.ll49
-rw-r--r--test/CodeGen/SystemZ/cond-store-03.ll36
-rw-r--r--test/CodeGen/SystemZ/cond-store-04.ll24
-rw-r--r--test/CodeGen/SystemZ/cond-store-05.ll24
-rw-r--r--test/CodeGen/SystemZ/cond-store-06.ll24
-rw-r--r--test/CodeGen/SystemZ/fp-cmp-01.ll162
-rw-r--r--test/CodeGen/SystemZ/fp-cmp-02.ll18
-rw-r--r--test/CodeGen/SystemZ/fp-cmp-03.ll5
-rw-r--r--test/CodeGen/SystemZ/fp-move-02.ll9
-rw-r--r--test/CodeGen/SystemZ/fp-move-09.ll62
-rw-r--r--test/CodeGen/SystemZ/fp-round-01.ll126
-rw-r--r--test/CodeGen/SystemZ/fp-round-02.ll195
-rw-r--r--test/CodeGen/SystemZ/fp-sqrt-01.ll17
-rw-r--r--test/CodeGen/SystemZ/fp-sqrt-02.ll15
-rw-r--r--test/CodeGen/SystemZ/frame-13.ll20
-rw-r--r--test/CodeGen/SystemZ/frame-14.ll10
-rw-r--r--test/CodeGen/SystemZ/frame-15.ll9
-rw-r--r--test/CodeGen/SystemZ/frame-16.ll8
-rw-r--r--test/CodeGen/SystemZ/frame-18.ll5
-rw-r--r--test/CodeGen/SystemZ/insert-06.ll13
-rw-r--r--test/CodeGen/SystemZ/int-abs-01.ll83
-rw-r--r--test/CodeGen/SystemZ/int-add-09.ll8
-rw-r--r--test/CodeGen/SystemZ/int-add-10.ll18
-rw-r--r--test/CodeGen/SystemZ/int-add-11.ll171
-rw-r--r--test/CodeGen/SystemZ/int-add-12.ll166
-rw-r--r--test/CodeGen/SystemZ/int-cmp-01.ll14
-rw-r--r--test/CodeGen/SystemZ/int-cmp-02.ll13
-rw-r--r--test/CodeGen/SystemZ/int-cmp-03.ll16
-rw-r--r--test/CodeGen/SystemZ/int-cmp-04.ll14
-rw-r--r--test/CodeGen/SystemZ/int-cmp-05.ll16
-rw-r--r--test/CodeGen/SystemZ/int-cmp-06.ll16
-rw-r--r--test/CodeGen/SystemZ/int-cmp-07.ll13
-rw-r--r--test/CodeGen/SystemZ/int-cmp-08.ll16
-rw-r--r--test/CodeGen/SystemZ/int-cmp-10.ll32
-rw-r--r--test/CodeGen/SystemZ/int-cmp-12.ll39
-rw-r--r--test/CodeGen/SystemZ/int-cmp-20.ll12
-rw-r--r--test/CodeGen/SystemZ/int-cmp-36.ll19
-rw-r--r--test/CodeGen/SystemZ/int-cmp-37.ll22
-rw-r--r--test/CodeGen/SystemZ/int-cmp-38.ll18
-rw-r--r--test/CodeGen/SystemZ/int-cmp-39.ll19
-rw-r--r--test/CodeGen/SystemZ/int-cmp-40.ll22
-rw-r--r--test/CodeGen/SystemZ/int-cmp-41.ll19
-rw-r--r--test/CodeGen/SystemZ/int-cmp-42.ll19
-rw-r--r--test/CodeGen/SystemZ/int-cmp-43.ll18
-rw-r--r--test/CodeGen/SystemZ/int-cmp-44.ll4
-rw-r--r--test/CodeGen/SystemZ/int-cmp-46.ll491
-rw-r--r--test/CodeGen/SystemZ/int-cmp-47.ll234
-rw-r--r--test/CodeGen/SystemZ/int-cmp-48.ll245
-rw-r--r--test/CodeGen/SystemZ/int-cmp-49.ll49
-rw-r--r--test/CodeGen/SystemZ/int-const-03.ll12
-rw-r--r--test/CodeGen/SystemZ/int-const-04.ll20
-rw-r--r--test/CodeGen/SystemZ/int-const-05.ll19
-rw-r--r--test/CodeGen/SystemZ/int-const-06.ll20
-rw-r--r--test/CodeGen/SystemZ/int-conv-02.ll5
-rw-r--r--test/CodeGen/SystemZ/int-conv-06.ll5
-rw-r--r--test/CodeGen/SystemZ/int-conv-09.ll77
-rw-r--r--test/CodeGen/SystemZ/int-conv-10.ll77
-rw-r--r--test/CodeGen/SystemZ/int-conv-11.ll350
-rw-r--r--test/CodeGen/SystemZ/int-div-06.ll56
-rw-r--r--test/CodeGen/SystemZ/int-move-08.ll35
-rw-r--r--test/CodeGen/SystemZ/int-mul-08.ll10
-rw-r--r--test/CodeGen/SystemZ/int-neg-02.ll91
-rw-r--r--test/CodeGen/SystemZ/lit.local.cfg2
-rw-r--r--test/CodeGen/SystemZ/memchr-01.ll21
-rw-r--r--test/CodeGen/SystemZ/memchr-02.ll57
-rw-r--r--test/CodeGen/SystemZ/memcmp-01.ll221
-rw-r--r--test/CodeGen/SystemZ/memcmp-02.ll139
-rw-r--r--test/CodeGen/SystemZ/memcpy-01.ll163
-rw-r--r--test/CodeGen/SystemZ/memcpy-02.ll119
-rw-r--r--test/CodeGen/SystemZ/memset-01.ll48
-rw-r--r--test/CodeGen/SystemZ/memset-02.ll14
-rw-r--r--test/CodeGen/SystemZ/memset-03.ll66
-rw-r--r--test/CodeGen/SystemZ/memset-04.ll14
-rw-r--r--test/CodeGen/SystemZ/or-08.ll57
-rw-r--r--test/CodeGen/SystemZ/prefetch-01.ll87
-rw-r--r--test/CodeGen/SystemZ/risbg-01.ll17
-rw-r--r--test/CodeGen/SystemZ/setcc-01.ll74
-rw-r--r--test/CodeGen/SystemZ/setcc-02.ll174
-rw-r--r--test/CodeGen/SystemZ/shift-10.ll78
-rw-r--r--test/CodeGen/SystemZ/spill-01.ll5
-rw-r--r--test/CodeGen/SystemZ/strcmp-01.ll70
-rw-r--r--test/CodeGen/SystemZ/strcmp-02.ll72
-rw-r--r--test/CodeGen/SystemZ/strcpy-01.ll50
-rw-r--r--test/CodeGen/SystemZ/strlen-01.ll39
-rw-r--r--test/CodeGen/SystemZ/strlen-02.ll39
-rw-r--r--test/CodeGen/SystemZ/unaligned-01.ll5
-rw-r--r--test/CodeGen/SystemZ/xor-08.ll57
-rw-r--r--test/CodeGen/Thumb/2010-07-15-debugOrdering.ll38
-rw-r--r--test/CodeGen/Thumb/PR17309.ll57
-rw-r--r--test/CodeGen/Thumb/barrier.ll2
-rw-r--r--test/CodeGen/Thumb/lit.local.cfg2
-rw-r--r--test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll4
-rw-r--r--test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll2
-rw-r--r--test/CodeGen/Thumb2/lit.local.cfg2
-rw-r--r--test/CodeGen/Thumb2/tail-call-r9.ll14
-rw-r--r--test/CodeGen/Thumb2/thumb2-ifcvt1.ll5
-rw-r--r--test/CodeGen/Thumb2/thumb2-ifcvt2.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-ifcvt3.ll2
-rw-r--r--test/CodeGen/Thumb2/thumb2-select.ll23
-rw-r--r--test/CodeGen/Thumb2/v8_IT_1.ll17
-rw-r--r--test/CodeGen/Thumb2/v8_IT_2.ll38
-rw-r--r--test/CodeGen/Thumb2/v8_IT_3.ll77
-rw-r--r--test/CodeGen/Thumb2/v8_IT_4.ll45
-rw-r--r--test/CodeGen/Thumb2/v8_IT_5.ll63
-rw-r--r--test/CodeGen/X86/2006-05-02-InstrSched1.ll6
-rw-r--r--test/CodeGen/X86/2007-01-08-InstrSched.ll4
-rw-r--r--test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll13
-rw-r--r--test/CodeGen/X86/2007-10-12-SpillerUnfold2.ll2
-rw-r--r--test/CodeGen/X86/2008-03-14-SpillerCrash.ll4
-rw-r--r--test/CodeGen/X86/2008-04-24-pblendw-fold-crash.ll2
-rw-r--r--test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll7
-rw-r--r--test/CodeGen/X86/2009-02-26-MachineLICMBug.ll6
-rw-r--r--test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll2
-rw-r--r--test/CodeGen/X86/2009-10-16-Scope.ll3
-rw-r--r--test/CodeGen/X86/2010-01-18-DbgValue.ll8
-rw-r--r--test/CodeGen/X86/2010-02-01-DbgValueCrash.ll4
-rw-r--r--test/CodeGen/X86/2010-02-19-TailCallRetAddrBug.ll6
-rw-r--r--test/CodeGen/X86/2010-05-25-DotDebugLoc.ll4
-rw-r--r--test/CodeGen/X86/2010-05-26-DotDebugLoc.ll10
-rw-r--r--test/CodeGen/X86/2010-05-28-Crash.ll4
-rw-r--r--test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll8
-rw-r--r--test/CodeGen/X86/2010-07-06-DbgCrash.ll7
-rw-r--r--test/CodeGen/X86/2010-08-04-StackVariable.ll12
-rw-r--r--test/CodeGen/X86/2010-09-16-EmptyFilename.ll4
-rw-r--r--test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll8
-rw-r--r--test/CodeGen/X86/2010-11-02-DbgParameter.ll6
-rw-r--r--test/CodeGen/X86/2010-12-02-MC-Set.ll4
-rw-r--r--test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll10
-rw-r--r--test/CodeGen/X86/2011-05-26-UnreachableBlockElim.ll15
-rw-r--r--test/CodeGen/X86/2011-06-03-x87chain.ll18
-rw-r--r--test/CodeGen/X86/2011-09-18-sse2cmp.ll2
-rw-r--r--test/CodeGen/X86/2011-09-21-setcc-bug.ll2
-rw-r--r--test/CodeGen/X86/2011-10-11-srl.ll2
-rw-r--r--test/CodeGen/X86/2011-10-12-MachineCSE.ll29
-rw-r--r--test/CodeGen/X86/2011-10-19-LegelizeLoad.ll3
-rw-r--r--test/CodeGen/X86/2011-12-15-vec_shift.ll4
-rw-r--r--test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll2
-rw-r--r--test/CodeGen/X86/2012-01-16-mfence-nosse-flags.ll2
-rw-r--r--test/CodeGen/X86/2012-04-26-sdglue.ll4
-rw-r--r--test/CodeGen/X86/2012-11-30-handlemove-dbg.ll6
-rw-r--r--test/CodeGen/X86/2012-11-30-misched-dbg.ll4
-rw-r--r--test/CodeGen/X86/2012-11-30-regpres-dbg.ll4
-rw-r--r--test/CodeGen/X86/2013-03-13-VEX-DestReg.ll2
-rw-r--r--test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll132
-rw-r--r--test/CodeGen/X86/3addr-16bit.ll7
-rw-r--r--test/CodeGen/X86/GC/lit.local.cfg2
-rw-r--r--test/CodeGen/X86/GC/ocaml-gc-assert.ll21
-rw-r--r--test/CodeGen/X86/GC/ocaml-gc.ll20
-rw-r--r--test/CodeGen/X86/MachineSink-DbgValue.ll6
-rw-r--r--test/CodeGen/X86/StackColoring-dbg.ll5
-rw-r--r--test/CodeGen/X86/StackColoring.ll4
-rw-r--r--test/CodeGen/X86/abi-isel.ll20
-rw-r--r--test/CodeGen/X86/add.ll12
-rw-r--r--test/CodeGen/X86/aes_intrinsics.ll48
-rw-r--r--test/CodeGen/X86/alias-error.ll5
-rw-r--r--test/CodeGen/X86/aliases.ll26
-rw-r--r--test/CodeGen/X86/alloca-align-rounding.ll2
-rw-r--r--test/CodeGen/X86/anyregcc-crash.ll17
-rw-r--r--test/CodeGen/X86/anyregcc.ll348
-rw-r--r--test/CodeGen/X86/atom-call-reg-indirect.ll10
-rw-r--r--test/CodeGen/X86/atom-lea-addw-bug.ll19
-rw-r--r--test/CodeGen/X86/atom-sched.ll4
-rw-r--r--test/CodeGen/X86/atomic-dagsched.ll8
-rw-r--r--test/CodeGen/X86/avx-arith.ll5
-rw-r--r--test/CodeGen/X86/avx-basic.ll4
-rw-r--r--test/CodeGen/X86/avx-bitcast.ll2
-rw-r--r--test/CodeGen/X86/avx-intel-ocl.ll13
-rw-r--r--test/CodeGen/X86/avx-sext.ll11
-rw-r--r--test/CodeGen/X86/avx-shuffle.ll6
-rw-r--r--test/CodeGen/X86/avx-splat.ll2
-rw-r--r--test/CodeGen/X86/avx-trunc.ll7
-rw-r--r--test/CodeGen/X86/avx-zext.ll12
-rw-r--r--test/CodeGen/X86/avx2-arith.ll18
-rw-r--r--test/CodeGen/X86/avx2-conversions.ll28
-rw-r--r--test/CodeGen/X86/avx2-palignr.ll2
-rw-r--r--test/CodeGen/X86/avx2-vector-shifts.ll4
-rw-r--r--test/CodeGen/X86/avx512-arith.ll271
-rw-r--r--test/CodeGen/X86/avx512-build-vector.ll18
-rw-r--r--test/CodeGen/X86/avx512-cmp.ll27
-rw-r--r--test/CodeGen/X86/avx512-cvt.ll217
-rw-r--r--test/CodeGen/X86/avx512-fma-intrinsics.ll97
-rw-r--r--test/CodeGen/X86/avx512-fma.ll83
-rw-r--r--test/CodeGen/X86/avx512-gather-scatter-intrin.ll225
-rw-r--r--test/CodeGen/X86/avx512-insert-extract.ll74
-rw-r--r--test/CodeGen/X86/avx512-intrinsics.ll374
-rw-r--r--test/CodeGen/X86/avx512-mask-op.ll3
-rw-r--r--test/CodeGen/X86/avx512-mov.ll155
-rw-r--r--test/CodeGen/X86/avx512-select.ll22
-rw-r--r--test/CodeGen/X86/avx512-shift.ll108
-rw-r--r--test/CodeGen/X86/avx512-shuffle.ll226
-rw-r--r--test/CodeGen/X86/avx512-trunc-ext.ll127
-rw-r--r--test/CodeGen/X86/avx512-vbroadcast.ll24
-rw-r--r--test/CodeGen/X86/avx512-vec-cmp.ll113
-rw-r--r--test/CodeGen/X86/bc-extract.ll2
-rw-r--r--test/CodeGen/X86/bitcast2.ll4
-rw-r--r--test/CodeGen/X86/blend-msb.ll2
-rw-r--r--test/CodeGen/X86/bmi.ll70
-rw-r--r--test/CodeGen/X86/bool-simplify.ll2
-rw-r--r--test/CodeGen/X86/break-anti-dependencies.ll2
-rw-r--r--test/CodeGen/X86/break-avx-dep.ll29
-rw-r--r--test/CodeGen/X86/bswap.ll107
-rw-r--r--test/CodeGen/X86/bt.ll54
-rw-r--r--test/CodeGen/X86/byval7.ll4
-rw-r--r--test/CodeGen/X86/chain_order.ll3
-rw-r--r--test/CodeGen/X86/cmov.ll10
-rw-r--r--test/CodeGen/X86/coalesce-implicitdef.ll35
-rw-r--r--test/CodeGen/X86/coff-feat00.ll7
-rw-r--r--test/CodeGen/X86/commute-two-addr.ll4
-rw-r--r--test/CodeGen/X86/compact-unwind.ll23
-rw-r--r--test/CodeGen/X86/crash-nosse.ll2
-rw-r--r--test/CodeGen/X86/crash.ll4
-rw-r--r--test/CodeGen/X86/dagcombine-shifts.ll209
-rw-r--r--test/CodeGen/X86/dagcombine-unsafe-math.ll (renamed from test/CodeGen/X86/dagcombine_unsafe_math.ll)2
-rw-r--r--test/CodeGen/X86/dbg-at-specficiation.ll19
-rw-r--r--test/CodeGen/X86/dbg-byval-parameter.ll50
-rw-r--r--test/CodeGen/X86/dbg-const-int.ll31
-rw-r--r--test/CodeGen/X86/dbg-const.ll49
-rw-r--r--test/CodeGen/X86/dbg-declare-arg.ll125
-rw-r--r--test/CodeGen/X86/dbg-declare.ll56
-rw-r--r--test/CodeGen/X86/dbg-file-name.ll22
-rw-r--r--test/CodeGen/X86/dbg-i128-const.ll32
-rw-r--r--test/CodeGen/X86/dbg-large-unsigned-const.ll60
-rw-r--r--test/CodeGen/X86/dbg-merge-loc-entry.ll73
-rw-r--r--test/CodeGen/X86/dbg-prolog-end.ll57
-rw-r--r--test/CodeGen/X86/dbg-subrange.ll35
-rw-r--r--test/CodeGen/X86/dbg-value-dag-combine.ll47
-rw-r--r--test/CodeGen/X86/dbg-value-isel.ll104
-rw-r--r--test/CodeGen/X86/dbg-value-location.ll75
-rw-r--r--test/CodeGen/X86/dbg-value-terminator.ll131
-rw-r--r--test/CodeGen/X86/dwarf-comp-dir.ll2
-rw-r--r--test/CodeGen/X86/dyn_alloca_aligned.ll9
-rw-r--r--test/CodeGen/X86/fast-isel-mem.ll4
-rw-r--r--test/CodeGen/X86/fastcc.ll4
-rw-r--r--test/CodeGen/X86/fastisel-gep-promote-before-add.ll37
-rw-r--r--test/CodeGen/X86/floor-soft-float.ll4
-rw-r--r--test/CodeGen/X86/fold-load-vec.ll2
-rw-r--r--test/CodeGen/X86/fold-load.ll10
-rw-r--r--test/CodeGen/X86/fold-pcmpeqd-2.ll15
-rw-r--r--test/CodeGen/X86/fp-elim.ll8
-rw-r--r--test/CodeGen/X86/fp-une-cmp.ll43
-rw-r--r--test/CodeGen/X86/frame-base.ll22
-rw-r--r--test/CodeGen/X86/full-lsr.ll2
-rw-r--r--test/CodeGen/X86/gather-addresses.ll38
-rw-r--r--test/CodeGen/X86/ghc-cc.ll7
-rw-r--r--test/CodeGen/X86/ghc-cc64.ll31
-rw-r--r--test/CodeGen/X86/global-sections.ll8
-rw-r--r--test/CodeGen/X86/h-register-addressing-32.ll23
-rw-r--r--test/CodeGen/X86/h-register-addressing-64.ll23
-rw-r--r--test/CodeGen/X86/h-registers-0.ll6
-rw-r--r--test/CodeGen/X86/h-registers-1.ll15
-rw-r--r--test/CodeGen/X86/hipe-cc.ll6
-rw-r--r--test/CodeGen/X86/hipe-cc64.ll12
-rw-r--r--test/CodeGen/X86/hoist-common.ll10
-rw-r--r--test/CodeGen/X86/i128-mul.ll2
-rw-r--r--test/CodeGen/X86/i486-fence-loop.ll27
-rw-r--r--test/CodeGen/X86/ident-metadata.ll9
-rw-r--r--test/CodeGen/X86/inline-asm-error.ll2
-rw-r--r--test/CodeGen/X86/inline-asm-flag-clobber.ll31
-rw-r--r--test/CodeGen/X86/ins_subreg_coalesce-1.ll2
-rw-r--r--test/CodeGen/X86/isel-optnone.ll42
-rw-r--r--test/CodeGen/X86/large-gep-chain.ll25607
-rw-r--r--test/CodeGen/X86/lea-recursion.ll3
-rw-r--r--test/CodeGen/X86/lea.ll3
-rw-r--r--test/CodeGen/X86/leaf-fp-elim.ll2
-rw-r--r--test/CodeGen/X86/legalize-shift-64.ll28
-rw-r--r--test/CodeGen/X86/lit.local.cfg8
-rw-r--r--test/CodeGen/X86/load-slice.ll139
-rw-r--r--test/CodeGen/X86/long-extend.ll18
-rw-r--r--test/CodeGen/X86/lsr-loop-exit-cond.ll6
-rw-r--r--test/CodeGen/X86/masked-iv-safe.ll49
-rw-r--r--test/CodeGen/X86/maskmovdqu.ll6
-rw-r--r--test/CodeGen/X86/mcinst-avx-lowering.ll4
-rw-r--r--test/CodeGen/X86/memcpy-2.ll14
-rw-r--r--test/CodeGen/X86/merge_store.ll30
-rw-r--r--test/CodeGen/X86/mingw-alloca.ll25
-rw-r--r--test/CodeGen/X86/misched-balance.ll14
-rw-r--r--test/CodeGen/X86/misched-copy.ll12
-rw-r--r--test/CodeGen/X86/misched-matmul.ll11
-rw-r--r--test/CodeGen/X86/misched-matrix.ll12
-rw-r--r--test/CodeGen/X86/mmx-builtins.ll4
-rw-r--r--test/CodeGen/X86/mmx-punpckhdq.ll2
-rw-r--r--test/CodeGen/X86/movbe.ll9
-rw-r--r--test/CodeGen/X86/movgs.ll6
-rw-r--r--test/CodeGen/X86/neg_fp.ll2
-rw-r--r--test/CodeGen/X86/newline-and-quote.ll6
-rw-r--r--test/CodeGen/X86/no-compact-unwind.ll24
-rw-r--r--test/CodeGen/X86/no-elf-compact-unwind.ll48
-rw-r--r--test/CodeGen/X86/nocx16.ll21
-rw-r--r--test/CodeGen/X86/object-size.ll8
-rw-r--r--test/CodeGen/X86/opt-shuff-tstore.ll2
-rw-r--r--test/CodeGen/X86/palignr.ll2
-rw-r--r--test/CodeGen/X86/patchpoint.ll100
-rw-r--r--test/CodeGen/X86/peep-vector-extract-concat.ll4
-rw-r--r--test/CodeGen/X86/pmovext.ll25
-rw-r--r--test/CodeGen/X86/pmovsx-inreg.ll3
-rw-r--r--test/CodeGen/X86/pmul.ll4
-rw-r--r--test/CodeGen/X86/pmulld.ll4
-rw-r--r--test/CodeGen/X86/pr10523.ll2
-rw-r--r--test/CodeGen/X86/pr10524.ll2
-rw-r--r--test/CodeGen/X86/pr10525.ll2
-rw-r--r--test/CodeGen/X86/pr10526.ll2
-rw-r--r--test/CodeGen/X86/pr12312.ll2
-rw-r--r--test/CodeGen/X86/pr14088.ll15
-rw-r--r--test/CodeGen/X86/pr14090.ll10
-rw-r--r--test/CodeGen/X86/pr1505b.ll3
-rw-r--r--test/CodeGen/X86/pr16031.ll2
-rw-r--r--test/CodeGen/X86/pr16807.ll18
-rw-r--r--test/CodeGen/X86/pr17546.ll10
-rw-r--r--test/CodeGen/X86/pr17631.ll34
-rw-r--r--test/CodeGen/X86/pr17764.ll10
-rw-r--r--test/CodeGen/X86/pr18014.ll16
-rw-r--r--test/CodeGen/X86/pr18023.ll31
-rw-r--r--test/CodeGen/X86/pr18054.ll10
-rw-r--r--test/CodeGen/X86/pr18162.ll27
-rw-r--r--test/CodeGen/X86/pre-ra-sched.ll5
-rw-r--r--test/CodeGen/X86/prefetch.ll5
-rw-r--r--test/CodeGen/X86/prefixdata.ll17
-rw-r--r--test/CodeGen/X86/rdrand.ll8
-rw-r--r--test/CodeGen/X86/rdseed.ll6
-rw-r--r--test/CodeGen/X86/rem-2.ll7
-rw-r--r--test/CodeGen/X86/rem.ll17
-rw-r--r--test/CodeGen/X86/rounding-ops.ll2
-rw-r--r--test/CodeGen/X86/scalar_widen_div.ll2
-rw-r--r--test/CodeGen/X86/segmented-stacks-dynamic.ll4
-rw-r--r--test/CodeGen/X86/select.ll10
-rw-r--r--test/CodeGen/X86/setcc-narrowing.ll18
-rw-r--r--test/CodeGen/X86/setcc-sentinals.ll13
-rw-r--r--test/CodeGen/X86/sha.ll139
-rw-r--r--test/CodeGen/X86/shift-bmi2.ll21
-rw-r--r--test/CodeGen/X86/sibcall.ll6
-rw-r--r--test/CodeGen/X86/sink-hoist.ll9
-rw-r--r--test/CodeGen/X86/sqrt-fastmath.ll4
-rw-r--r--test/CodeGen/X86/sse-intrinsics-x86.ll308
-rw-r--r--test/CodeGen/X86/sse2-blend.ll2
-rw-r--r--test/CodeGen/X86/sse2-intrinsics-x86.ll712
-rw-r--r--test/CodeGen/X86/sse2-vector-shifts.ll4
-rw-r--r--test/CodeGen/X86/sse2.ll25
-rw-r--r--test/CodeGen/X86/sse3-intrinsics-x86.ll57
-rw-r--r--test/CodeGen/X86/sse41-blend.ll2
-rw-r--r--test/CodeGen/X86/sse41-intrinsics-x86.ll326
-rw-r--r--test/CodeGen/X86/sse41.ll4
-rw-r--r--test/CodeGen/X86/sse42-intrinsics-x86.ll182
-rw-r--r--test/CodeGen/X86/sse42.ll4
-rw-r--r--test/CodeGen/X86/sse42_64.ll2
-rw-r--r--test/CodeGen/X86/ssse3-intrinsics-x86.ll120
-rw-r--r--test/CodeGen/X86/stack-protector-dbginfo.ll97
-rw-r--r--test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll61
-rw-r--r--test/CodeGen/X86/stack-protector.ll4
-rw-r--r--test/CodeGen/X86/stackmap.ll292
-rw-r--r--test/CodeGen/X86/store-narrow.ll22
-rw-r--r--test/CodeGen/X86/tail-call-attrs.ll56
-rw-r--r--test/CodeGen/X86/tailcall-largecode.ll2
-rw-r--r--test/CodeGen/X86/tbm-intrinsics-x86_64.ll43
-rw-r--r--test/CodeGen/X86/tbm_patterns.ll253
-rw-r--r--test/CodeGen/X86/test-nofold.ll9
-rw-r--r--test/CodeGen/X86/tls.ll13
-rw-r--r--test/CodeGen/X86/tlv-3.ll10
-rw-r--r--test/CodeGen/X86/trunc-ext-ld-st.ll2
-rw-r--r--test/CodeGen/X86/trunc-to-bool.ll2
-rw-r--r--test/CodeGen/X86/unaligned-spill-folding.ll49
-rw-r--r--test/CodeGen/X86/unknown-location.ll4
-rw-r--r--test/CodeGen/X86/v-binop-widen.ll3
-rw-r--r--test/CodeGen/X86/v-binop-widen2.ll2
-rw-r--r--test/CodeGen/X86/v4i32load-crash.ll27
-rw-r--r--test/CodeGen/X86/vec_compare-sse4.ll4
-rw-r--r--test/CodeGen/X86/vec_extract-sse4.ll2
-rw-r--r--test/CodeGen/X86/vec_extract.ll2
-rw-r--r--test/CodeGen/X86/vec_fpext.ll2
-rw-r--r--test/CodeGen/X86/vec_insert-2.ll4
-rw-r--r--test/CodeGen/X86/vec_insert-3.ll2
-rw-r--r--test/CodeGen/X86/vec_insert-7.ll2
-rw-r--r--test/CodeGen/X86/vec_insert-8.ll2
-rw-r--r--test/CodeGen/X86/vec_insert-9.ll2
-rw-r--r--test/CodeGen/X86/vec_insert.ll4
-rw-r--r--test/CodeGen/X86/vec_round.ll22
-rw-r--r--test/CodeGen/X86/vec_set-8.ll4
-rw-r--r--test/CodeGen/X86/vec_set-9.ll2
-rw-r--r--test/CodeGen/X86/vec_set-C.ll6
-rw-r--r--test/CodeGen/X86/vec_set.ll2
-rw-r--r--test/CodeGen/X86/vec_setcc.ll63
-rw-r--r--test/CodeGen/X86/vec_shift4.ll2
-rw-r--r--test/CodeGen/X86/vec_shuffle-14.ll4
-rw-r--r--test/CodeGen/X86/vec_shuffle-17.ll4
-rw-r--r--test/CodeGen/X86/vec_shuffle-25.ll2
-rw-r--r--test/CodeGen/X86/vec_shuffle-26.ll2
-rw-r--r--test/CodeGen/X86/vec_shuffle-27.ll6
-rw-r--r--test/CodeGen/X86/vec_shuffle-36.ll2
-rw-r--r--test/CodeGen/X86/vec_shuffle-39.ll6
-rw-r--r--test/CodeGen/X86/vec_splat-3.ll2
-rw-r--r--test/CodeGen/X86/vec_split.ll42
-rw-r--r--test/CodeGen/X86/vec_ss_load_fold.ll2
-rw-r--r--test/CodeGen/X86/vector-variable-idx2.ll2
-rw-r--r--test/CodeGen/X86/vsplit-and.ll2
-rw-r--r--test/CodeGen/X86/weak_def_can_be_hidden.ll26
-rw-r--r--test/CodeGen/X86/widen_arith-1.ll2
-rw-r--r--test/CodeGen/X86/widen_arith-2.ll2
-rw-r--r--test/CodeGen/X86/widen_arith-3.ll6
-rw-r--r--test/CodeGen/X86/widen_arith-4.ll2
-rw-r--r--test/CodeGen/X86/widen_arith-5.ll2
-rw-r--r--test/CodeGen/X86/widen_arith-6.ll2
-rw-r--r--test/CodeGen/X86/widen_cast-1.ll4
-rw-r--r--test/CodeGen/X86/widen_cast-2.ll2
-rw-r--r--test/CodeGen/X86/widen_cast-3.ll2
-rw-r--r--test/CodeGen/X86/widen_cast-4.ll2
-rw-r--r--test/CodeGen/X86/widen_cast-5.ll2
-rw-r--r--test/CodeGen/X86/widen_cast-6.ll2
-rw-r--r--test/CodeGen/X86/widen_conv-1.ll2
-rw-r--r--test/CodeGen/X86/widen_conv-2.ll6
-rw-r--r--test/CodeGen/X86/widen_conv-3.ll2
-rw-r--r--test/CodeGen/X86/widen_conv-4.ll2
-rw-r--r--test/CodeGen/X86/widen_extract-1.ll2
-rw-r--r--test/CodeGen/X86/widen_load-2.ll10
-rw-r--r--test/CodeGen/X86/widen_shuffle-1.ll2
-rw-r--r--test/CodeGen/X86/win64_alloca_dynalloca.ll64
-rw-r--r--test/CodeGen/X86/x86-64-pic-10.ll4
-rw-r--r--test/CodeGen/X86/x86-64-psub.ll25
-rw-r--r--test/CodeGen/X86/x86-64-tls-1.ll6
-rw-r--r--test/CodeGen/X86/x86-shifts.ll4
-rw-r--r--test/CodeGen/X86/xor.ll16
-rw-r--r--test/CodeGen/X86/zext-fold.ll6
-rw-r--r--test/CodeGen/X86/zext-sext.ll6
-rw-r--r--test/CodeGen/XCore/aliases.ll8
-rw-r--r--test/CodeGen/XCore/alignment.ll9
-rw-r--r--test/CodeGen/XCore/ashr.ll16
-rw-r--r--test/CodeGen/XCore/atomic.ll16
-rw-r--r--test/CodeGen/XCore/byVal.ll15
-rw-r--r--test/CodeGen/XCore/exception.ll129
-rw-r--r--test/CodeGen/XCore/globals.ll2
-rw-r--r--test/CodeGen/XCore/linkage.ll38
-rw-r--r--test/CodeGen/XCore/lit.local.cfg2
-rw-r--r--test/CodeGen/XCore/shedulingPreference.ll25
-rw-r--r--test/CodeGen/XCore/threads.ll110
-rw-r--r--test/CodeGen/XCore/zextfree.ll15
1022 files changed, 104760 insertions, 5397 deletions
diff --git a/test/CodeGen/AArch64/adrp-relocation.ll b/test/CodeGen/AArch64/adrp-relocation.ll
deleted file mode 100644
index 1e12d69..0000000
--- a/test/CodeGen/AArch64/adrp-relocation.ll
+++ /dev/null
@@ -1,27 +0,0 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -filetype=obj < %s | llvm-readobj -s -r | FileCheck %s
-
-define i64 @testfn() nounwind {
-entry:
- ret i64 0
-}
-
-define i64 @foo() nounwind {
-entry:
- %bar = alloca i64 ()*, align 8
- store i64 ()* @testfn, i64 ()** %bar, align 8
- %call = call i64 @testfn()
- ret i64 %call
-}
-
-; The above should produce an ADRP/ADD pair to calculate the address of
-; testfn. The important point is that LLVM shouldn't think it can deal with the
-; relocation on the ADRP itself (even though it knows everything about the
-; relative offsets of testfn and foo) because its value depends on where this
-; object file's .text section gets relocated in memory.
-
-; CHECK: Relocations [
-; CHECK-NEXT: Section (2) .rela.text {
-; CHECK-NEXT: 0x10 R_AARCH64_ADR_PREL_PG_HI21 testfn 0x0
-; CHECK-NEXT: 0x14 R_AARCH64_ADD_ABS_LO12_NC testfn 0x0
-; CHECK-NEXT: }
-; CHECK-NEXT: ]
diff --git a/test/CodeGen/AArch64/alloca.ll b/test/CodeGen/AArch64/alloca.ll
index a84217f..1d3c0a0 100644
--- a/test/CodeGen/AArch64/alloca.ll
+++ b/test/CodeGen/AArch64/alloca.ll
@@ -1,4 +1,5 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP %s
declare void @use_addr(i8*)
@@ -7,13 +8,13 @@ define void @test_simple_alloca(i64 %n) {
%buf = alloca i8, i64 %n
; Make sure we align the stack change to 16 bytes:
-; CHECK: add [[SPDELTA:x[0-9]+]], x0, #15
-; CHECK: and x0, [[SPDELTA]], #0xfffffffffffffff0
+; CHECK-DAG: add [[SPDELTA:x[0-9]+]], x0, #15
+; CHECK-DAG: and x0, [[SPDELTA]], #0xfffffffffffffff0
; Make sure we change SP. It would be surprising if anything but x0 were used
; for the final sp, but it could be if it was then moved into x0.
-; CHECK: mov [[TMP:x[0-9]+]], sp
-; CHECK: sub x0, [[TMP]], [[SPDELTA]]
+; CHECK-DAG: mov [[TMP:x[0-9]+]], sp
+; CHECK-DAG: sub x0, [[TMP]], [[SPDELTA]]
; CHECK: mov sp, x0
call void @use_addr(i8* %buf)
@@ -37,13 +38,13 @@ define i64 @test_alloca_with_local(i64 %n) {
%loc = alloca i64
%buf = alloca i8, i64 %n
; Make sure we align the stack change to 16 bytes:
-; CHECK: add [[SPDELTA:x[0-9]+]], x0, #15
-; CHECK: and x0, [[SPDELTA]], #0xfffffffffffffff0
+; CHECK-DAG: add [[SPDELTA:x[0-9]+]], x0, #15
+; CHECK-DAG: and x0, [[SPDELTA]], #0xfffffffffffffff0
; Make sure we change SP. It would be surprising if anything but x0 were used
; for the final sp, but it could be if it was then moved into x0.
-; CHECK: mov [[TMP:x[0-9]+]], sp
-; CHECK: sub x0, [[TMP]], [[SPDELTA]]
+; CHECK-DAG: mov [[TMP:x[0-9]+]], sp
+; CHECK-DAG: sub x0, [[TMP]], [[SPDELTA]]
; CHECK: mov sp, x0
; Obviously suboptimal code here, but it to get &local in x1
@@ -66,16 +67,22 @@ define i64 @test_alloca_with_local(i64 %n) {
}
define void @test_variadic_alloca(i64 %n, ...) {
-; CHECK-LABEL: test_variadic_alloca:
+; CHECK: test_variadic_alloca:
; CHECK: sub sp, sp, #208
; CHECK: stp x29, x30, [sp, #192]
; CHECK: add x29, sp, #192
; CHECK: sub [[TMP:x[0-9]+]], x29, #192
; CHECK: add x8, [[TMP]], #0
-; CHECK: str q7, [x8, #112]
+; CHECK-FP: str q7, [x8, #112]
; [...]
-; CHECK: str q1, [x8, #16]
+; CHECK-FP: str q1, [x8, #16]
+
+; CHECK-NOFP: sub sp, sp, #80
+; CHECK-NOFP: stp x29, x30, [sp, #64]
+; CHECK-NOFP: add x29, sp, #64
+; CHECK-NOFP: sub [[TMP:x[0-9]+]], x29, #64
+; CHECK-NOFP: add x8, [[TMP]], #0
%addr = alloca i8, i64 %n
@@ -86,6 +93,10 @@ define void @test_variadic_alloca(i64 %n, ...) {
; CHECK: sub sp, x29, #192
; CHECK: ldp x29, x30, [sp, #192]
; CHECK: add sp, sp, #208
+
+; CHECK-NOFP: sub sp, x29, #64
+; CHECK-NOFP: ldp x29, x30, [sp, #64]
+; CHECK-NOFP: add sp, sp, #80
}
define void @test_alloca_large_frame(i64 %n) {
@@ -112,16 +123,16 @@ declare i8* @llvm.stacksave()
declare void @llvm.stackrestore(i8*)
define void @test_scoped_alloca(i64 %n) {
-; CHECK: test_scoped_alloca
+; CHECK-LABEL: test_scoped_alloca:
; CHECK: sub sp, sp, #32
%sp = call i8* @llvm.stacksave()
; CHECK: mov [[SAVED_SP:x[0-9]+]], sp
+; CHECK: mov [[OLDSP:x[0-9]+]], sp
%addr = alloca i8, i64 %n
; CHECK: and [[SPDELTA:x[0-9]+]], {{x[0-9]+}}, #0xfffffffffffffff0
-; CHECK: mov [[OLDSP:x[0-9]+]], sp
-; CHECK: sub [[NEWSP:x[0-9]+]], [[OLDSP]], [[SPDELTA]]
+; CHECK-DAG: sub [[NEWSP:x[0-9]+]], [[OLDSP]], [[SPDELTA]]
; CHECK: mov sp, [[NEWSP]]
call void @use_addr(i8* %addr)
diff --git a/test/CodeGen/AArch64/basic-pic.ll b/test/CodeGen/AArch64/basic-pic.ll
index 1b14be2..682b7ba 100644
--- a/test/CodeGen/AArch64/basic-pic.ll
+++ b/test/CodeGen/AArch64/basic-pic.ll
@@ -1,10 +1,7 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic %s -o - | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic -filetype=obj %s -o -| llvm-objdump -r - | FileCheck --check-prefix=CHECK-ELF %s
@var = global i32 0
-; CHECK-ELF: RELOCATION RECORDS FOR [.rela.text]
-
define i32 @get_globalvar() {
; CHECK-LABEL: get_globalvar:
@@ -13,8 +10,6 @@ define i32 @get_globalvar() {
; CHECK: ldr x[[GOTLOC:[0-9]+]], [x[[GOTHI]], #:got_lo12:var]
; CHECK: ldr w0, [x[[GOTLOC]]]
-; CHECK-ELF: R_AARCH64_ADR_GOT_PAGE var
-; CHECK-ELF: R_AARCH64_LD64_GOT_LO12_NC var
ret i32 %val
}
@@ -25,8 +20,6 @@ define i32* @get_globalvaraddr() {
; CHECK: adrp x[[GOTHI:[0-9]+]], :got:var
; CHECK: ldr x0, [x[[GOTHI]], #:got_lo12:var]
-; CHECK-ELF: R_AARCH64_ADR_GOT_PAGE var
-; CHECK-ELF: R_AARCH64_LD64_GOT_LO12_NC var
ret i32* @var
}
@@ -39,8 +32,6 @@ define i32 @get_hiddenvar() {
; CHECK: adrp x[[HI:[0-9]+]], hiddenvar
; CHECK: ldr w0, [x[[HI]], #:lo12:hiddenvar]
-; CHECK-ELF: R_AARCH64_ADR_PREL_PG_HI21 hiddenvar
-; CHECK-ELF: R_AARCH64_LDST32_ABS_LO12_NC hiddenvar
ret i32 %val
}
@@ -51,8 +42,6 @@ define i32* @get_hiddenvaraddr() {
; CHECK: adrp [[HI:x[0-9]+]], hiddenvar
; CHECK: add x0, [[HI]], #:lo12:hiddenvar
-; CHECK-ELF: R_AARCH64_ADR_PREL_PG_HI21 hiddenvar
-; CHECK-ELF: R_AARCH64_ADD_ABS_LO12_NC hiddenvar
ret i32* @hiddenvar
}
@@ -62,9 +51,4 @@ define void()* @get_func() {
ret void()* bitcast(void()*()* @get_func to void()*)
; CHECK: adrp x[[GOTHI:[0-9]+]], :got:get_func
; CHECK: ldr x0, [x[[GOTHI]], #:got_lo12:get_func]
-
- ; Particularly important that the ADRP gets a relocation, LLVM tends to think
- ; it can relax it because it knows where get_func is. It can't!
-; CHECK-ELF: R_AARCH64_ADR_GOT_PAGE get_func
-; CHECK-ELF: R_AARCH64_LD64_GOT_LO12_NC get_func
}
diff --git a/test/CodeGen/AArch64/cond-sel.ll b/test/CodeGen/AArch64/cond-sel.ll
index 48c50a1..9c1dfeb 100644
--- a/test/CodeGen/AArch64/cond-sel.ll
+++ b/test/CodeGen/AArch64/cond-sel.ll
@@ -1,4 +1,5 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
@var32 = global i32 0
@var64 = global i64 0
@@ -9,16 +10,16 @@ define void @test_csel(i32 %lhs32, i32 %rhs32, i64 %lhs64) {
%tst1 = icmp ugt i32 %lhs32, %rhs32
%val1 = select i1 %tst1, i32 42, i32 52
store i32 %val1, i32* @var32
-; CHECK: movz [[W52:w[0-9]+]], #52
-; CHECK: movz [[W42:w[0-9]+]], #42
+; CHECK-DAG: movz [[W52:w[0-9]+]], #52
+; CHECK-DAG: movz [[W42:w[0-9]+]], #42
; CHECK: csel {{w[0-9]+}}, [[W42]], [[W52]], hi
%rhs64 = sext i32 %rhs32 to i64
%tst2 = icmp sle i64 %lhs64, %rhs64
%val2 = select i1 %tst2, i64 %lhs64, i64 %rhs64
store i64 %val2, i64* @var64
-; CHECK: cmp [[LHS:x[0-9]+]], [[RHS:w[0-9]+]], sxtw
-; CHECK: sxtw [[EXT_RHS:x[0-9]+]], [[RHS]]
+; CHECK-DAG: cmp [[LHS:x[0-9]+]], [[RHS:w[0-9]+]], sxtw
+; CHECK-DAG: sxtw [[EXT_RHS:x[0-9]+]], [[RHS]]
; CHECK: csel {{x[0-9]+}}, [[LHS]], [[EXT_RHS]], le
ret void
@@ -30,6 +31,7 @@ define void @test_floatcsel(float %lhs32, float %rhs32, double %lhs64, double %r
%tst1 = fcmp one float %lhs32, %rhs32
; CHECK: fcmp {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFP-NOT: fcmp
%val1 = select i1 %tst1, i32 42, i32 52
store i32 %val1, i32* @var32
; CHECK: movz [[W52:w[0-9]+]], #52
@@ -40,6 +42,7 @@ define void @test_floatcsel(float %lhs32, float %rhs32, double %lhs64, double %r
%tst2 = fcmp ueq double %lhs64, %rhs64
; CHECK: fcmp {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK-NOFP-NOT: fcmp
%val2 = select i1 %tst2, i64 9, i64 15
store i64 %val2, i64* @var64
; CHECK: movz [[CONST15:x[0-9]+]], #15
diff --git a/test/CodeGen/AArch64/directcond.ll b/test/CodeGen/AArch64/directcond.ll
index 13f032d..12c7b6a 100644
--- a/test/CodeGen/AArch64/directcond.ll
+++ b/test/CodeGen/AArch64/directcond.ll
@@ -1,4 +1,5 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
define i32 @test_select_i32(i1 %bit, i32 %a, i32 %b) {
; CHECK-LABEL: test_select_i32:
@@ -26,7 +27,7 @@ define float @test_select_float(i1 %bit, float %a, float %b) {
; CHECK: movz [[ONE:w[0-9]+]], #1
; CHECK: tst w0, [[ONE]]
; CHECK-NEXT: fcsel s0, s0, s1, ne
-
+; CHECK-NOFP-NOT: fcsel
ret float %val
}
@@ -36,6 +37,7 @@ define double @test_select_double(i1 %bit, double %a, double %b) {
; CHECK: movz [[ONE:w[0-9]+]], #1
; CHECK: tst w0, [[ONE]]
; CHECK-NEXT: fcsel d0, d0, d1, ne
+; CHECK-NOFP-NOT: fcsel
ret double %val
}
@@ -56,6 +58,7 @@ define i1 @test_setcc_float(float %lhs, float %rhs) {
%val = fcmp oeq float %lhs, %rhs
; CHECK: fcmp s0, s1
; CHECK: csinc w0, wzr, wzr, ne
+; CHECK-NOFP-NOT: fcmp
ret i1 %val
}
@@ -64,6 +67,7 @@ define i1 @test_setcc_double(double %lhs, double %rhs) {
%val = fcmp oeq double %lhs, %rhs
; CHECK: fcmp d0, d1
; CHECK: csinc w0, wzr, wzr, ne
+; CHECK-NOFP-NOT: fcmp
ret i1 %val
}
diff --git a/test/CodeGen/AArch64/elf-extern.ll b/test/CodeGen/AArch64/elf-extern.ll
deleted file mode 100644
index e09aa12..0000000
--- a/test/CodeGen/AArch64/elf-extern.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -filetype=obj | llvm-readobj -r | FileCheck %s
-
-; External symbols are a different concept to global variables but should still
-; get relocations and so on when used.
-
-declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1)
-
-define i32 @check_extern() {
- call void @llvm.memcpy.p0i8.p0i8.i32(i8* undef, i8* undef, i32 undef, i32 4, i1 0)
- ret i32 0
-}
-
-; CHECK: Relocations [
-; CHECK: Section (2) .rela.text {
-; CHECK: 0x{{[0-9,A-F]+}} R_AARCH64_CALL26 memcpy
-; CHECK: }
-; CHECK: ]
diff --git a/test/CodeGen/AArch64/fcvt-int.ll b/test/CodeGen/AArch64/fcvt-int.ll
index 9afcfc4..b28eb3e 100644
--- a/test/CodeGen/AArch64/fcvt-int.ll
+++ b/test/CodeGen/AArch64/fcvt-int.ll
@@ -5,8 +5,8 @@ define i32 @test_floattoi32(float %in) {
%signed = fptosi float %in to i32
%unsigned = fptoui float %in to i32
-; CHECK: fcvtzu [[UNSIG:w[0-9]+]], {{s[0-9]+}}
-; CHECK: fcvtzs [[SIG:w[0-9]+]], {{s[0-9]+}}
+; CHECK-DAG: fcvtzu [[UNSIG:w[0-9]+]], {{s[0-9]+}}
+; CHECK-DAG: fcvtzs [[SIG:w[0-9]+]], {{s[0-9]+}}
%res = sub i32 %signed, %unsigned
; CHECK: sub {{w[0-9]+}}, [[SIG]], [[UNSIG]]
@@ -20,8 +20,8 @@ define i32 @test_doubletoi32(double %in) {
%signed = fptosi double %in to i32
%unsigned = fptoui double %in to i32
-; CHECK: fcvtzu [[UNSIG:w[0-9]+]], {{d[0-9]+}}
-; CHECK: fcvtzs [[SIG:w[0-9]+]], {{d[0-9]+}}
+; CHECK-DAG: fcvtzu [[UNSIG:w[0-9]+]], {{d[0-9]+}}
+; CHECK-DAG: fcvtzs [[SIG:w[0-9]+]], {{d[0-9]+}}
%res = sub i32 %signed, %unsigned
; CHECK: sub {{w[0-9]+}}, [[SIG]], [[UNSIG]]
@@ -35,8 +35,8 @@ define i64 @test_floattoi64(float %in) {
%signed = fptosi float %in to i64
%unsigned = fptoui float %in to i64
-; CHECK: fcvtzu [[UNSIG:x[0-9]+]], {{s[0-9]+}}
-; CHECK: fcvtzs [[SIG:x[0-9]+]], {{s[0-9]+}}
+; CHECK-DAG: fcvtzu [[UNSIG:x[0-9]+]], {{s[0-9]+}}
+; CHECK-DAG: fcvtzs [[SIG:x[0-9]+]], {{s[0-9]+}}
%res = sub i64 %signed, %unsigned
; CHECK: sub {{x[0-9]+}}, [[SIG]], [[UNSIG]]
@@ -50,8 +50,8 @@ define i64 @test_doubletoi64(double %in) {
%signed = fptosi double %in to i64
%unsigned = fptoui double %in to i64
-; CHECK: fcvtzu [[UNSIG:x[0-9]+]], {{d[0-9]+}}
-; CHECK: fcvtzs [[SIG:x[0-9]+]], {{d[0-9]+}}
+; CHECK-DAG: fcvtzu [[UNSIG:x[0-9]+]], {{d[0-9]+}}
+; CHECK-DAG: fcvtzs [[SIG:x[0-9]+]], {{d[0-9]+}}
%res = sub i64 %signed, %unsigned
; CHECK: sub {{x[0-9]+}}, [[SIG]], [[UNSIG]]
@@ -65,8 +65,8 @@ define float @test_i32tofloat(i32 %in) {
%signed = sitofp i32 %in to float
%unsigned = uitofp i32 %in to float
-; CHECK: ucvtf [[UNSIG:s[0-9]+]], {{w[0-9]+}}
-; CHECK: scvtf [[SIG:s[0-9]+]], {{w[0-9]+}}
+; CHECK-DAG: ucvtf [[UNSIG:s[0-9]+]], {{w[0-9]+}}
+; CHECK-DAG: scvtf [[SIG:s[0-9]+]], {{w[0-9]+}}
%res = fsub float %signed, %unsigned
; CHECK: fsub {{s[0-9]+}}, [[SIG]], [[UNSIG]]
@@ -79,8 +79,8 @@ define double @test_i32todouble(i32 %in) {
%signed = sitofp i32 %in to double
%unsigned = uitofp i32 %in to double
-; CHECK: ucvtf [[UNSIG:d[0-9]+]], {{w[0-9]+}}
-; CHECK: scvtf [[SIG:d[0-9]+]], {{w[0-9]+}}
+; CHECK-DAG: ucvtf [[UNSIG:d[0-9]+]], {{w[0-9]+}}
+; CHECK-DAG: scvtf [[SIG:d[0-9]+]], {{w[0-9]+}}
%res = fsub double %signed, %unsigned
; CHECK: fsub {{d[0-9]+}}, [[SIG]], [[UNSIG]]
@@ -93,8 +93,8 @@ define float @test_i64tofloat(i64 %in) {
%signed = sitofp i64 %in to float
%unsigned = uitofp i64 %in to float
-; CHECK: ucvtf [[UNSIG:s[0-9]+]], {{x[0-9]+}}
-; CHECK: scvtf [[SIG:s[0-9]+]], {{x[0-9]+}}
+; CHECK-DAG: ucvtf [[UNSIG:s[0-9]+]], {{x[0-9]+}}
+; CHECK-DAG: scvtf [[SIG:s[0-9]+]], {{x[0-9]+}}
%res = fsub float %signed, %unsigned
; CHECK: fsub {{s[0-9]+}}, [[SIG]], [[UNSIG]]
@@ -107,8 +107,8 @@ define double @test_i64todouble(i64 %in) {
%signed = sitofp i64 %in to double
%unsigned = uitofp i64 %in to double
-; CHECK: ucvtf [[UNSIG:d[0-9]+]], {{x[0-9]+}}
-; CHECK: scvtf [[SIG:d[0-9]+]], {{x[0-9]+}}
+; CHECK-DAG: ucvtf [[UNSIG:d[0-9]+]], {{x[0-9]+}}
+; CHECK-DAG: scvtf [[SIG:d[0-9]+]], {{x[0-9]+}}
%res = fsub double %signed, %unsigned
; CHECK: fsub {{d[0-9]+}}, [[SIG]], [[UNSIG]]
diff --git a/test/CodeGen/AArch64/fp-dp3.ll b/test/CodeGen/AArch64/fp-dp3.ll
index 3a9a6fc..590557f 100644
--- a/test/CodeGen/AArch64/fp-dp3.ll
+++ b/test/CodeGen/AArch64/fp-dp3.ll
@@ -129,8 +129,9 @@ define float @test_fnmsub_unfused(float %a, float %b, float %c) {
%diff = fsub float %nega, %prod
; CHECK: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
; CHECK-NOFAST-NOT: fnmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-; CHECK-NOFAST: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-; CHECK-NOFAST: fneg {{s[0-9]+}}, {{s[0-9]+}}
-; CHECK-NOFAST: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST-DAG: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST-DAG: fneg {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST-DAG: fsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-NOFAST: ret
ret float %diff
}
diff --git a/test/CodeGen/AArch64/fp128.ll b/test/CodeGen/AArch64/fp128.ll
index 853c03d..c312bb1 100644
--- a/test/CodeGen/AArch64/fp128.ll
+++ b/test/CodeGen/AArch64/fp128.ll
@@ -150,14 +150,14 @@ define i1 @test_setcc2() {
; Technically, everything after the call to __letf2 is redundant, but we'll let
; LLVM have its fun for now.
%val = fcmp ugt fp128 %lhs, %rhs
-; CHECK: bl __unordtf2
-; CHECK: mov x[[UNORDERED:[0-9]+]], x0
-
; CHECK: bl __gttf2
; CHECK: cmp w0, #0
; CHECK: csinc [[GT:w[0-9]+]], wzr, wzr, le
-; CHECK: cmp w[[UNORDERED]], #0
+
+; CHECK: bl __unordtf2
+; CHECK: cmp w0, #0
; CHECK: csinc [[UNORDERED:w[0-9]+]], wzr, wzr, eq
+
; CHECK: orr w0, [[UNORDERED]], [[GT]]
ret i1 %val
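; Editor's note: the reshuffled CHECK lines follow the order in which the
; libcalls are now emitted: a soft-float fp128 "ugt" comparison expands into
; an ordered greater-than test plus an unordered test, and the two boolean
; results are OR'd together. A hand-written sketch of the lowering being
; matched (for orientation only, not code from this patch), relying on the
; usual libgcc/compiler-rt semantics that __gttf2 returns a positive value
; iff lhs > rhs and __unordtf2 returns non-zero iff either operand is NaN:

declare i32 @__gttf2(fp128, fp128)
declare i32 @__unordtf2(fp128, fp128)

define i1 @ugt_expanded(fp128 %lhs, fp128 %rhs) {
  ; ordered greater-than half of the predicate
  %gt.ret = call i32 @__gttf2(fp128 %lhs, fp128 %rhs)
  %gt = icmp sgt i32 %gt.ret, 0
  ; unordered (NaN) half of the predicate
  %unord.ret = call i32 @__unordtf2(fp128 %lhs, fp128 %rhs)
  %unord = icmp ne i32 %unord.ret, 0
  ; "ugt" is the OR of the two, corresponding to the final orr check above
  %res = or i1 %gt, %unord
  ret i1 %res
}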
@@ -174,15 +174,14 @@ define i32 @test_br_cc() {
; olt == !uge, which is what LLVM unfortunately "optimizes" this to.
%cond = fcmp olt fp128 %lhs, %rhs
-; CHECK: bl __unordtf2
-; CHECK: mov x[[UNORDERED:[0-9]+]], x0
-
; CHECK: bl __getf2
; CHECK: cmp w0, #0
-
; CHECK: csinc [[OGE:w[0-9]+]], wzr, wzr, lt
-; CHECK: cmp w[[UNORDERED]], #0
+
+; CHECK: bl __unordtf2
+; CHECK: cmp w0, #0
; CHECK: csinc [[UNORDERED:w[0-9]+]], wzr, wzr, eq
+
; CHECK: orr [[UGE:w[0-9]+]], [[UNORDERED]], [[OGE]]
; CHECK: cbnz [[UGE]], [[RET29:.LBB[0-9]+_[0-9]+]]
br i1 %cond, label %iftrue, label %iffalse
diff --git a/test/CodeGen/AArch64/fpimm.ll b/test/CodeGen/AArch64/fpimm.ll
index ccf7c8a..b8f7169 100644
--- a/test/CodeGen/AArch64/fpimm.ll
+++ b/test/CodeGen/AArch64/fpimm.ll
@@ -9,12 +9,13 @@ define void @check_float() {
%val = load float* @varf32
%newval1 = fadd float %val, 8.5
store volatile float %newval1, float* @varf32
-; CHECK: fmov {{s[0-9]+}}, #8.5
+; CHECK-DAG: fmov [[EIGHT5:s[0-9]+]], #8.5
%newval2 = fadd float %val, 128.0
store volatile float %newval2, float* @varf32
-; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, #:lo12:.LCPI0_0
+; CHECK-DAG: ldr [[HARD:s[0-9]+]], [{{x[0-9]+}}, #:lo12:.LCPI0_0
+; CHECK: ret
ret void
}
@@ -24,11 +25,12 @@ define void @check_double() {
%val = load double* @varf64
%newval1 = fadd double %val, 8.5
store volatile double %newval1, double* @varf64
-; CHECK: fmov {{d[0-9]+}}, #8.5
+; CHECK-DAG: fmov {{d[0-9]+}}, #8.5
%newval2 = fadd double %val, 128.0
store volatile double %newval2, double* @varf64
-; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.LCPI1_0
+; CHECK-DAG: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.LCPI1_0
+; CHECK: ret
ret void
}
diff --git a/test/CodeGen/AArch64/frameaddr.ll b/test/CodeGen/AArch64/frameaddr.ll
new file mode 100644
index 0000000..182704b
--- /dev/null
+++ b/test/CodeGen/AArch64/frameaddr.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+
+define i8* @t() nounwind {
+entry:
+; CHECK-LABEL: t:
+; CHECK: mov x0, x29
+ %0 = call i8* @llvm.frameaddress(i32 0)
+ ret i8* %0
+}
+
+define i8* @t2() nounwind {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: ldr x[[reg:[0-9]+]], [x29]
+; CHECK: ldr x[[reg]], [x[[reg]]]
+ %0 = call i8* @llvm.frameaddress(i32 2)
+ ret i8* %0
+}
+
+declare i8* @llvm.frameaddress(i32) nounwind readnone
diff --git a/test/CodeGen/AArch64/func-argpassing.ll b/test/CodeGen/AArch64/func-argpassing.ll
index 15f8e76..430d77f 100644
--- a/test/CodeGen/AArch64/func-argpassing.ll
+++ b/test/CodeGen/AArch64/func-argpassing.ll
@@ -1,4 +1,5 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
%myStruct = type { i64 , i8, i32 }
@@ -23,6 +24,7 @@ define void @add_floats(float %val1, float %val2) {
; CHECK-LABEL: add_floats:
%newval = fadd float %val1, %val2
; CHECK: fadd [[ADDRES:s[0-9]+]], s0, s1
+; CHECK-NOFP-NOT: fadd
store float %newval, float* @varfloat
; CHECK: str [[ADDRES]], [{{x[0-9]+}}, #:lo12:varfloat]
ret void
@@ -35,15 +37,15 @@ define void @take_struct(%myStruct* byval %structval) {
%addr0 = getelementptr %myStruct* %structval, i64 0, i32 2
%addr1 = getelementptr %myStruct* %structval, i64 0, i32 0
- %val0 = load i32* %addr0
+ %val0 = load volatile i32* %addr0
; Some weird move means x0 is used for one access
; CHECK: ldr [[REG32:w[0-9]+]], [{{x[0-9]+|sp}}, #12]
- store i32 %val0, i32* @var32
+ store volatile i32 %val0, i32* @var32
; CHECK: str [[REG32]], [{{x[0-9]+}}, #:lo12:var32]
- %val1 = load i64* %addr1
+ %val1 = load volatile i64* %addr1
; CHECK: ldr [[REG64:x[0-9]+]], [{{x[0-9]+|sp}}]
- store i64 %val1, i64* @var64
+ store volatile i64 %val1, i64* @var64
; CHECK: str [[REG64]], [{{x[0-9]+}}, #:lo12:var64]
ret void
@@ -56,14 +58,14 @@ define void @check_byval_align(i32* byval %ignore, %myStruct* byval align 16 %st
%addr0 = getelementptr %myStruct* %structval, i64 0, i32 2
%addr1 = getelementptr %myStruct* %structval, i64 0, i32 0
- %val0 = load i32* %addr0
+ %val0 = load volatile i32* %addr0
; Some weird move means x0 is used for one access
; CHECK: add x[[STRUCTVAL_ADDR:[0-9]+]], sp, #16
; CHECK: ldr [[REG32:w[0-9]+]], [x[[STRUCTVAL_ADDR]], #12]
store i32 %val0, i32* @var32
; CHECK: str [[REG32]], [{{x[0-9]+}}, #:lo12:var32]
- %val1 = load i64* %addr1
+ %val1 = load volatile i64* %addr1
; CHECK: ldr [[REG64:x[0-9]+]], [sp, #16]
store i64 %val1, i64* @var64
; CHECK: str [[REG64]], [{{x[0-9]+}}, #:lo12:var64]
@@ -84,6 +86,7 @@ define double @return_double() {
; CHECK-LABEL: return_double:
ret double 3.14
; CHECK: ldr d0, [{{x[0-9]+}}, #:lo12:.LCPI
+; CHECK-NOFP-NOT: ldr d0,
}
; This is the kind of IR clang will produce for returning a struct
@@ -130,17 +133,18 @@ define i32 @struct_on_stack(i8 %var0, i16 %var1, i32 %var2, i64 %var3, i128 %var
double %notstacked) {
; CHECK-LABEL: struct_on_stack:
%addr = getelementptr %myStruct* %struct, i64 0, i32 0
- %val64 = load i64* %addr
- store i64 %val64, i64* @var64
+ %val64 = load volatile i64* %addr
+ store volatile i64 %val64, i64* @var64
; Currently nothing on local stack, so struct should be at sp
; CHECK: ldr [[VAL64:x[0-9]+]], [sp]
; CHECK: str [[VAL64]], [{{x[0-9]+}}, #:lo12:var64]
- store double %notstacked, double* @vardouble
+ store volatile double %notstacked, double* @vardouble
; CHECK-NOT: ldr d0
; CHECK: str d0, [{{x[0-9]+}}, #:lo12:vardouble
+; CHECK-NOFP-NOT: str d0,
- %retval = load i32* %stacked
+ %retval = load volatile i32* %stacked
ret i32 %retval
; CHECK: ldr w0, [sp, #16]
}
@@ -176,10 +180,10 @@ define void @check_i128_stackalign(i32 %val0, i32 %val1, i32 %val2, i32 %val3,
; CHECK: check_i128_stackalign
store i128 %stack2, i128* @var128
; Nothing local on stack in current codegen, so first stack is 16 away
-; CHECK: ldr {{x[0-9]+}}, [sp, #16]
+; CHECK: add x[[REG:[0-9]+]], sp, #16
+; CHECK: ldr {{x[0-9]+}}, [x[[REG]], #8]
; Important point is that we address sp+24 for second dword
-; CHECK: add [[REG:x[0-9]+]], sp, #16
-; CHECK: ldr {{x[0-9]+}}, {{\[}}[[REG]], #8]
+; CHECK: ldr {{x[0-9]+}}, [sp, #16]
ret void
}
diff --git a/test/CodeGen/AArch64/func-calls.ll b/test/CodeGen/AArch64/func-calls.ll
index b12130b..ac188bb 100644
--- a/test/CodeGen/AArch64/func-calls.ll
+++ b/test/CodeGen/AArch64/func-calls.ll
@@ -1,4 +1,5 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
%myStruct = type { i64 , i8, i32 }
@@ -21,16 +22,18 @@ define void @simple_args() {
%char1 = load i8* @var8
%char2 = load i8* @var8_2
call void @take_i8s(i8 %char1, i8 %char2)
-; CHECK: ldrb w0, [{{x[0-9]+}}, #:lo12:var8]
-; CHECK: ldrb w1, [{{x[0-9]+}}, #:lo12:var8_2]
+; CHECK-DAG: ldrb w0, [{{x[0-9]+}}, #:lo12:var8]
+; CHECK-DAG: ldrb w1, [{{x[0-9]+}}, #:lo12:var8_2]
; CHECK: bl take_i8s
%float1 = load float* @varfloat
%float2 = load float* @varfloat_2
call void @take_floats(float %float1, float %float2)
-; CHECK: ldr s1, [{{x[0-9]+}}, #:lo12:varfloat_2]
-; CHECK: ldr s0, [{{x[0-9]+}}, #:lo12:varfloat]
+; CHECK-DAG: ldr s1, [{{x[0-9]+}}, #:lo12:varfloat_2]
+; CHECK-DAG: ldr s0, [{{x[0-9]+}}, #:lo12:varfloat]
; CHECK: bl take_floats
+; CHECK-NOFP-NOT: ldr s1,
+; CHECK-NOFP-NOT: ldr s0,
ret void
}
@@ -52,6 +55,7 @@ define void @simple_rets() {
store double %dbl, double* @vardouble
; CHECK: bl return_double
; CHECK: str d0, [{{x[0-9]+}}, #:lo12:vardouble]
+; CHECK-NOFP-NOT: str d0,
%arr = call [2 x i64] @return_smallstruct()
store [2 x i64] %arr, [2 x i64]* @varsmallstruct
@@ -75,17 +79,19 @@ declare void @stacked_fpu(float %var0, double %var1, float %var2, float %var3,
float %var8)
define void @check_stack_args() {
+; CHECK-LABEL: check_stack_args:
call i32 @struct_on_stack(i8 0, i16 12, i32 42, i64 99, i128 1,
i32* @var32, %myStruct* byval @varstruct,
i32 999, double 1.0)
; Want to check that the final double is passed in registers and
; that varstruct is passed on the stack. Rather dependent on how a
; memcpy gets created, but the following works for now.
-; CHECK: mov x0, sp
-; CHECK: str {{w[0-9]+}}, [x0]
-; CHECK: str {{w[0-9]+}}, [x0, #12]
-; CHECK: fmov d0,
+; CHECK: mov x[[SPREG:[0-9]+]], sp
+; CHECK-DAG: str {{w[0-9]+}}, [x[[SPREG]]]
+; CHECK-DAG: str {{w[0-9]+}}, [x[[SPREG]], #12]
+; CHECK-DAG: fmov d0,
; CHECK: bl struct_on_stack
+; CHECK-NOFP-NOT: fmov
call void @stacked_fpu(float -1.0, double 1.0, float 4.0, float 2.0,
float -2.0, float -8.0, float 16.0, float 1.0,
diff --git a/test/CodeGen/AArch64/inline-asm-modifiers.ll b/test/CodeGen/AArch64/inline-asm-modifiers.ll
index d1b21f8..b7f4d3c 100644
--- a/test/CodeGen/AArch64/inline-asm-modifiers.ll
+++ b/test/CodeGen/AArch64/inline-asm-modifiers.ll
@@ -1,5 +1,4 @@
; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-ELF %s
@var_simple = hidden global i32 0
@var_got = global i32 0
@@ -23,12 +22,10 @@ define void @test_inline_modifier_L() nounwind {
; CHECK: ldr x0, [x0, #:gottprel_lo12:var_tlsie]
; CHECK: add x0, x0, #:tprel_lo12:var_tlsle
-; CHECK-ELF: R_AARCH64_ADD_ABS_LO12_NC var_simple
-; CHECK-ELF: R_AARCH64_LD64_GOT_LO12_NC var_got
-; CHECK-ELF: R_AARCH64_TLSDESC_ADD_LO12_NC var_tlsgd
-; CHECK-ELF: R_AARCH64_TLSLD_ADD_DTPREL_LO12 var_tlsld
-; CHECK-ELF: R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC var_tlsie
-; CHECK-ELF: R_AARCH64_TLSLE_ADD_TPREL_LO12 var_tlsle
+ call void asm sideeffect "add x0, x0, ${0:L}", "Si,~{x0}"(i32 64)
+ call void asm sideeffect "ldr x0, [x0, ${0:L}]", "Si,~{x0}"(i32 64)
+; CHECK: add x0, x0, #64
+; CHECK: ldr x0, [x0, #64]
ret void
}
@@ -40,9 +37,8 @@ define void @test_inline_modifier_G() nounwind {
; CHECK: add x0, x0, #:dtprel_hi12:var_tlsld, lsl #12
; CHECK: add x0, x0, #:tprel_hi12:var_tlsle, lsl #12
-; CHECK-ELF: R_AARCH64_TLSLD_ADD_DTPREL_HI12 var_tlsld
-; CHECK-ELF: R_AARCH64_TLSLE_ADD_TPREL_HI12 var_tlsle
-
+ call void asm sideeffect "add x0, x0, ${0:G}", "Si,~{x0}"(i32 42)
+; CHECK: add x0, x0, #42
ret void
}
@@ -58,10 +54,8 @@ define void @test_inline_modifier_A() nounwind {
; CHECK: adrp x0, :tlsdesc:var_tlsgd
; CHECK: adrp x0, :gottprel:var_tlsie
-; CHECK-ELF: R_AARCH64_ADR_PREL_PG_HI21 var_simple
-; CHECK-ELF: R_AARCH64_ADR_GOT_PAGE var_got
-; CHECK-ELF: R_AARCH64_TLSDESC_ADR_PAGE var_tlsgd
-; CHECK-ELF: R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21 var_tlsie
+ call void asm sideeffect "adrp x0, ${0:A}", "Si,~{x0}"(i32 40)
+; CHECK: adrp x0, #40
ret void
}
@@ -87,6 +81,12 @@ define void @test_inline_modifier_wx(i32 %small, i64 %big) nounwind {
call i32 asm sideeffect "add ${0:x}, ${1:x}, ${1:x}", "=r,r"(i32 0)
; CHECK: add {{w[0-9]+}}, wzr, wzr
; CHECK: add {{x[0-9]+}}, xzr, xzr
+
+ call i32 asm sideeffect "add ${0:w}, ${0:w}, ${1:w}", "=r,Ir,0"(i32 123, i32 %small)
+ call i64 asm sideeffect "add ${0:x}, ${0:x}, ${1:x}", "=r,Ir,0"(i32 456, i64 %big)
+; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #123
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #456
+
ret void
}
@@ -113,6 +113,18 @@ define void @test_inline_modifier_bhsdq() nounwind {
; CHECK: ldr s0, [sp]
; CHECK: ldr d0, [sp]
; CHECK: ldr q0, [sp]
+
+ call void asm sideeffect "fcmp b0, ${0:b}", "Yw"(float 0.0)
+ call void asm sideeffect "fcmp h0, ${0:h}", "Yw"(float 0.0)
+ call void asm sideeffect "fcmp s0, ${0:s}", "Yw"(float 0.0)
+ call void asm sideeffect "fcmp d0, ${0:d}", "Yw"(float 0.0)
+ call void asm sideeffect "fcmp q0, ${0:q}", "Yw"(float 0.0)
+; CHECK: fcmp b0, #0
+; CHECK: fcmp h0, #0
+; CHECK: fcmp s0, #0
+; CHECK: fcmp d0, #0
+; CHECK: fcmp q0, #0
+
ret void
}
@@ -123,3 +135,13 @@ define void @test_inline_modifier_c() nounwind {
ret void
}
+
+define void @test_inline_modifier_a() nounwind {
+; CHECK-LABEL: test_inline_modifier_a:
+ call void asm sideeffect "prfm pldl1keep, ${0:a}", "r"(i32* @var_simple)
+; CHECK: adrp [[VARHI:x[0-9]+]], var_simple
+; CHECK: add x[[VARADDR:[0-9]+]], [[VARHI]], #:lo12:var_simple
+; CHECK: prfm pldl1keep, [x[[VARADDR]]]
+ ret void
+}
+
diff --git a/test/CodeGen/AArch64/jump-table.ll b/test/CodeGen/AArch64/jump-table.ll
index 0f1e760..4bb0942 100644
--- a/test/CodeGen/AArch64/jump-table.ll
+++ b/test/CodeGen/AArch64/jump-table.ll
@@ -1,6 +1,5 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
; RUN: llc -code-model=large -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK-LARGE %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -filetype=obj | llvm-readobj -r | FileCheck %s -check-prefix=CHECK-ELF
define i32 @test_jumptable(i32 %in) {
; CHECK: test_jumptable
@@ -48,19 +47,3 @@ lbl4:
; CHECK-NEXT: .xword
; CHECK-NEXT: .xword
; CHECK-NEXT: .xword
-
-; ELF tests:
-
-; First make sure we get a page/lo12 pair in .text to pick up the jump-table
-
-; CHECK-ELF: Relocations [
-; CHECK-ELF: Section ({{[0-9]+}}) .rela.text {
-; CHECK-ELF-NEXT: 0x{{[0-9,A-F]+}} R_AARCH64_ADR_PREL_PG_HI21 .rodata
-; CHECK-ELF-NEXT: 0x{{[0-9,A-F]+}} R_AARCH64_ADD_ABS_LO12_NC .rodata
-; CHECK-ELF: }
-
-; Also check the targets in .rodata are relocated
-; CHECK-ELF: Section ({{[0-9]+}}) .rela.rodata {
-; CHECK-ELF-NEXT: 0x{{[0-9,A-F]+}} R_AARCH64_ABS64 .text
-; CHECK-ELF: }
-; CHECK-ELF: ]
diff --git a/test/CodeGen/AArch64/ldst-regoffset.ll b/test/CodeGen/AArch64/ldst-regoffset.ll
index c83fb52..db30fd9 100644
--- a/test/CodeGen/AArch64/ldst-regoffset.ll
+++ b/test/CodeGen/AArch64/ldst-regoffset.ll
@@ -1,4 +1,5 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
@var_8bit = global i8 0
@var_16bit = global i16 0
@@ -197,11 +198,13 @@ define void @ldst_float(float* %base, i32 %off32, i64 %off64) {
%val_sxtwN = load volatile float* %addr_sxtwN
store volatile float %val_sxtwN, float* @var_float
; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #2]
+; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
%addr_lslN = getelementptr float* %base, i64 %off64
%val_lslN = load volatile float* %addr_lslN
store volatile float %val_lslN, float* @var_float
; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}, lsl #2]
+; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
%addrint_uxtw = ptrtoint float* %base to i64
%offset_uxtw = zext i32 %off32 to i64
@@ -210,6 +213,7 @@ define void @ldst_float(float* %base, i32 %off32, i64 %off64) {
%val_uxtw = load volatile float* %addr_uxtw
store volatile float %val_uxtw, float* @var_float
; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
%base_sxtw = ptrtoint float* %base to i64
%offset_sxtw = sext i32 %off32 to i64
@@ -218,6 +222,7 @@ define void @ldst_float(float* %base, i32 %off32, i64 %off64) {
%val64_sxtw = load volatile float* %addr_sxtw
store volatile float %val64_sxtw, float* @var_float
; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
%base_lsl = ptrtoint float* %base to i64
%addrint_lsl = add i64 %base_lsl, %off64
@@ -225,6 +230,7 @@ define void @ldst_float(float* %base, i32 %off32, i64 %off64) {
%val64_lsl = load volatile float* %addr_lsl
store volatile float %val64_lsl, float* @var_float
; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}]
+; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
%base_uxtwN = ptrtoint float* %base to i64
%offset_uxtwN = zext i32 %off32 to i64
@@ -234,6 +240,7 @@ define void @ldst_float(float* %base, i32 %off32, i64 %off64) {
%val64 = load volatile float* @var_float
store volatile float %val64, float* %addr_uxtwN
; CHECK: str {{s[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #2]
+; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
ret void
}
@@ -244,11 +251,13 @@ define void @ldst_double(double* %base, i32 %off32, i64 %off64) {
%val_sxtwN = load volatile double* %addr_sxtwN
store volatile double %val_sxtwN, double* @var_double
; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #3]
+; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
%addr_lslN = getelementptr double* %base, i64 %off64
%val_lslN = load volatile double* %addr_lslN
store volatile double %val_lslN, double* @var_double
; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}, lsl #3]
+; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
%addrint_uxtw = ptrtoint double* %base to i64
%offset_uxtw = zext i32 %off32 to i64
@@ -257,6 +266,7 @@ define void @ldst_double(double* %base, i32 %off32, i64 %off64) {
%val_uxtw = load volatile double* %addr_uxtw
store volatile double %val_uxtw, double* @var_double
; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
%base_sxtw = ptrtoint double* %base to i64
%offset_sxtw = sext i32 %off32 to i64
@@ -265,6 +275,7 @@ define void @ldst_double(double* %base, i32 %off32, i64 %off64) {
%val64_sxtw = load volatile double* %addr_sxtw
store volatile double %val64_sxtw, double* @var_double
; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
%base_lsl = ptrtoint double* %base to i64
%addrint_lsl = add i64 %base_lsl, %off64
@@ -272,6 +283,7 @@ define void @ldst_double(double* %base, i32 %off32, i64 %off64) {
%val64_lsl = load volatile double* %addr_lsl
store volatile double %val64_lsl, double* @var_double
; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}]
+; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
%base_uxtwN = ptrtoint double* %base to i64
%offset_uxtwN = zext i32 %off32 to i64
@@ -281,6 +293,7 @@ define void @ldst_double(double* %base, i32 %off32, i64 %off64) {
%val64 = load volatile double* @var_double
store volatile double %val64, double* %addr_uxtwN
; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #3]
+; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
ret void
}
@@ -292,11 +305,13 @@ define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) {
%val_sxtwN = load volatile fp128* %addr_sxtwN
store volatile fp128 %val_sxtwN, fp128* %base
; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
%addr_lslN = getelementptr fp128* %base, i64 %off64
%val_lslN = load volatile fp128* %addr_lslN
store volatile fp128 %val_lslN, fp128* %base
; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}, lsl #4]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
%addrint_uxtw = ptrtoint fp128* %base to i64
%offset_uxtw = zext i32 %off32 to i64
@@ -305,6 +320,7 @@ define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) {
%val_uxtw = load volatile fp128* %addr_uxtw
store volatile fp128 %val_uxtw, fp128* %base
; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
%base_sxtw = ptrtoint fp128* %base to i64
%offset_sxtw = sext i32 %off32 to i64
@@ -313,6 +329,7 @@ define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) {
%val64_sxtw = load volatile fp128* %addr_sxtw
store volatile fp128 %val64_sxtw, fp128* %base
; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
%base_lsl = ptrtoint fp128* %base to i64
%addrint_lsl = add i64 %base_lsl, %off64
@@ -320,6 +337,7 @@ define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) {
%val64_lsl = load volatile fp128* %addr_lsl
store volatile fp128 %val64_lsl, fp128* %base
; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{x[0-9]+}}]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
%base_uxtwN = ptrtoint fp128* %base to i64
%offset_uxtwN = zext i32 %off32 to i64
@@ -329,5 +347,6 @@ define void @ldst_128bit(fp128* %base, i32 %off32, i64 %off64) {
%val64 = load volatile fp128* %base
store volatile fp128 %val64, fp128* %addr_uxtwN
; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, uxtw #4]
+; CHECK-NOFP-NOT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, {{w[0-9]+}}, sxtw #4]
ret void
}
diff --git a/test/CodeGen/AArch64/ldst-unscaledimm.ll b/test/CodeGen/AArch64/ldst-unscaledimm.ll
index 03dedcc..bea5bb5 100644
--- a/test/CodeGen/AArch64/ldst-unscaledimm.ll
+++ b/test/CodeGen/AArch64/ldst-unscaledimm.ll
@@ -1,4 +1,5 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
@var_8bit = global i8 0
@var_16bit = global i16 0
@@ -194,9 +195,11 @@ define void @ldst_float() {
%valfp = load volatile float* %addrfp
; CHECK: ldur {{s[0-9]+}}, [{{x[0-9]+}}, #-5]
+; CHECK-NOFP-NOT: ldur {{s[0-9]+}},
store volatile float %valfp, float* %addrfp
; CHECK: stur {{s[0-9]+}}, [{{x[0-9]+}}, #-5]
+; CHECK-NOFP-NOT: stur {{s[0-9]+}},
ret void
}
@@ -210,9 +213,11 @@ define void @ldst_double() {
%valfp = load volatile double* %addrfp
; CHECK: ldur {{d[0-9]+}}, [{{x[0-9]+}}, #4]
+; CHECK-NOFP-NOT: ldur {{d[0-9]+}},
store volatile double %valfp, double* %addrfp
; CHECK: stur {{d[0-9]+}}, [{{x[0-9]+}}, #4]
+; CHECK-NOFP-NOT: stur {{d[0-9]+}},
ret void
}
diff --git a/test/CodeGen/AArch64/ldst-unsignedimm.ll b/test/CodeGen/AArch64/ldst-unsignedimm.ll
index 77cef4e..44c1586 100644
--- a/test/CodeGen/AArch64/ldst-unsignedimm.ll
+++ b/test/CodeGen/AArch64/ldst-unsignedimm.ll
@@ -1,4 +1,5 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
@var_8bit = global i8 0
@var_16bit = global i16 0
@@ -230,9 +231,11 @@ define void @ldst_float() {
%valfp = load volatile float* @var_float
; CHECK: adrp {{x[0-9]+}}, var_float
; CHECK: ldr {{s[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_float]
+; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
store volatile float %valfp, float* @var_float
; CHECK: str {{s[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_float]
+; CHECK-NOFP-NOT: str {{s[0-9]+}},
ret void
}
@@ -243,9 +246,11 @@ define void @ldst_double() {
%valfp = load volatile double* @var_double
; CHECK: adrp {{x[0-9]+}}, var_double
; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_double]
+; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
store volatile double %valfp, double* @var_double
; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:var_double]
+; CHECK-NOFP-NOT: str {{d[0-9]+}},
ret void
}
diff --git a/test/CodeGen/AArch64/lit.local.cfg b/test/CodeGen/AArch64/lit.local.cfg
index c5ce241..9a66a00 100644
--- a/test/CodeGen/AArch64/lit.local.cfg
+++ b/test/CodeGen/AArch64/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
targets = set(config.root.targets_to_build.split())
if not 'AArch64' in targets:
config.unsupported = True
diff --git a/test/CodeGen/AArch64/literal_pools.ll b/test/CodeGen/AArch64/literal_pools.ll
index b82f290..fc33aee 100644
--- a/test/CodeGen/AArch64/literal_pools.ll
+++ b/test/CodeGen/AArch64/literal_pools.ll
@@ -1,5 +1,7 @@
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -code-model=large | FileCheck --check-prefix=CHECK-LARGE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -code-model=large -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP-LARGE %s
@var32 = global i32 0
@var64 = global i64 0
@@ -65,8 +67,8 @@ define void @floating_lits() {
%floatval = load float* @varfloat
%newfloat = fadd float %floatval, 128.0
; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI1_[0-9]+]]
-; CHECK: ldr {{s[0-9]+}}, [x[[LITBASE]], #:lo12:[[CURLIT]]]
-; CHECK: fadd
+; CHECK: ldr [[LIT128:s[0-9]+]], [x[[LITBASE]], #:lo12:[[CURLIT]]]
+; CHECK-NOFP-NOT: ldr {{s[0-9]+}},
; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI1_[0-9]+]]
; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]]
@@ -74,20 +76,26 @@ define void @floating_lits() {
; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]]
; CHECK-LARGE: ldr {{s[0-9]+}}, [x[[LITADDR]]]
; CHECK-LARGE: fadd
+; CHECK-NOFP-LARGE-NOT: ldr {{s[0-9]+}},
+; CHECK-NOFP-LARGE-NOT: fadd
store float %newfloat, float* @varfloat
%doubleval = load double* @vardouble
%newdouble = fadd double %doubleval, 129.0
; CHECK: adrp x[[LITBASE:[0-9]+]], [[CURLIT:.LCPI1_[0-9]+]]
-; CHECK: ldr {{d[0-9]+}}, [x[[LITBASE]], #:lo12:[[CURLIT]]]
-; CHECK: fadd
+; CHECK: ldr [[LIT129:d[0-9]+]], [x[[LITBASE]], #:lo12:[[CURLIT]]]
+; CHECK: fadd {{s[0-9]+}}, {{s[0-9]+}}, [[LIT128]]
+; CHECK: fadd {{d[0-9]+}}, {{d[0-9]+}}, [[LIT129]]
+; CHECK-NOFP-NOT: ldr {{d[0-9]+}},
+; CHECK-NOFP-NOT: fadd
; CHECK-LARGE: movz x[[LITADDR:[0-9]+]], #:abs_g3:[[CURLIT:.LCPI1_[0-9]+]]
; CHECK-LARGE: movk x[[LITADDR]], #:abs_g2_nc:[[CURLIT]]
; CHECK-LARGE: movk x[[LITADDR]], #:abs_g1_nc:[[CURLIT]]
; CHECK-LARGE: movk x[[LITADDR]], #:abs_g0_nc:[[CURLIT]]
; CHECK-LARGE: ldr {{d[0-9]+}}, [x[[LITADDR]]]
+; CHECK-NOFP-LARGE-NOT: ldr {{d[0-9]+}},
store double %newdouble, double* @vardouble
diff --git a/test/CodeGen/AArch64/neon-2velem-high.ll b/test/CodeGen/AArch64/neon-2velem-high.ll
new file mode 100644
index 0000000..97031d9
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-2velem-high.ll
@@ -0,0 +1,331 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+
+declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
+
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+
+declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>)
+
+declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>)
+
+declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>)
+
+declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>)
+
+declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>)
+
+declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>)
+
+declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>)
+
+declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>)
+
+declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>)
+
+declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>)
+
+define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) {
+; CHECK: test_vmull_high_n_s16:
+; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
+ %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
+ %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
+ %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
+ %vmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ ret <4 x i32> %vmull15.i.i
+}
+
+define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) {
+; CHECK: test_vmull_high_n_s32:
+; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
+ %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
+ %vmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ ret <2 x i64> %vmull9.i.i
+}
+
+define <4 x i32> @test_vmull_high_n_u16(<8 x i16> %a, i16 %b) {
+; CHECK: test_vmull_high_n_u16:
+; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
+ %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
+ %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
+ %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
+ %vmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ ret <4 x i32> %vmull15.i.i
+}
+
+define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) {
+; CHECK: test_vmull_high_n_u32:
+; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
+ %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
+ %vmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ ret <2 x i64> %vmull9.i.i
+}
+
+define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) {
+; CHECK: test_vqdmull_high_n_s16:
+; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
+ %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
+ %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
+ %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
+ %vqdmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ ret <4 x i32> %vqdmull15.i.i
+}
+
+define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) {
+; CHECK: test_vqdmull_high_n_s32:
+; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
+ %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
+ %vqdmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ ret <2 x i64> %vqdmull9.i.i
+}
+
+define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
+; CHECK: test_vmlal_high_n_s16:
+; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+entry:
+ %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
+ %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
+ %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
+ %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
+ ret <4 x i32> %add.i.i
+}
+
+define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
+; CHECK: test_vmlal_high_n_s32:
+; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+ %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
+ %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
+ ret <2 x i64> %add.i.i
+}
+
+define <4 x i32> @test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
+; CHECK: test_vmlal_high_n_u16:
+; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+entry:
+ %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
+ %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
+ %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
+ %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
+ ret <4 x i32> %add.i.i
+}
+
+define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
+; CHECK: test_vmlal_high_n_u32:
+; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+ %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
+ %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
+ ret <2 x i64> %add.i.i
+}
+
+define <4 x i32> @test_vqdmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
+; CHECK: test_vqdmlal_high_n_s16:
+; CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+entry:
+ %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
+ %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
+ %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
+ %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
+ %vqdmlal15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %vqdmlal17.i.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal15.i.i)
+ ret <4 x i32> %vqdmlal17.i.i
+}
+
+define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
+; CHECK: test_vqdmlal_high_n_s32:
+; CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+ %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
+ %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
+ %vqdmlal9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %vqdmlal11.i.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal9.i.i)
+ ret <2 x i64> %vqdmlal11.i.i
+}
+
+define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
+; CHECK: test_vmlsl_high_n_s16:
+; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+entry:
+ %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
+ %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
+ %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
+ %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
+ ret <4 x i32> %sub.i.i
+}
+
+define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
+; CHECK: test_vmlsl_high_n_s32:
+; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+ %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
+ %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
+ ret <2 x i64> %sub.i.i
+}
+
+define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
+; CHECK: test_vmlsl_high_n_u16:
+; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+entry:
+ %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
+ %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
+ %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
+ %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
+ ret <4 x i32> %sub.i.i
+}
+
+define <2 x i64> @test_vmlsl_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
+; CHECK: test_vmlsl_high_n_u32:
+; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+ %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
+ %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
+ ret <2 x i64> %sub.i.i
+}
+
+define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
+; CHECK: test_vqdmlsl_high_n_s16:
+; CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+entry:
+ %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
+ %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
+ %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
+ %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
+ %vqdmlsl15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+ %vqdmlsl17.i.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl15.i.i)
+ ret <4 x i32> %vqdmlsl17.i.i
+}
+
+define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
+; CHECK: test_vqdmlsl_high_n_s32:
+; CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+ %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
+ %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
+ %vqdmlsl9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+ %vqdmlsl11.i.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl9.i.i)
+ ret <2 x i64> %vqdmlsl11.i.i
+}
+
+define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) {
+; CHECK: test_vmul_n_f32:
+; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %vecinit.i = insertelement <2 x float> undef, float %b, i32 0
+ %vecinit1.i = insertelement <2 x float> %vecinit.i, float %b, i32 1
+ %mul.i = fmul <2 x float> %vecinit1.i, %a
+ ret <2 x float> %mul.i
+}
+
+define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) {
+; CHECK: test_vmulq_n_f32:
+; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %vecinit.i = insertelement <4 x float> undef, float %b, i32 0
+ %vecinit1.i = insertelement <4 x float> %vecinit.i, float %b, i32 1
+ %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %b, i32 2
+ %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %b, i32 3
+ %mul.i = fmul <4 x float> %vecinit3.i, %a
+ ret <4 x float> %mul.i
+}
+
+define <2 x double> @test_vmulq_n_f64(<2 x double> %a, double %b) {
+; CHECK: test_vmulq_n_f64:
+; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+entry:
+ %vecinit.i = insertelement <2 x double> undef, double %b, i32 0
+ %vecinit1.i = insertelement <2 x double> %vecinit.i, double %b, i32 1
+ %mul.i = fmul <2 x double> %vecinit1.i, %a
+ ret <2 x double> %mul.i
+}
+
+define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) {
+; CHECK: test_vfma_n_f32:
+; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+ %vecinit.i = insertelement <2 x float> undef, float %n, i32 0
+ %vecinit1.i = insertelement <2 x float> %vecinit.i, float %n, i32 1
+ %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> %vecinit1.i, <2 x float> %a)
+ ret <2 x float> %0
+}
+
+define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) {
+; CHECK: test_vfmaq_n_f32:
+; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+ %vecinit.i = insertelement <4 x float> undef, float %n, i32 0
+ %vecinit1.i = insertelement <4 x float> %vecinit.i, float %n, i32 1
+ %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %n, i32 2
+ %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %n, i32 3
+ %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %vecinit3.i, <4 x float> %a)
+ ret <4 x float> %0
+}
+
+define <2 x float> @test_vfms_n_f32(<2 x float> %a, <2 x float> %b, float %n) {
+; CHECK: test_vfms_n_f32:
+; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+ %vecinit.i = insertelement <2 x float> undef, float %n, i32 0
+ %vecinit1.i = insertelement <2 x float> %vecinit.i, float %n, i32 1
+ %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
+ %1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %0, <2 x float> %vecinit1.i, <2 x float> %a)
+ ret <2 x float> %1
+}
+
+define <4 x float> @test_vfmsq_n_f32(<4 x float> %a, <4 x float> %b, float %n) {
+; CHECK: test_vfmsq_n_f32:
+; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+ %vecinit.i = insertelement <4 x float> undef, float %n, i32 0
+ %vecinit1.i = insertelement <4 x float> %vecinit.i, float %n, i32 1
+ %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %n, i32 2
+ %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %n, i32 3
+ %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+ %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %vecinit3.i, <4 x float> %a)
+ ret <4 x float> %1
+}
diff --git a/test/CodeGen/AArch64/neon-2velem.ll b/test/CodeGen/AArch64/neon-2velem.ll
new file mode 100644
index 0000000..9d61842
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-2velem.ll
@@ -0,0 +1,2550 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+
+declare <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double>, <2 x double>)
+
+declare <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float>, <4 x float>)
+
+declare <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float>, <2 x float>)
+
+declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>)
+
+declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>)
+
+declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>)
+
+declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>)
+
+declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>)
+
+declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>)
+
+declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>)
+
+declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>)
+
+declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>)
+
+declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>)
+
+declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>)
+
+declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>)
+
+declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>)
+
+declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>)
+
+declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>)
+
+declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>)
+
+declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>)
+
+declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmla_lane_s16:
+; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %mul = mul <4 x i16> %shuffle, %b
+ %add = add <4 x i16> %mul, %a
+ ret <4 x i16> %add
+}
+
+define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlaq_lane_s16:
+; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %mul = mul <8 x i16> %shuffle, %b
+ %add = add <8 x i16> %mul, %a
+ ret <8 x i16> %add
+}
+
+define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmla_lane_s32:
+; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %mul = mul <2 x i32> %shuffle, %b
+ %add = add <2 x i32> %mul, %a
+ ret <2 x i32> %add
+}
+
+define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlaq_lane_s32:
+; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %mul = mul <4 x i32> %shuffle, %b
+ %add = add <4 x i32> %mul, %a
+ ret <4 x i32> %add
+}
+
+define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmla_laneq_s16:
+; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %mul = mul <4 x i16> %shuffle, %b
+ %add = add <4 x i16> %mul, %a
+ ret <4 x i16> %add
+}
+
+define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlaq_laneq_s16:
+; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ %mul = mul <8 x i16> %shuffle, %b
+ %add = add <8 x i16> %mul, %a
+ ret <8 x i16> %add
+}
+
+define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmla_laneq_s32:
+; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+ %mul = mul <2 x i32> %shuffle, %b
+ %add = add <2 x i32> %mul, %a
+ ret <2 x i32> %add
+}
+
+define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlaq_laneq_s32:
+; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %mul = mul <4 x i32> %shuffle, %b
+ %add = add <4 x i32> %mul, %a
+ ret <4 x i32> %add
+}
+
+define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmls_lane_s16:
+; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %mul = mul <4 x i16> %shuffle, %b
+ %sub = sub <4 x i16> %a, %mul
+ ret <4 x i16> %sub
+}
+
+define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlsq_lane_s16:
+; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %mul = mul <8 x i16> %shuffle, %b
+ %sub = sub <8 x i16> %a, %mul
+ ret <8 x i16> %sub
+}
+
+define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmls_lane_s32:
+; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %mul = mul <2 x i32> %shuffle, %b
+ %sub = sub <2 x i32> %a, %mul
+ ret <2 x i32> %sub
+}
+
+define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlsq_lane_s32:
+; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %mul = mul <4 x i32> %shuffle, %b
+ %sub = sub <4 x i32> %a, %mul
+ ret <4 x i32> %sub
+}
+
+define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmls_laneq_s16:
+; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %mul = mul <4 x i16> %shuffle, %b
+ %sub = sub <4 x i16> %a, %mul
+ ret <4 x i16> %sub
+}
+
+define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlsq_laneq_s16:
+; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ %mul = mul <8 x i16> %shuffle, %b
+ %sub = sub <8 x i16> %a, %mul
+ ret <8 x i16> %sub
+}
+
+define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmls_laneq_s32:
+; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+ %mul = mul <2 x i32> %shuffle, %b
+ %sub = sub <2 x i32> %a, %mul
+ ret <2 x i32> %sub
+}
+
+define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlsq_laneq_s32:
+; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %mul = mul <4 x i32> %shuffle, %b
+ %sub = sub <4 x i32> %a, %mul
+ ret <4 x i32> %sub
+}
+
+define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmul_lane_s16:
+; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %mul = mul <4 x i16> %shuffle, %a
+ ret <4 x i16> %mul
+}
+
+define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmulq_lane_s16:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %mul = mul <8 x i16> %shuffle, %a
+ ret <8 x i16> %mul
+}
+
+define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmul_lane_s32:
+; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %mul = mul <2 x i32> %shuffle, %a
+ ret <2 x i32> %mul
+}
+
+define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmulq_lane_s32:
+; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %mul = mul <4 x i32> %shuffle, %a
+ ret <4 x i32> %mul
+}
+
+define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmul_lane_u16:
+; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %mul = mul <4 x i16> %shuffle, %a
+ ret <4 x i16> %mul
+}
+
+define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmulq_lane_u16:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %mul = mul <8 x i16> %shuffle, %a
+ ret <8 x i16> %mul
+}
+
+define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmul_lane_u32:
+; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %mul = mul <2 x i32> %shuffle, %a
+ ret <2 x i32> %mul
+}
+
+define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmulq_lane_u32:
+; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %mul = mul <4 x i32> %shuffle, %a
+ ret <4 x i32> %mul
+}
+
+define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmul_laneq_s16:
+; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %mul = mul <4 x i16> %shuffle, %a
+ ret <4 x i16> %mul
+}
+
+define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmulq_laneq_s16:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ %mul = mul <8 x i16> %shuffle, %a
+ ret <8 x i16> %mul
+}
+
+define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmul_laneq_s32:
+; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+ %mul = mul <2 x i32> %shuffle, %a
+ ret <2 x i32> %mul
+}
+
+define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmulq_laneq_s32:
+; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %mul = mul <4 x i32> %shuffle, %a
+ ret <4 x i32> %mul
+}
+
+define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmul_laneq_u16:
+; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %mul = mul <4 x i16> %shuffle, %a
+ ret <4 x i16> %mul
+}
+
+define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmulq_laneq_u16:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ %mul = mul <8 x i16> %shuffle, %a
+ ret <8 x i16> %mul
+}
+
+define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmul_laneq_u32:
+; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+ %mul = mul <2 x i32> %shuffle, %a
+ ret <2 x i32> %mul
+}
+
+define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmulq_laneq_u32:
+; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %mul = mul <4 x i32> %shuffle, %a
+ ret <4 x i32> %mul
+}
+
+define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
+; CHECK: test_vfma_lane_f32:
+; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+ %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+ %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
+ ret <2 x float> %0
+}
+
+declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
+
+define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
+; CHECK: test_vfmaq_lane_f32:
+; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+ %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
+ ret <4 x float> %0
+}
+
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+
+define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
+; CHECK: test_vfma_laneq_f32:
+; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+ %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
+ %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
+ ret <2 x float> %0
+}
+
+define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
+; CHECK: test_vfmaq_laneq_f32:
+; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+ %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
+ ret <4 x float> %0
+}
+
+define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
+; CHECK: test_vfms_lane_f32:
+; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+ %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
+ %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+ %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
+ ret <2 x float> %0
+}
+
+define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
+; CHECK: test_vfmsq_lane_f32:
+; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+ %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
+ %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
+ ret <4 x float> %0
+}
+
+define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
+; CHECK: test_vfms_laneq_f32:
+; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+ %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
+ %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> <i32 3, i32 3>
+ %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
+ ret <2 x float> %0
+}
+
+define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
+; CHECK: test_vfmsq_laneq_f32:
+; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+ %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
+ %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
+ ret <4 x float> %0
+}
+
+define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
+; CHECK: test_vfmaq_lane_f64:
+; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+entry:
+ %lane = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
+ %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
+ ret <2 x double> %0
+}
+
+declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
+
+define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
+; CHECK: test_vfmaq_laneq_f64:
+; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+entry:
+ %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+ %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
+ ret <2 x double> %0
+}
+
+define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
+; CHECK: test_vfmsq_lane_f64:
+; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+entry:
+ %sub = fsub <1 x double> <double -0.000000e+00>, %v
+ %lane = shufflevector <1 x double> %sub, <1 x double> undef, <2 x i32> zeroinitializer
+ %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
+ ret <2 x double> %0
+}
+
+define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
+; CHECK: test_vfmsq_laneq_f64:
+; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+entry:
+ %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
+ %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+ %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
+ ret <2 x double> %0
+}
+
+define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlal_lane_s16:
+; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %add = add <4 x i32> %vmull2.i, %a
+ ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlal_lane_s32:
+; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %add = add <2 x i64> %vmull2.i, %a
+ ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlal_laneq_s16:
+; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %add = add <4 x i32> %vmull2.i, %a
+ ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlal_laneq_s32:
+; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %add = add <2 x i64> %vmull2.i, %a
+ ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlal_high_lane_s16:
+; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %add = add <4 x i32> %vmull2.i, %a
+ ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlal_high_lane_s32:
+; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %add = add <2 x i64> %vmull2.i, %a
+ ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlal_high_laneq_s16:
+; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %add = add <4 x i32> %vmull2.i, %a
+ ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlal_high_laneq_s32:
+; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %add = add <2 x i64> %vmull2.i, %a
+ ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlsl_lane_s16:
+; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %sub = sub <4 x i32> %a, %vmull2.i
+ ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlsl_lane_s32:
+; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %sub = sub <2 x i64> %a, %vmull2.i
+ ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlsl_laneq_s16:
+; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %sub = sub <4 x i32> %a, %vmull2.i
+ ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlsl_laneq_s32:
+; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %sub = sub <2 x i64> %a, %vmull2.i
+ ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlsl_high_lane_s16:
+; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %sub = sub <4 x i32> %a, %vmull2.i
+ ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlsl_high_lane_s32:
+; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %sub = sub <2 x i64> %a, %vmull2.i
+ ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlsl_high_laneq_s16:
+; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %sub = sub <4 x i32> %a, %vmull2.i
+ ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlsl_high_laneq_s32:
+; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %sub = sub <2 x i64> %a, %vmull2.i
+ ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlal_lane_u16:
+; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %add = add <4 x i32> %vmull2.i, %a
+ ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlal_lane_u32:
+; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %add = add <2 x i64> %vmull2.i, %a
+ ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlal_laneq_u16:
+; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %add = add <4 x i32> %vmull2.i, %a
+ ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlal_laneq_u32:
+; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %add = add <2 x i64> %vmull2.i, %a
+ ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlal_high_lane_u16:
+; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %add = add <4 x i32> %vmull2.i, %a
+ ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlal_high_lane_u32:
+; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %add = add <2 x i64> %vmull2.i, %a
+ ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlal_high_laneq_u16:
+; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %add = add <4 x i32> %vmull2.i, %a
+ ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlal_high_laneq_u32:
+; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %add = add <2 x i64> %vmull2.i, %a
+ ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlsl_lane_u16:
+; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %sub = sub <4 x i32> %a, %vmull2.i
+ ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlsl_lane_u32:
+; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %sub = sub <2 x i64> %a, %vmull2.i
+ ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlsl_laneq_u16:
+; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %sub = sub <4 x i32> %a, %vmull2.i
+ ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlsl_laneq_u32:
+; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %sub = sub <2 x i64> %a, %vmull2.i
+ ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlsl_high_lane_u16:
+; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %sub = sub <4 x i32> %a, %vmull2.i
+ ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlsl_high_lane_u32:
+; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %sub = sub <2 x i64> %a, %vmull2.i
+ ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlsl_high_laneq_u16:
+; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %sub = sub <4 x i32> %a, %vmull2.i
+ ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlsl_high_laneq_u32:
+; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %sub = sub <2 x i64> %a, %vmull2.i
+ ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmull_lane_s16:
+; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmull_lane_s32:
+; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmull_lane_u16:
+; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmull_lane_u32:
+; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmull_high_lane_s16:
+; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmull_high_lane_s32:
+; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmull_high_lane_u16:
+; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmull_high_lane_u32:
+; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmull_laneq_s16:
+; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmull_laneq_s32:
+; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmull_laneq_u16:
+; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmull_laneq_u32:
+; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmull_high_laneq_s16:
+; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmull_high_laneq_s32:
+; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmull_high_laneq_u16:
+; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmull_high_laneq_u32:
+; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vqdmlal_lane_s16:
+; CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+ ret <4 x i32> %vqdmlal4.i
+}
+
+define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vqdmlal_lane_s32:
+; CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+ ret <2 x i64> %vqdmlal4.i
+}
+
+define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vqdmlal_high_lane_s16:
+; CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+ ret <4 x i32> %vqdmlal4.i
+}
+
+define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vqdmlal_high_lane_s32:
+; CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+ ret <2 x i64> %vqdmlal4.i
+}
+
+define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vqdmlsl_lane_s16:
+; CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+ ret <4 x i32> %vqdmlsl4.i
+}
+
+define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vqdmlsl_lane_s32:
+; CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+ ret <2 x i64> %vqdmlsl4.i
+}
+
+define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vqdmlsl_high_lane_s16:
+; CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+ ret <4 x i32> %vqdmlsl4.i
+}
+
+define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vqdmlsl_high_lane_s32:
+; CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+ ret <2 x i64> %vqdmlsl4.i
+}
+
+define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqdmull_lane_s16:
+; CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ ret <4 x i32> %vqdmull2.i
+}
+
+define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqdmull_lane_s32:
+; CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ ret <2 x i64> %vqdmull2.i
+}
+
+define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vqdmull_laneq_s16:
+; CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ ret <4 x i32> %vqdmull2.i
+}
+
+define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vqdmull_laneq_s32:
+; CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+ %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ ret <2 x i64> %vqdmull2.i
+}
+
+define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqdmull_high_lane_s16:
+; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ ret <4 x i32> %vqdmull2.i
+}
+
+define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqdmull_high_lane_s32:
+; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ ret <2 x i64> %vqdmull2.i
+}
+
+define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vqdmull_high_laneq_s16:
+; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+ %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ ret <4 x i32> %vqdmull2.i
+}
+
+define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vqdmull_high_laneq_s32:
+; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
+ %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ ret <2 x i64> %vqdmull2.i
+}
+
+define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqdmulh_lane_s16:
+; CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vqdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
+ ret <4 x i16> %vqdmulh2.i
+}
+
+define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqdmulhq_lane_s16:
+; CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %vqdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
+ ret <8 x i16> %vqdmulh2.i
+}
+
+define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqdmulh_lane_s32:
+; CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vqdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
+ ret <2 x i32> %vqdmulh2.i
+}
+
+define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqdmulhq_lane_s32:
+; CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %vqdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
+ ret <4 x i32> %vqdmulh2.i
+}
+
+define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqrdmulh_lane_s16:
+; CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vqrdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
+ ret <4 x i16> %vqrdmulh2.i
+}
+
+define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqrdmulhq_lane_s16:
+; CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %vqrdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
+ ret <8 x i16> %vqrdmulh2.i
+}
+
+define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqrdmulh_lane_s32:
+; CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %vqrdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
+ ret <2 x i32> %vqrdmulh2.i
+}
+
+define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqrdmulhq_lane_s32:
+; CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %vqrdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
+ ret <4 x i32> %vqrdmulh2.i
+}
+
+define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) {
+; CHECK: test_vmul_lane_f32:
+; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+ %mul = fmul <2 x float> %shuffle, %a
+ ret <2 x float> %mul
+}
+
+define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) {
+; CHECK: test_vmul_lane_f64:
+; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+entry:
+ %0 = bitcast <1 x double> %a to <8 x i8>
+ %1 = bitcast <8 x i8> %0 to double
+ %extract = extractelement <1 x double> %v, i32 0
+ %2 = fmul double %1, %extract
+ %3 = insertelement <1 x double> undef, double %2, i32 0
+ ret <1 x double> %3
+}
+
+define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) {
+; CHECK: test_vmulq_lane_f32:
+; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %mul = fmul <4 x float> %shuffle, %a
+ ret <4 x float> %mul
+}
+
+define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) {
+; CHECK: test_vmulq_lane_f64:
+; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+entry:
+ %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
+ %mul = fmul <2 x double> %shuffle, %a
+ ret <2 x double> %mul
+}
+
+define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) {
+; CHECK: test_vmul_laneq_f32:
+; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
+ %mul = fmul <2 x float> %shuffle, %a
+ ret <2 x float> %mul
+}
+
+define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) {
+; CHECK: test_vmul_laneq_f64:
+; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+entry:
+ %0 = bitcast <1 x double> %a to <8 x i8>
+ %1 = bitcast <8 x i8> %0 to double
+ %extract = extractelement <2 x double> %v, i32 1
+ %2 = fmul double %1, %extract
+ %3 = insertelement <1 x double> undef, double %2, i32 0
+ ret <1 x double> %3
+}
+
+define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) {
+; CHECK: test_vmulq_laneq_f32:
+; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %mul = fmul <4 x float> %shuffle, %a
+ ret <4 x float> %mul
+}
+
+define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) {
+; CHECK: test_vmulq_laneq_f64:
+; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+entry:
+ %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+ %mul = fmul <2 x double> %shuffle, %a
+ ret <2 x double> %mul
+}
+
+define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) {
+; CHECK: test_vmulx_lane_f32:
+; CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+ %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
+ ret <2 x float> %vmulx2.i
+}
+
+define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) {
+; CHECK: test_vmulxq_lane_f32:
+; CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+entry:
+ %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
+ ret <4 x float> %vmulx2.i
+}
+
+define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) {
+; CHECK: test_vmulxq_lane_f64:
+; CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+entry:
+ %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
+ %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
+ ret <2 x double> %vmulx2.i
+}
+
+define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) {
+; CHECK: test_vmulx_laneq_f32:
+; CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
+ %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
+ ret <2 x float> %vmulx2.i
+}
+
+define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) {
+; CHECK: test_vmulxq_laneq_f32:
+; CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+entry:
+ %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
+ ret <4 x float> %vmulx2.i
+}
+
+define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) {
+; CHECK: test_vmulxq_laneq_f64:
+; CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+entry:
+ %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+ %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
+ ret <2 x double> %vmulx2.i
+}
+
+define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmla_lane_s16_0:
+; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %mul = mul <4 x i16> %shuffle, %b
+ %add = add <4 x i16> %mul, %a
+ ret <4 x i16> %add
+}
+
+define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlaq_lane_s16_0:
+; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
+ %mul = mul <8 x i16> %shuffle, %b
+ %add = add <8 x i16> %mul, %a
+ ret <8 x i16> %add
+}
+
+define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmla_lane_s32_0:
+; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %mul = mul <2 x i32> %shuffle, %b
+ %add = add <2 x i32> %mul, %a
+ ret <2 x i32> %add
+}
+
+define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlaq_lane_s32_0:
+; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
+ %mul = mul <4 x i32> %shuffle, %b
+ %add = add <4 x i32> %mul, %a
+ ret <4 x i32> %add
+}
+
+define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmla_laneq_s16_0:
+; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+ %mul = mul <4 x i16> %shuffle, %b
+ %add = add <4 x i16> %mul, %a
+ ret <4 x i16> %add
+}
+
+define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlaq_laneq_s16_0:
+; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
+ %mul = mul <8 x i16> %shuffle, %b
+ %add = add <8 x i16> %mul, %a
+ ret <8 x i16> %add
+}
+
+define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmla_laneq_s32_0:
+; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+ %mul = mul <2 x i32> %shuffle, %b
+ %add = add <2 x i32> %mul, %a
+ ret <2 x i32> %add
+}
+
+define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlaq_laneq_s32_0:
+; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+ %mul = mul <4 x i32> %shuffle, %b
+ %add = add <4 x i32> %mul, %a
+ ret <4 x i32> %add
+}
+
+define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmls_lane_s16_0:
+; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %mul = mul <4 x i16> %shuffle, %b
+ %sub = sub <4 x i16> %a, %mul
+ ret <4 x i16> %sub
+}
+
+define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlsq_lane_s16_0:
+; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
+ %mul = mul <8 x i16> %shuffle, %b
+ %sub = sub <8 x i16> %a, %mul
+ ret <8 x i16> %sub
+}
+
+define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmls_lane_s32_0:
+; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %mul = mul <2 x i32> %shuffle, %b
+ %sub = sub <2 x i32> %a, %mul
+ ret <2 x i32> %sub
+}
+
+define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlsq_lane_s32_0:
+; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
+ %mul = mul <4 x i32> %shuffle, %b
+ %sub = sub <4 x i32> %a, %mul
+ ret <4 x i32> %sub
+}
+
+define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmls_laneq_s16_0:
+; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+ %mul = mul <4 x i16> %shuffle, %b
+ %sub = sub <4 x i16> %a, %mul
+ ret <4 x i16> %sub
+}
+
+define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlsq_laneq_s16_0:
+; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
+ %mul = mul <8 x i16> %shuffle, %b
+ %sub = sub <8 x i16> %a, %mul
+ ret <8 x i16> %sub
+}
+
+define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmls_laneq_s32_0:
+; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+ %mul = mul <2 x i32> %shuffle, %b
+ %sub = sub <2 x i32> %a, %mul
+ ret <2 x i32> %sub
+}
+
+define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlsq_laneq_s32_0:
+; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+ %mul = mul <4 x i32> %shuffle, %b
+ %sub = sub <4 x i32> %a, %mul
+ ret <4 x i32> %sub
+}
+
+define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmul_lane_s16_0:
+; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %mul = mul <4 x i16> %shuffle, %a
+ ret <4 x i16> %mul
+}
+
+define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmulq_lane_s16_0:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
+ %mul = mul <8 x i16> %shuffle, %a
+ ret <8 x i16> %mul
+}
+
+define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmul_lane_s32_0:
+; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %mul = mul <2 x i32> %shuffle, %a
+ ret <2 x i32> %mul
+}
+
+define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmulq_lane_s32_0:
+; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
+ %mul = mul <4 x i32> %shuffle, %a
+ ret <4 x i32> %mul
+}
+
+define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmul_lane_u16_0:
+; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %mul = mul <4 x i16> %shuffle, %a
+ ret <4 x i16> %mul
+}
+
+define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmulq_lane_u16_0:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
+ %mul = mul <8 x i16> %shuffle, %a
+ ret <8 x i16> %mul
+}
+
+define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmul_lane_u32_0:
+; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %mul = mul <2 x i32> %shuffle, %a
+ ret <2 x i32> %mul
+}
+
+define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmulq_lane_u32_0:
+; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
+ %mul = mul <4 x i32> %shuffle, %a
+ ret <4 x i32> %mul
+}
+
+define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmul_laneq_s16_0:
+; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+ %mul = mul <4 x i16> %shuffle, %a
+ ret <4 x i16> %mul
+}
+
+define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmulq_laneq_s16_0:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
+ %mul = mul <8 x i16> %shuffle, %a
+ ret <8 x i16> %mul
+}
+
+define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmul_laneq_s32_0:
+; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+ %mul = mul <2 x i32> %shuffle, %a
+ ret <2 x i32> %mul
+}
+
+define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmulq_laneq_s32_0:
+; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+ %mul = mul <4 x i32> %shuffle, %a
+ ret <4 x i32> %mul
+}
+
+define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmul_laneq_u16_0:
+; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+ %mul = mul <4 x i16> %shuffle, %a
+ ret <4 x i16> %mul
+}
+
+define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmulq_laneq_u16_0:
+; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
+ %mul = mul <8 x i16> %shuffle, %a
+ ret <8 x i16> %mul
+}
+
+define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmul_laneq_u32_0:
+; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+ %mul = mul <2 x i32> %shuffle, %a
+ ret <2 x i32> %mul
+}
+
+define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmulq_laneq_u32_0:
+; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+ %mul = mul <4 x i32> %shuffle, %a
+ ret <4 x i32> %mul
+}
+
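+; Lane-0 variants of the floating-point fused multiply-add/subtract by-element tests (fmla/fmls).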
+define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
+; CHECK: test_vfma_lane_f32_0:
+; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
+ %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
+ ret <2 x float> %0
+}
+
+define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
+; CHECK: test_vfmaq_lane_f32_0:
+; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
+ %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
+ ret <4 x float> %0
+}
+
+define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
+; CHECK: test_vfma_laneq_f32_0:
+; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
+ %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
+ ret <2 x float> %0
+}
+
+define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
+; CHECK: test_vfmaq_laneq_f32_0:
+; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
+ %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
+ ret <4 x float> %0
+}
+
+define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
+; CHECK: test_vfms_lane_f32_0:
+; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
+ %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> zeroinitializer
+ %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
+ ret <2 x float> %0
+}
+
+define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
+; CHECK: test_vfmsq_lane_f32_0:
+; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
+ %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> zeroinitializer
+ %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
+ ret <4 x float> %0
+}
+
+define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
+; CHECK: test_vfms_laneq_f32_0:
+; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
+ %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> zeroinitializer
+ %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
+ ret <2 x float> %0
+}
+
+define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
+; CHECK: test_vfmsq_laneq_f32_0:
+; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
+ %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> zeroinitializer
+ %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
+ ret <4 x float> %0
+}
+
+define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
+; CHECK: test_vfmaq_laneq_f64_0:
+; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+entry:
+ %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
+ %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
+ ret <2 x double> %0
+}
+
+define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
+; CHECK: test_vfmsq_laneq_f64_0:
+; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+entry:
+ %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
+ %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> zeroinitializer
+ %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
+ ret <2 x double> %0
+}
+
+define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlal_lane_s16_0:
+; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %add = add <4 x i32> %vmull2.i, %a
+ ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlal_lane_s32_0:
+; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %add = add <2 x i64> %vmull2.i, %a
+ ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlal_laneq_s16_0:
+; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %add = add <4 x i32> %vmull2.i, %a
+ ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlal_laneq_s32_0:
+; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %add = add <2 x i64> %vmull2.i, %a
+ ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlal_high_lane_s16_0:
+; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %add = add <4 x i32> %vmull2.i, %a
+ ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlal_high_lane_s32_0:
+; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %add = add <2 x i64> %vmull2.i, %a
+ ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlal_high_laneq_s16_0:
+; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %add = add <4 x i32> %vmull2.i, %a
+ ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlal_high_laneq_s32_0:
+; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %add = add <2 x i64> %vmull2.i, %a
+ ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlsl_lane_s16_0:
+; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %sub = sub <4 x i32> %a, %vmull2.i
+ ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlsl_lane_s32_0:
+; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %sub = sub <2 x i64> %a, %vmull2.i
+ ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlsl_laneq_s16_0:
+; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %sub = sub <4 x i32> %a, %vmull2.i
+ ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlsl_laneq_s32_0:
+; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %sub = sub <2 x i64> %a, %vmull2.i
+ ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlsl_high_lane_s16_0:
+; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %sub = sub <4 x i32> %a, %vmull2.i
+ ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlsl_high_lane_s32_0:
+; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %sub = sub <2 x i64> %a, %vmull2.i
+ ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlsl_high_laneq_s16_0:
+; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %sub = sub <4 x i32> %a, %vmull2.i
+ ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlsl_high_laneq_s32_0:
+; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %sub = sub <2 x i64> %a, %vmull2.i
+ ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlal_lane_u16_0:
+; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %add = add <4 x i32> %vmull2.i, %a
+ ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlal_lane_u32_0:
+; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %add = add <2 x i64> %vmull2.i, %a
+ ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlal_laneq_u16_0:
+; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %add = add <4 x i32> %vmull2.i, %a
+ ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlal_laneq_u32_0:
+; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %add = add <2 x i64> %vmull2.i, %a
+ ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlal_high_lane_u16_0:
+; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %add = add <4 x i32> %vmull2.i, %a
+ ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlal_high_lane_u32_0:
+; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %add = add <2 x i64> %vmull2.i, %a
+ ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlal_high_laneq_u16_0:
+; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %add = add <4 x i32> %vmull2.i, %a
+ ret <4 x i32> %add
+}
+
+define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlal_high_laneq_u32_0:
+; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %add = add <2 x i64> %vmull2.i, %a
+ ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlsl_lane_u16_0:
+; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %sub = sub <4 x i32> %a, %vmull2.i
+ ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlsl_lane_u32_0:
+; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %sub = sub <2 x i64> %a, %vmull2.i
+ ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlsl_laneq_u16_0:
+; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %sub = sub <4 x i32> %a, %vmull2.i
+ ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlsl_laneq_u32_0:
+; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %sub = sub <2 x i64> %a, %vmull2.i
+ ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vmlsl_high_lane_u16_0:
+; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %sub = sub <4 x i32> %a, %vmull2.i
+ ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vmlsl_high_lane_u32_0:
+; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %sub = sub <2 x i64> %a, %vmull2.i
+ ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
+; CHECK: test_vmlsl_high_laneq_u16_0:
+; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %sub = sub <4 x i32> %a, %vmull2.i
+ ret <4 x i32> %sub
+}
+
+define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
+; CHECK: test_vmlsl_high_laneq_u32_0:
+; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %sub = sub <2 x i64> %a, %vmull2.i
+ ret <2 x i64> %sub
+}
+
+define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmull_lane_s16_0:
+; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmull_lane_s32_0:
+; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmull_lane_u16_0:
+; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmull_lane_u32_0:
+; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmull_high_lane_s16_0:
+; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmull_high_lane_s32_0:
+; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vmull_high_lane_u16_0:
+; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vmull_high_lane_u32_0:
+; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmull_laneq_s16_0:
+; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmull_laneq_s32_0:
+; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmull_laneq_u16_0:
+; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmull_laneq_u32_0:
+; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmull_high_laneq_s16_0:
+; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmull_high_laneq_s32_0:
+; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vmull_high_laneq_u16_0:
+; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vmull_high_laneq_u32_0:
+; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ ret <2 x i64> %vmull2.i
+}
+
+define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vqdmlal_lane_s16_0:
+; CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+ ret <4 x i32> %vqdmlal4.i
+}
+
+define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vqdmlal_lane_s32_0:
+; CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+ ret <2 x i64> %vqdmlal4.i
+}
+
+define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vqdmlal_high_lane_s16_0:
+; CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+ ret <4 x i32> %vqdmlal4.i
+}
+
+define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vqdmlal_high_lane_s32_0:
+; CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+ ret <2 x i64> %vqdmlal4.i
+}
+
+define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vqdmlsl_lane_s16_0:
+; CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+ %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+ ret <4 x i32> %vqdmlsl4.i
+}
+
+define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vqdmlsl_lane_s32_0:
+; CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+ %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+ ret <2 x i64> %vqdmlsl4.i
+}
+
+define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
+; CHECK: test_vqdmlsl_high_lane_s16_0:
+; CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+ ret <4 x i32> %vqdmlsl4.i
+}
+
+define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
+; CHECK: test_vqdmlsl_high_lane_s32_0:
+; CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+ ret <2 x i64> %vqdmlsl4.i
+}
+
+define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqdmull_lane_s16_0:
+; CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ ret <4 x i32> %vqdmull2.i
+}
+
+define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqdmull_lane_s32_0:
+; CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ ret <2 x i64> %vqdmull2.i
+}
+
+define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vqdmull_laneq_s16_0:
+; CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+ %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+ ret <4 x i32> %vqdmull2.i
+}
+
+define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vqdmull_laneq_s32_0:
+; CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+ %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+ ret <2 x i64> %vqdmull2.i
+}
+
+define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqdmull_high_lane_s16_0:
+; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ ret <4 x i32> %vqdmull2.i
+}
+
+define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqdmull_high_lane_s32_0:
+; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ ret <2 x i64> %vqdmull2.i
+}
+
+define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
+; CHECK: test_vqdmull_high_laneq_s16_0:
+; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
+ %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+ ret <4 x i32> %vqdmull2.i
+}
+
+define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
+; CHECK: test_vqdmull_high_laneq_s32_0:
+; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
+ %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+ ret <2 x i64> %vqdmull2.i
+}
+
+define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqdmulh_lane_s16_0:
+; CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %vqdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
+ ret <4 x i16> %vqdmulh2.i
+}
+
+define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqdmulhq_lane_s16_0:
+; CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
+ %vqdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
+ ret <8 x i16> %vqdmulh2.i
+}
+
+define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqdmulh_lane_s32_0:
+; CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %vqdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
+ ret <2 x i32> %vqdmulh2.i
+}
+
+define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqdmulhq_lane_s32_0:
+; CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
+ %vqdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
+ ret <4 x i32> %vqdmulh2.i
+}
+
+define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqrdmulh_lane_s16_0:
+; CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+ %vqrdmulh2.i = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
+ ret <4 x i16> %vqrdmulh2.i
+}
+
+define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
+; CHECK: test_vqrdmulhq_lane_s16_0:
+; CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
+ %vqrdmulh2.i = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
+ ret <8 x i16> %vqrdmulh2.i
+}
+
+define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqrdmulh_lane_s32_0:
+; CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %vqrdmulh2.i = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
+ ret <2 x i32> %vqrdmulh2.i
+}
+
+define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
+; CHECK: test_vqrdmulhq_lane_s32_0:
+; CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
+ %vqrdmulh2.i = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
+ ret <4 x i32> %vqrdmulh2.i
+}
+
+define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) {
+; CHECK: test_vmul_lane_f32_0:
+; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
+ %mul = fmul <2 x float> %shuffle, %a
+ ret <2 x float> %mul
+}
+
+define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
+; CHECK: test_vmulq_lane_f32_0:
+; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
+ %mul = fmul <4 x float> %shuffle, %a
+ ret <4 x float> %mul
+}
+
+define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
+; CHECK: test_vmul_laneq_f32_0:
+; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
+ %mul = fmul <2 x float> %shuffle, %a
+ ret <2 x float> %mul
+}
+
+define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) {
+; CHECK: test_vmul_laneq_f64_0:
+; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
+entry:
+ %0 = bitcast <1 x double> %a to <8 x i8>
+ %1 = bitcast <8 x i8> %0 to double
+ %extract = extractelement <2 x double> %v, i32 0
+ %2 = fmul double %1, %extract
+ %3 = insertelement <1 x double> undef, double %2, i32 0
+ ret <1 x double> %3
+}
+
+define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
+; CHECK: test_vmulq_laneq_f32_0:
+; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
+ %mul = fmul <4 x float> %shuffle, %a
+ ret <4 x float> %mul
+}
+
+define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
+; CHECK: test_vmulq_laneq_f64_0:
+; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+entry:
+ %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
+ %mul = fmul <2 x double> %shuffle, %a
+ ret <2 x double> %mul
+}
+
+define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) {
+; CHECK: test_vmulx_lane_f32_0:
+; CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
+ %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
+ ret <2 x float> %vmulx2.i
+}
+
+define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) {
+; CHECK: test_vmulxq_lane_f32_0:
+; CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
+ %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
+ ret <4 x float> %vmulx2.i
+}
+
+define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) {
+; CHECK: test_vmulxq_lane_f64_0:
+; CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+entry:
+ %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
+ %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
+ ret <2 x double> %vmulx2.i
+}
+
+define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) {
+; CHECK: test_vmulx_laneq_f32_0:
+; CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
+ %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
+ ret <2 x float> %vmulx2.i
+}
+
+define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {
+; CHECK: test_vmulxq_laneq_f32_0:
+; CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+ %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
+ %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
+ ret <4 x float> %vmulx2.i
+}
+
+define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
+; CHECK: test_vmulxq_laneq_f64_0:
+; CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+entry:
+ %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
+ %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
+ ret <2 x double> %vmulx2.i
+}
+
diff --git a/test/CodeGen/AArch64/neon-3vdiff.ll b/test/CodeGen/AArch64/neon-3vdiff.ll
new file mode 100644
index 0000000..171e2b2
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-3vdiff.ll
@@ -0,0 +1,1806 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
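+; Tests lowering of the NEON three-registers-of-different-size ("3VDiff") operations:
+; widening add/subtract (saddl/uaddl, saddw/uaddw, ssubl, and their "2" high-half forms)
+; plus the widening multiply, absolute-difference and rounding-narrow intrinsics declared below.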
+
+declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>)
+
+declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>)
+
+declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>)
+
+declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>)
+
+declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>)
+
+declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>)
+
+declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>)
+
+declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>)
+
+declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>)
+
+declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>)
+
+declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>)
+
+declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>)
+
+declare <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8>, <8 x i8>)
+
+declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>)
+
+declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>)
+
+declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>)
+
+declare <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32>, <2 x i32>)
+
+declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>)
+
+declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>)
+
+declare <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64>, <2 x i64>)
+
+declare <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32>, <4 x i32>)
+
+declare <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16>, <8 x i16>)
+
+declare <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64>, <2 x i64>)
+
+declare <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32>, <4 x i32>)
+
+declare <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vaddl_s8:
+; CHECK: saddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vmovl.i.i = sext <8 x i8> %a to <8 x i16>
+ %vmovl.i2.i = sext <8 x i8> %b to <8 x i16>
+ %add.i = add <8 x i16> %vmovl.i.i, %vmovl.i2.i
+ ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vaddl_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vaddl_s16:
+; CHECK: saddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vmovl.i.i = sext <4 x i16> %a to <4 x i32>
+ %vmovl.i2.i = sext <4 x i16> %b to <4 x i32>
+ %add.i = add <4 x i32> %vmovl.i.i, %vmovl.i2.i
+ ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vaddl_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vaddl_s32:
+; CHECK: saddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+ %vmovl.i.i = sext <2 x i32> %a to <2 x i64>
+ %vmovl.i2.i = sext <2 x i32> %b to <2 x i64>
+ %add.i = add <2 x i64> %vmovl.i.i, %vmovl.i2.i
+ ret <2 x i64> %add.i
+}
+
+define <8 x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vaddl_u8:
+; CHECK: uaddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vmovl.i.i = zext <8 x i8> %a to <8 x i16>
+ %vmovl.i2.i = zext <8 x i8> %b to <8 x i16>
+ %add.i = add <8 x i16> %vmovl.i.i, %vmovl.i2.i
+ ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vaddl_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vaddl_u16:
+; CHECK: uaddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vmovl.i.i = zext <4 x i16> %a to <4 x i32>
+ %vmovl.i2.i = zext <4 x i16> %b to <4 x i32>
+ %add.i = add <4 x i32> %vmovl.i.i, %vmovl.i2.i
+ ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vaddl_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vaddl_u32:
+; CHECK: uaddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+ %vmovl.i.i = zext <2 x i32> %a to <2 x i64>
+ %vmovl.i2.i = zext <2 x i32> %b to <2 x i64>
+ %add.i = add <2 x i64> %vmovl.i.i, %vmovl.i2.i
+ ret <2 x i64> %add.i
+}
+
+define <8 x i16> @test_vaddl_high_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vaddl_high_s8:
+; CHECK: saddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16>
+ %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %1 = sext <8 x i8> %shuffle.i.i2.i to <8 x i16>
+ %add.i = add <8 x i16> %0, %1
+ ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vaddl_high_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vaddl_high_s16:
+; CHECK: saddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32>
+ %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1 = sext <4 x i16> %shuffle.i.i2.i to <4 x i32>
+ %add.i = add <4 x i32> %0, %1
+ ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vaddl_high_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vaddl_high_s32:
+; CHECK: saddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64>
+ %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %1 = sext <2 x i32> %shuffle.i.i2.i to <2 x i64>
+ %add.i = add <2 x i64> %0, %1
+ ret <2 x i64> %add.i
+}
+
+define <8 x i16> @test_vaddl_high_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vaddl_high_u8:
+; CHECK: uaddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
+ %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %1 = zext <8 x i8> %shuffle.i.i2.i to <8 x i16>
+ %add.i = add <8 x i16> %0, %1
+ ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vaddl_high_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vaddl_high_u16:
+; CHECK: uaddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
+ %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1 = zext <4 x i16> %shuffle.i.i2.i to <4 x i32>
+ %add.i = add <4 x i32> %0, %1
+ ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vaddl_high_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vaddl_high_u32:
+; CHECK: uaddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
+ %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %1 = zext <2 x i32> %shuffle.i.i2.i to <2 x i64>
+ %add.i = add <2 x i64> %0, %1
+ ret <2 x i64> %add.i
+}
+
+define <8 x i16> @test_vaddw_s8(<8 x i16> %a, <8 x i8> %b) {
+; CHECK: test_vaddw_s8:
+; CHECK: saddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
+entry:
+ %vmovl.i.i = sext <8 x i8> %b to <8 x i16>
+ %add.i = add <8 x i16> %vmovl.i.i, %a
+ ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vaddw_s16(<4 x i32> %a, <4 x i16> %b) {
+; CHECK: test_vaddw_s16:
+; CHECK: saddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
+entry:
+ %vmovl.i.i = sext <4 x i16> %b to <4 x i32>
+ %add.i = add <4 x i32> %vmovl.i.i, %a
+ ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vaddw_s32(<2 x i64> %a, <2 x i32> %b) {
+; CHECK: test_vaddw_s32:
+; CHECK: saddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
+entry:
+ %vmovl.i.i = sext <2 x i32> %b to <2 x i64>
+ %add.i = add <2 x i64> %vmovl.i.i, %a
+ ret <2 x i64> %add.i
+}
+
+define <8 x i16> @test_vaddw_u8(<8 x i16> %a, <8 x i8> %b) {
+; CHECK: test_vaddw_u8:
+; CHECK: uaddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
+entry:
+ %vmovl.i.i = zext <8 x i8> %b to <8 x i16>
+ %add.i = add <8 x i16> %vmovl.i.i, %a
+ ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vaddw_u16(<4 x i32> %a, <4 x i16> %b) {
+; CHECK: test_vaddw_u16:
+; CHECK: uaddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
+entry:
+ %vmovl.i.i = zext <4 x i16> %b to <4 x i32>
+ %add.i = add <4 x i32> %vmovl.i.i, %a
+ ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vaddw_u32(<2 x i64> %a, <2 x i32> %b) {
+; CHECK: test_vaddw_u32:
+; CHECK: uaddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
+entry:
+ %vmovl.i.i = zext <2 x i32> %b to <2 x i64>
+ %add.i = add <2 x i64> %vmovl.i.i, %a
+ ret <2 x i64> %add.i
+}
+
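+; The vaddw_high tests select the upper lanes of the 128-bit second operand
+; with a shufflevector before extending, which should pick the saddw2/uaddw2
+; forms checked below.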
+define <8 x i16> @test_vaddw_high_s8(<8 x i16> %a, <16 x i8> %b) {
+; CHECK: test_vaddw_high_s8:
+; CHECK: saddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16>
+ %add.i = add <8 x i16> %0, %a
+ ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vaddw_high_s16(<4 x i32> %a, <8 x i16> %b) {
+; CHECK: test_vaddw_high_s16:
+; CHECK: saddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32>
+ %add.i = add <4 x i32> %0, %a
+ ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vaddw_high_s32(<2 x i64> %a, <4 x i32> %b) {
+; CHECK: test_vaddw_high_s32:
+; CHECK: saddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64>
+ %add.i = add <2 x i64> %0, %a
+ ret <2 x i64> %add.i
+}
+
+define <8 x i16> @test_vaddw_high_u8(<8 x i16> %a, <16 x i8> %b) {
+; CHECK: test_vaddw_high_u8:
+; CHECK: uaddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
+ %add.i = add <8 x i16> %0, %a
+ ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vaddw_high_u16(<4 x i32> %a, <8 x i16> %b) {
+; CHECK: test_vaddw_high_u16:
+; CHECK: uaddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
+ %add.i = add <4 x i32> %0, %a
+ ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vaddw_high_u32(<2 x i64> %a, <4 x i32> %b) {
+; CHECK: test_vaddw_high_u32:
+; CHECK: uaddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
+ %add.i = add <2 x i64> %0, %a
+ ret <2 x i64> %add.i
+}
+
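+; The vsubl tests mirror vaddl: both narrow operands are extended to twice the
+; element width before the subtract (ssubl/usubl).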
+define <8 x i16> @test_vsubl_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vsubl_s8:
+; CHECK: ssubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vmovl.i.i = sext <8 x i8> %a to <8 x i16>
+ %vmovl.i2.i = sext <8 x i8> %b to <8 x i16>
+ %sub.i = sub <8 x i16> %vmovl.i.i, %vmovl.i2.i
+ ret <8 x i16> %sub.i
+}
+
+define <4 x i32> @test_vsubl_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vsubl_s16:
+; CHECK: ssubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vmovl.i.i = sext <4 x i16> %a to <4 x i32>
+ %vmovl.i2.i = sext <4 x i16> %b to <4 x i32>
+ %sub.i = sub <4 x i32> %vmovl.i.i, %vmovl.i2.i
+ ret <4 x i32> %sub.i
+}
+
+define <2 x i64> @test_vsubl_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vsubl_s32:
+; CHECK: ssubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+ %vmovl.i.i = sext <2 x i32> %a to <2 x i64>
+ %vmovl.i2.i = sext <2 x i32> %b to <2 x i64>
+ %sub.i = sub <2 x i64> %vmovl.i.i, %vmovl.i2.i
+ ret <2 x i64> %sub.i
+}
+
+define <8 x i16> @test_vsubl_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vsubl_u8:
+; CHECK: usubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vmovl.i.i = zext <8 x i8> %a to <8 x i16>
+ %vmovl.i2.i = zext <8 x i8> %b to <8 x i16>
+ %sub.i = sub <8 x i16> %vmovl.i.i, %vmovl.i2.i
+ ret <8 x i16> %sub.i
+}
+
+define <4 x i32> @test_vsubl_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vsubl_u16:
+; CHECK: usubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vmovl.i.i = zext <4 x i16> %a to <4 x i32>
+ %vmovl.i2.i = zext <4 x i16> %b to <4 x i32>
+ %sub.i = sub <4 x i32> %vmovl.i.i, %vmovl.i2.i
+ ret <4 x i32> %sub.i
+}
+
+define <2 x i64> @test_vsubl_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vsubl_u32:
+; CHECK: usubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+ %vmovl.i.i = zext <2 x i32> %a to <2 x i64>
+ %vmovl.i2.i = zext <2 x i32> %b to <2 x i64>
+ %sub.i = sub <2 x i64> %vmovl.i.i, %vmovl.i2.i
+ ret <2 x i64> %sub.i
+}
+
+define <8 x i16> @test_vsubl_high_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vsubl_high_s8:
+; CHECK: ssubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16>
+ %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %1 = sext <8 x i8> %shuffle.i.i2.i to <8 x i16>
+ %sub.i = sub <8 x i16> %0, %1
+ ret <8 x i16> %sub.i
+}
+
+define <4 x i32> @test_vsubl_high_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsubl_high_s16:
+; CHECK: ssubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32>
+ %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1 = sext <4 x i16> %shuffle.i.i2.i to <4 x i32>
+ %sub.i = sub <4 x i32> %0, %1
+ ret <4 x i32> %sub.i
+}
+
+define <2 x i64> @test_vsubl_high_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsubl_high_s32:
+; CHECK: ssubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64>
+ %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %1 = sext <2 x i32> %shuffle.i.i2.i to <2 x i64>
+ %sub.i = sub <2 x i64> %0, %1
+ ret <2 x i64> %sub.i
+}
+
+define <8 x i16> @test_vsubl_high_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vsubl_high_u8:
+; CHECK: usubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
+ %shuffle.i.i2.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %1 = zext <8 x i8> %shuffle.i.i2.i to <8 x i16>
+ %sub.i = sub <8 x i16> %0, %1
+ ret <8 x i16> %sub.i
+}
+
+define <4 x i32> @test_vsubl_high_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsubl_high_u16:
+; CHECK: usubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
+ %shuffle.i.i2.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1 = zext <4 x i16> %shuffle.i.i2.i to <4 x i32>
+ %sub.i = sub <4 x i32> %0, %1
+ ret <4 x i32> %sub.i
+}
+
+define <2 x i64> @test_vsubl_high_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsubl_high_u32:
+; CHECK: usubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
+ %shuffle.i.i2.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %1 = zext <2 x i32> %shuffle.i.i2.i to <2 x i64>
+ %sub.i = sub <2 x i64> %0, %1
+ ret <2 x i64> %sub.i
+}
+
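+; The vsubw tests use the wide subtract forms (ssubw/usubw), extending only the
+; second operand.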
+define <8 x i16> @test_vsubw_s8(<8 x i16> %a, <8 x i8> %b) {
+; CHECK: test_vsubw_s8:
+; CHECK: ssubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
+entry:
+ %vmovl.i.i = sext <8 x i8> %b to <8 x i16>
+ %sub.i = sub <8 x i16> %a, %vmovl.i.i
+ ret <8 x i16> %sub.i
+}
+
+define <4 x i32> @test_vsubw_s16(<4 x i32> %a, <4 x i16> %b) {
+; CHECK: test_vsubw_s16:
+; CHECK: ssubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
+entry:
+ %vmovl.i.i = sext <4 x i16> %b to <4 x i32>
+ %sub.i = sub <4 x i32> %a, %vmovl.i.i
+ ret <4 x i32> %sub.i
+}
+
+define <2 x i64> @test_vsubw_s32(<2 x i64> %a, <2 x i32> %b) {
+; CHECK: test_vsubw_s32:
+; CHECK: ssubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
+entry:
+ %vmovl.i.i = sext <2 x i32> %b to <2 x i64>
+ %sub.i = sub <2 x i64> %a, %vmovl.i.i
+ ret <2 x i64> %sub.i
+}
+
+define <8 x i16> @test_vsubw_u8(<8 x i16> %a, <8 x i8> %b) {
+; CHECK: test_vsubw_u8:
+; CHECK: usubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
+entry:
+ %vmovl.i.i = zext <8 x i8> %b to <8 x i16>
+ %sub.i = sub <8 x i16> %a, %vmovl.i.i
+ ret <8 x i16> %sub.i
+}
+
+define <4 x i32> @test_vsubw_u16(<4 x i32> %a, <4 x i16> %b) {
+; CHECK: test_vsubw_u16:
+; CHECK: usubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
+entry:
+ %vmovl.i.i = zext <4 x i16> %b to <4 x i32>
+ %sub.i = sub <4 x i32> %a, %vmovl.i.i
+ ret <4 x i32> %sub.i
+}
+
+define <2 x i64> @test_vsubw_u32(<2 x i64> %a, <2 x i32> %b) {
+; CHECK: test_vsubw_u32:
+; CHECK: usubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
+entry:
+ %vmovl.i.i = zext <2 x i32> %b to <2 x i64>
+ %sub.i = sub <2 x i64> %a, %vmovl.i.i
+ ret <2 x i64> %sub.i
+}
+
+define <8 x i16> @test_vsubw_high_s8(<8 x i16> %a, <16 x i8> %b) {
+; CHECK: test_vsubw_high_s8:
+; CHECK: ssubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %0 = sext <8 x i8> %shuffle.i.i.i to <8 x i16>
+ %sub.i = sub <8 x i16> %a, %0
+ ret <8 x i16> %sub.i
+}
+
+define <4 x i32> @test_vsubw_high_s16(<4 x i32> %a, <8 x i16> %b) {
+; CHECK: test_vsubw_high_s16:
+; CHECK: ssubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %0 = sext <4 x i16> %shuffle.i.i.i to <4 x i32>
+ %sub.i = sub <4 x i32> %a, %0
+ ret <4 x i32> %sub.i
+}
+
+define <2 x i64> @test_vsubw_high_s32(<2 x i64> %a, <4 x i32> %b) {
+; CHECK: test_vsubw_high_s32:
+; CHECK: ssubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %0 = sext <2 x i32> %shuffle.i.i.i to <2 x i64>
+ %sub.i = sub <2 x i64> %a, %0
+ ret <2 x i64> %sub.i
+}
+
+define <8 x i16> @test_vsubw_high_u8(<8 x i16> %a, <16 x i8> %b) {
+; CHECK: test_vsubw_high_u8:
+; CHECK: usubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %0 = zext <8 x i8> %shuffle.i.i.i to <8 x i16>
+ %sub.i = sub <8 x i16> %a, %0
+ ret <8 x i16> %sub.i
+}
+
+define <4 x i32> @test_vsubw_high_u16(<4 x i32> %a, <8 x i16> %b) {
+; CHECK: test_vsubw_high_u16:
+; CHECK: usubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %0 = zext <4 x i16> %shuffle.i.i.i to <4 x i32>
+ %sub.i = sub <4 x i32> %a, %0
+ ret <4 x i32> %sub.i
+}
+
+define <2 x i64> @test_vsubw_high_u32(<2 x i64> %a, <4 x i32> %b) {
+; CHECK: test_vsubw_high_u32:
+; CHECK: usubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %0 = zext <2 x i32> %shuffle.i.i.i to <2 x i64>
+ %sub.i = sub <2 x i64> %a, %0
+ ret <2 x i64> %sub.i
+}
+
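+; addhn (add, high narrow): the sum is shifted right by half the element width
+; and truncated, keeping the high half of each lane.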
+define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vaddhn_s16:
+; CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vaddhn.i = add <8 x i16> %a, %b
+ %vaddhn1.i = lshr <8 x i16> %vaddhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %vaddhn2.i = trunc <8 x i16> %vaddhn1.i to <8 x i8>
+ ret <8 x i8> %vaddhn2.i
+}
+
+define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vaddhn_s32:
+; CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vaddhn.i = add <4 x i32> %a, %b
+ %vaddhn1.i = lshr <4 x i32> %vaddhn.i, <i32 16, i32 16, i32 16, i32 16>
+ %vaddhn2.i = trunc <4 x i32> %vaddhn1.i to <4 x i16>
+ ret <4 x i16> %vaddhn2.i
+}
+
+define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vaddhn_s64:
+; CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+ %vaddhn.i = add <2 x i64> %a, %b
+ %vaddhn1.i = lshr <2 x i64> %vaddhn.i, <i64 32, i64 32>
+ %vaddhn2.i = trunc <2 x i64> %vaddhn1.i to <2 x i32>
+ ret <2 x i32> %vaddhn2.i
+}
+
+define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vaddhn_u16:
+; CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vaddhn.i = add <8 x i16> %a, %b
+ %vaddhn1.i = lshr <8 x i16> %vaddhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %vaddhn2.i = trunc <8 x i16> %vaddhn1.i to <8 x i8>
+ ret <8 x i8> %vaddhn2.i
+}
+
+define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vaddhn_u32:
+; CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vaddhn.i = add <4 x i32> %a, %b
+ %vaddhn1.i = lshr <4 x i32> %vaddhn.i, <i32 16, i32 16, i32 16, i32 16>
+ %vaddhn2.i = trunc <4 x i32> %vaddhn1.i to <4 x i16>
+ ret <4 x i16> %vaddhn2.i
+}
+
+define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vaddhn_u64:
+; CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+ %vaddhn.i = add <2 x i64> %a, %b
+ %vaddhn1.i = lshr <2 x i64> %vaddhn.i, <i64 32, i64 32>
+ %vaddhn2.i = trunc <2 x i64> %vaddhn1.i to <2 x i32>
+ ret <2 x i32> %vaddhn2.i
+}
+
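+; The addhn2 tests additionally pack the narrowed result into the upper half of
+; the destination, expressed here as bitcasts plus a <2 x i64> shufflevector
+; with the original low half %r.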
+define <16 x i8> @test_vaddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vaddhn_high_s16:
+; CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vaddhn.i.i = add <8 x i16> %a, %b
+ %vaddhn1.i.i = lshr <8 x i16> %vaddhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %vaddhn2.i.i = trunc <8 x i16> %vaddhn1.i.i to <8 x i8>
+ %0 = bitcast <8 x i8> %r to <1 x i64>
+ %1 = bitcast <8 x i8> %vaddhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <8 x i16> @test_vaddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vaddhn_high_s32:
+; CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vaddhn.i.i = add <4 x i32> %a, %b
+ %vaddhn1.i.i = lshr <4 x i32> %vaddhn.i.i, <i32 16, i32 16, i32 16, i32 16>
+ %vaddhn2.i.i = trunc <4 x i32> %vaddhn1.i.i to <4 x i16>
+ %0 = bitcast <4 x i16> %r to <1 x i64>
+ %1 = bitcast <4 x i16> %vaddhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <4 x i32> @test_vaddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vaddhn_high_s64:
+; CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+ %vaddhn.i.i = add <2 x i64> %a, %b
+ %vaddhn1.i.i = lshr <2 x i64> %vaddhn.i.i, <i64 32, i64 32>
+ %vaddhn2.i.i = trunc <2 x i64> %vaddhn1.i.i to <2 x i32>
+ %0 = bitcast <2 x i32> %r to <1 x i64>
+ %1 = bitcast <2 x i32> %vaddhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <16 x i8> @test_vaddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vaddhn_high_u16:
+; CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vaddhn.i.i = add <8 x i16> %a, %b
+ %vaddhn1.i.i = lshr <8 x i16> %vaddhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %vaddhn2.i.i = trunc <8 x i16> %vaddhn1.i.i to <8 x i8>
+ %0 = bitcast <8 x i8> %r to <1 x i64>
+ %1 = bitcast <8 x i8> %vaddhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <8 x i16> @test_vaddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vaddhn_high_u32:
+; CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vaddhn.i.i = add <4 x i32> %a, %b
+ %vaddhn1.i.i = lshr <4 x i32> %vaddhn.i.i, <i32 16, i32 16, i32 16, i32 16>
+ %vaddhn2.i.i = trunc <4 x i32> %vaddhn1.i.i to <4 x i16>
+ %0 = bitcast <4 x i16> %r to <1 x i64>
+ %1 = bitcast <4 x i16> %vaddhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <4 x i32> @test_vaddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vaddhn_high_u64:
+; CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+ %vaddhn.i.i = add <2 x i64> %a, %b
+ %vaddhn1.i.i = lshr <2 x i64> %vaddhn.i.i, <i64 32, i64 32>
+ %vaddhn2.i.i = trunc <2 x i64> %vaddhn1.i.i to <2 x i32>
+ %0 = bitcast <2 x i32> %r to <1 x i64>
+ %1 = bitcast <2 x i32> %vaddhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
+ ret <4 x i32> %2
+}
+
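+; raddhn is the rounding form of addhn; it is reached via the
+; llvm.arm.neon.vraddhn intrinsics rather than plain IR arithmetic.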
+define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vraddhn_s16:
+; CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vraddhn2.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ ret <8 x i8> %vraddhn2.i
+}
+
+define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vraddhn_s32:
+; CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vraddhn2.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ ret <4 x i16> %vraddhn2.i
+}
+
+define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vraddhn_s64:
+; CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+ %vraddhn2.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ ret <2 x i32> %vraddhn2.i
+}
+
+define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vraddhn_u16:
+; CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vraddhn2.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ ret <8 x i8> %vraddhn2.i
+}
+
+define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vraddhn_u32:
+; CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vraddhn2.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ ret <4 x i16> %vraddhn2.i
+}
+
+define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vraddhn_u64:
+; CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+ %vraddhn2.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ ret <2 x i32> %vraddhn2.i
+}
+
+define <16 x i8> @test_vraddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vraddhn_high_s16:
+; CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vraddhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ %0 = bitcast <8 x i8> %r to <1 x i64>
+ %1 = bitcast <8 x i8> %vraddhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <8 x i16> @test_vraddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vraddhn_high_s32:
+; CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vraddhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ %0 = bitcast <4 x i16> %r to <1 x i64>
+ %1 = bitcast <4 x i16> %vraddhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <4 x i32> @test_vraddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vraddhn_high_s64:
+; CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+ %vraddhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ %0 = bitcast <2 x i32> %r to <1 x i64>
+ %1 = bitcast <2 x i32> %vraddhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <16 x i8> @test_vraddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vraddhn_high_u16:
+; CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vraddhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ %0 = bitcast <8 x i8> %r to <1 x i64>
+ %1 = bitcast <8 x i8> %vraddhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <8 x i16> @test_vraddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vraddhn_high_u32:
+; CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vraddhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ %0 = bitcast <4 x i16> %r to <1 x i64>
+ %1 = bitcast <4 x i16> %vraddhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <4 x i32> @test_vraddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vraddhn_high_u64:
+; CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+ %vraddhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ %0 = bitcast <2 x i32> %r to <1 x i64>
+ %1 = bitcast <2 x i32> %vraddhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
+ ret <4 x i32> %2
+}
+
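+; subhn follows the same pattern as addhn: subtract, shift right by half the
+; element width, truncate.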
+define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsubhn_s16:
+; CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vsubhn.i = sub <8 x i16> %a, %b
+ %vsubhn1.i = lshr <8 x i16> %vsubhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %vsubhn2.i = trunc <8 x i16> %vsubhn1.i to <8 x i8>
+ ret <8 x i8> %vsubhn2.i
+}
+
+define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsubhn_s32:
+; CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vsubhn.i = sub <4 x i32> %a, %b
+ %vsubhn1.i = lshr <4 x i32> %vsubhn.i, <i32 16, i32 16, i32 16, i32 16>
+ %vsubhn2.i = trunc <4 x i32> %vsubhn1.i to <4 x i16>
+ ret <4 x i16> %vsubhn2.i
+}
+
+define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vsubhn_s64:
+; CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+ %vsubhn.i = sub <2 x i64> %a, %b
+ %vsubhn1.i = lshr <2 x i64> %vsubhn.i, <i64 32, i64 32>
+ %vsubhn2.i = trunc <2 x i64> %vsubhn1.i to <2 x i32>
+ ret <2 x i32> %vsubhn2.i
+}
+
+define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsubhn_u16:
+; CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vsubhn.i = sub <8 x i16> %a, %b
+ %vsubhn1.i = lshr <8 x i16> %vsubhn.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %vsubhn2.i = trunc <8 x i16> %vsubhn1.i to <8 x i8>
+ ret <8 x i8> %vsubhn2.i
+}
+
+define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsubhn_u32:
+; CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vsubhn.i = sub <4 x i32> %a, %b
+ %vsubhn1.i = lshr <4 x i32> %vsubhn.i, <i32 16, i32 16, i32 16, i32 16>
+ %vsubhn2.i = trunc <4 x i32> %vsubhn1.i to <4 x i16>
+ ret <4 x i16> %vsubhn2.i
+}
+
+define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vsubhn_u64:
+; CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+ %vsubhn.i = sub <2 x i64> %a, %b
+ %vsubhn1.i = lshr <2 x i64> %vsubhn.i, <i64 32, i64 32>
+ %vsubhn2.i = trunc <2 x i64> %vsubhn1.i to <2 x i32>
+ ret <2 x i32> %vsubhn2.i
+}
+
+define <16 x i8> @test_vsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsubhn_high_s16:
+; CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vsubhn.i.i = sub <8 x i16> %a, %b
+ %vsubhn1.i.i = lshr <8 x i16> %vsubhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %vsubhn2.i.i = trunc <8 x i16> %vsubhn1.i.i to <8 x i8>
+ %0 = bitcast <8 x i8> %r to <1 x i64>
+ %1 = bitcast <8 x i8> %vsubhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <8 x i16> @test_vsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsubhn_high_s32:
+; CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vsubhn.i.i = sub <4 x i32> %a, %b
+ %vsubhn1.i.i = lshr <4 x i32> %vsubhn.i.i, <i32 16, i32 16, i32 16, i32 16>
+ %vsubhn2.i.i = trunc <4 x i32> %vsubhn1.i.i to <4 x i16>
+ %0 = bitcast <4 x i16> %r to <1 x i64>
+ %1 = bitcast <4 x i16> %vsubhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <4 x i32> @test_vsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vsubhn_high_s64:
+; CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+ %vsubhn.i.i = sub <2 x i64> %a, %b
+ %vsubhn1.i.i = lshr <2 x i64> %vsubhn.i.i, <i64 32, i64 32>
+ %vsubhn2.i.i = trunc <2 x i64> %vsubhn1.i.i to <2 x i32>
+ %0 = bitcast <2 x i32> %r to <1 x i64>
+ %1 = bitcast <2 x i32> %vsubhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <16 x i8> @test_vsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsubhn_high_u16:
+; CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vsubhn.i.i = sub <8 x i16> %a, %b
+ %vsubhn1.i.i = lshr <8 x i16> %vsubhn.i.i, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %vsubhn2.i.i = trunc <8 x i16> %vsubhn1.i.i to <8 x i8>
+ %0 = bitcast <8 x i8> %r to <1 x i64>
+ %1 = bitcast <8 x i8> %vsubhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <8 x i16> @test_vsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsubhn_high_u32:
+; CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vsubhn.i.i = sub <4 x i32> %a, %b
+ %vsubhn1.i.i = lshr <4 x i32> %vsubhn.i.i, <i32 16, i32 16, i32 16, i32 16>
+ %vsubhn2.i.i = trunc <4 x i32> %vsubhn1.i.i to <4 x i16>
+ %0 = bitcast <4 x i16> %r to <1 x i64>
+ %1 = bitcast <4 x i16> %vsubhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <4 x i32> @test_vsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vsubhn_high_u64:
+; CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+ %vsubhn.i.i = sub <2 x i64> %a, %b
+ %vsubhn1.i.i = lshr <2 x i64> %vsubhn.i.i, <i64 32, i64 32>
+ %vsubhn2.i.i = trunc <2 x i64> %vsubhn1.i.i to <2 x i32>
+ %0 = bitcast <2 x i32> %r to <1 x i64>
+ %1 = bitcast <2 x i32> %vsubhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
+ ret <4 x i32> %2
+}
+
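+; rsubhn, the rounding narrowing subtract, again goes through an intrinsic
+; (llvm.arm.neon.vrsubhn).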
+define <8 x i8> @test_vrsubhn_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vrsubhn_s16:
+; CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vrsubhn2.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ ret <8 x i8> %vrsubhn2.i
+}
+
+define <4 x i16> @test_vrsubhn_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vrsubhn_s32:
+; CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vrsubhn2.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ ret <4 x i16> %vrsubhn2.i
+}
+
+define <2 x i32> @test_vrsubhn_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vrsubhn_s64:
+; CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+ %vrsubhn2.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ ret <2 x i32> %vrsubhn2.i
+}
+
+define <8 x i8> @test_vrsubhn_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vrsubhn_u16:
+; CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vrsubhn2.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ ret <8 x i8> %vrsubhn2.i
+}
+
+define <4 x i16> @test_vrsubhn_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vrsubhn_u32:
+; CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vrsubhn2.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ ret <4 x i16> %vrsubhn2.i
+}
+
+define <2 x i32> @test_vrsubhn_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vrsubhn_u64:
+; CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+ %vrsubhn2.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ ret <2 x i32> %vrsubhn2.i
+}
+
+define <16 x i8> @test_vrsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vrsubhn_high_s16:
+; CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vrsubhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ %0 = bitcast <8 x i8> %r to <1 x i64>
+ %1 = bitcast <8 x i8> %vrsubhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <8 x i16> @test_vrsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vrsubhn_high_s32:
+; CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vrsubhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ %0 = bitcast <4 x i16> %r to <1 x i64>
+ %1 = bitcast <4 x i16> %vrsubhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <4 x i32> @test_vrsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vrsubhn_high_s64:
+; CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+ %vrsubhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ %0 = bitcast <2 x i32> %r to <1 x i64>
+ %1 = bitcast <2 x i32> %vrsubhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <16 x i8> @test_vrsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vrsubhn_high_u16:
+; CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vrsubhn2.i.i = tail call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+ %0 = bitcast <8 x i8> %r to <1 x i64>
+ %1 = bitcast <8 x i8> %vrsubhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <16 x i8>
+ ret <16 x i8> %2
+}
+
+define <8 x i16> @test_vrsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vrsubhn_high_u32:
+; CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vrsubhn2.i.i = tail call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+ %0 = bitcast <4 x i16> %r to <1 x i64>
+ %1 = bitcast <4 x i16> %vrsubhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <4 x i32> @test_vrsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vrsubhn_high_u64:
+; CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+entry:
+ %vrsubhn2.i.i = tail call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+ %0 = bitcast <2 x i32> %r to <1 x i64>
+ %1 = bitcast <2 x i32> %vrsubhn2.i.i to <1 x i64>
+ %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
+ %2 = bitcast <2 x i64> %shuffle.i.i to <4 x i32>
+ ret <4 x i32> %2
+}
+
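+; vabdl: absolute difference then lengthen. Note that even the signed tests
+; zero-extend the intrinsic result, since the absolute difference is already
+; non-negative.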
+define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vabdl_s8:
+; CHECK: sabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vabd.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b)
+ %vmovl.i.i = zext <8 x i8> %vabd.i.i to <8 x i16>
+ ret <8 x i16> %vmovl.i.i
+}
+
+define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vabdl_s16:
+; CHECK: sabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vabd2.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> %b)
+ %vmovl.i.i = zext <4 x i16> %vabd2.i.i to <4 x i32>
+ ret <4 x i32> %vmovl.i.i
+}
+
+define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vabdl_s32:
+; CHECK: sabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+ %vabd2.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %a, <2 x i32> %b)
+ %vmovl.i.i = zext <2 x i32> %vabd2.i.i to <2 x i64>
+ ret <2 x i64> %vmovl.i.i
+}
+
+define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vabdl_u8:
+; CHECK: uabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vabd.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b)
+ %vmovl.i.i = zext <8 x i8> %vabd.i.i to <8 x i16>
+ ret <8 x i16> %vmovl.i.i
+}
+
+define <4 x i32> @test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vabdl_u16:
+; CHECK: uabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vabd2.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b)
+ %vmovl.i.i = zext <4 x i16> %vabd2.i.i to <4 x i32>
+ ret <4 x i32> %vmovl.i.i
+}
+
+define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vabdl_u32:
+; CHECK: uabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+ %vabd2.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b)
+ %vmovl.i.i = zext <2 x i32> %vabd2.i.i to <2 x i64>
+ ret <2 x i64> %vmovl.i.i
+}
+
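+; vabal accumulates the widened absolute difference into the first operand
+; (sabal/uabal).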
+define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vabal_s8:
+; CHECK: sabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c)
+ %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
+ %add.i = add <8 x i16> %vmovl.i.i.i, %a
+ ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
+; CHECK: test_vabal_s16:
+; CHECK: sabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %b, <4 x i16> %c)
+ %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
+ %add.i = add <4 x i32> %vmovl.i.i.i, %a
+ ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK: test_vabal_s32:
+; CHECK: sabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+ %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %b, <2 x i32> %c)
+ %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
+ %add.i = add <2 x i64> %vmovl.i.i.i, %a
+ ret <2 x i64> %add.i
+}
+
+define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vabal_u8:
+; CHECK: uabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c)
+ %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
+ %add.i = add <8 x i16> %vmovl.i.i.i, %a
+ ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
+; CHECK: test_vabal_u16:
+; CHECK: uabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %b, <4 x i16> %c)
+ %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
+ %add.i = add <4 x i32> %vmovl.i.i.i, %a
+ ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK: test_vabal_u32:
+; CHECK: uabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+ %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %b, <2 x i32> %c)
+ %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
+ %add.i = add <2 x i64> %vmovl.i.i.i, %a
+ ret <2 x i64> %add.i
+}
+
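+; The _high variants of vabdl/vabal take the upper lanes of both 128-bit
+; sources and should select the sabdl2/uabdl2 and sabal2/uabal2 forms.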
+define <8 x i16> @test_vabdl_high_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vabdl_high_s8:
+; CHECK: sabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
+ ret <8 x i16> %vmovl.i.i.i
+}
+
+define <4 x i32> @test_vabdl_high_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vabdl_high_s16:
+; CHECK: sabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
+ ret <4 x i32> %vmovl.i.i.i
+}
+
+define <2 x i64> @test_vabdl_high_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vabdl_high_s32:
+; CHECK: sabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
+ ret <2 x i64> %vmovl.i.i.i
+}
+
+define <8 x i16> @test_vabdl_high_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vabdl_high_u8:
+; CHECK: uabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vabd.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
+ ret <8 x i16> %vmovl.i.i.i
+}
+
+define <4 x i32> @test_vabdl_high_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vabdl_high_u16:
+; CHECK: uabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vabd2.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
+ ret <4 x i32> %vmovl.i.i.i
+}
+
+define <2 x i64> @test_vabdl_high_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vabdl_high_u32:
+; CHECK: uabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %vabd2.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
+ ret <2 x i64> %vmovl.i.i.i
+}
+
+define <8 x i16> @test_vabal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK: test_vabal_high_s8:
+; CHECK: sabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vabd.i.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vmovl.i.i.i.i = zext <8 x i8> %vabd.i.i.i.i to <8 x i16>
+ %add.i.i = add <8 x i16> %vmovl.i.i.i.i, %a
+ ret <8 x i16> %add.i.i
+}
+
+define <4 x i32> @test_vabal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK: test_vabal_high_s16:
+; CHECK: sabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vabd2.i.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vmovl.i.i.i.i = zext <4 x i16> %vabd2.i.i.i.i to <4 x i32>
+ %add.i.i = add <4 x i32> %vmovl.i.i.i.i, %a
+ ret <4 x i32> %add.i.i
+}
+
+define <2 x i64> @test_vabal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK: test_vabal_high_s32:
+; CHECK: sabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %vabd2.i.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vmovl.i.i.i.i = zext <2 x i32> %vabd2.i.i.i.i to <2 x i64>
+ %add.i.i = add <2 x i64> %vmovl.i.i.i.i, %a
+ ret <2 x i64> %add.i.i
+}
+
+define <8 x i16> @test_vabal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK: test_vabal_high_u8:
+; CHECK: uabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vabd.i.i.i.i = tail call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %vmovl.i.i.i.i = zext <8 x i8> %vabd.i.i.i.i to <8 x i16>
+ %add.i.i = add <8 x i16> %vmovl.i.i.i.i, %a
+ ret <8 x i16> %add.i.i
+}
+
+define <4 x i32> @test_vabal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK: test_vabal_high_u16:
+; CHECK: uabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vabd2.i.i.i.i = tail call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vmovl.i.i.i.i = zext <4 x i16> %vabd2.i.i.i.i to <4 x i32>
+ %add.i.i = add <4 x i32> %vmovl.i.i.i.i, %a
+ ret <4 x i32> %add.i.i
+}
+
+define <2 x i64> @test_vabal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK: test_vabal_high_u32:
+; CHECK: uabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %vabd2.i.i.i.i = tail call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vmovl.i.i.i.i = zext <2 x i32> %vabd2.i.i.i.i to <2 x i64>
+ %add.i.i = add <2 x i64> %vmovl.i.i.i.i, %a
+ ret <2 x i64> %add.i.i
+}
+
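+; vmull: long multiply via the llvm.arm.neon.vmulls/vmullu intrinsics; the
+; _high variants below feed the upper lanes to the same intrinsics.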
+define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vmull_s8:
+; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %a, <8 x i8> %b)
+ ret <8 x i16> %vmull.i
+}
+
+define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vmull_s16:
+; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %b)
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vmull_s32:
+; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %b)
+ ret <2 x i64> %vmull2.i
+}
+
+define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vmull_u8:
+; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %a, <8 x i8> %b)
+ ret <8 x i16> %vmull.i
+}
+
+define <4 x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vmull_u16:
+; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vmull2.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %b)
+ ret <4 x i32> %vmull2.i
+}
+
+define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vmull_u32:
+; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+ %vmull2.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %b)
+ ret <2 x i64> %vmull2.i
+}
+
+define <8 x i16> @test_vmull_high_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vmull_high_s8:
+; CHECK: smull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ ret <8 x i16> %vmull.i.i
+}
+
+define <4 x i32> @test_vmull_high_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vmull_high_s16:
+; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ ret <4 x i32> %vmull2.i.i
+}
+
+define <2 x i64> @test_vmull_high_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vmull_high_s32:
+; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ ret <2 x i64> %vmull2.i.i
+}
+
+define <8 x i16> @test_vmull_high_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vmull_high_u8:
+; CHECK: umull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ ret <8 x i16> %vmull.i.i
+}
+
+define <4 x i32> @test_vmull_high_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vmull_high_u16:
+; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ ret <4 x i32> %vmull2.i.i
+}
+
+define <2 x i64> @test_vmull_high_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vmull_high_u32:
+; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ ret <2 x i64> %vmull2.i.i
+}
+
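+; vmlal: the long multiply followed by an add of the accumulator should fold
+; into a single smlal/umlal (and smlal2/umlal2 for the _high forms).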
+define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vmlal_s8:
+; CHECK: smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c)
+ %add.i = add <8 x i16> %vmull.i.i, %a
+ ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
+; CHECK: test_vmlal_s16:
+; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c)
+ %add.i = add <4 x i32> %vmull2.i.i, %a
+ ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK: test_vmlal_s32:
+; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+ %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c)
+ %add.i = add <2 x i64> %vmull2.i.i, %a
+ ret <2 x i64> %add.i
+}
+
+define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vmlal_u8:
+; CHECK: umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c)
+ %add.i = add <8 x i16> %vmull.i.i, %a
+ ret <8 x i16> %add.i
+}
+
+define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
+; CHECK: test_vmlal_u16:
+; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c)
+ %add.i = add <4 x i32> %vmull2.i.i, %a
+ ret <4 x i32> %add.i
+}
+
+define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK: test_vmlal_u32:
+; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+ %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c)
+ %add.i = add <2 x i64> %vmull2.i.i, %a
+ ret <2 x i64> %add.i
+}
+
+define <8 x i16> @test_vmlal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK: test_vmlal_high_s8:
+; CHECK: smlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %add.i.i = add <8 x i16> %vmull.i.i.i, %a
+ ret <8 x i16> %add.i.i
+}
+
+define <4 x i32> @test_vmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK: test_vmlal_high_s16:
+; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
+ ret <4 x i32> %add.i.i
+}
+
+define <2 x i64> @test_vmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK: test_vmlal_high_s32:
+; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
+ ret <2 x i64> %add.i.i
+}
+
+define <8 x i16> @test_vmlal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK: test_vmlal_high_u8:
+; CHECK: umlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %add.i.i = add <8 x i16> %vmull.i.i.i, %a
+ ret <8 x i16> %add.i.i
+}
+
+define <4 x i32> @test_vmlal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK: test_vmlal_high_u16:
+; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
+ ret <4 x i32> %add.i.i
+}
+
+define <2 x i64> @test_vmlal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK: test_vmlal_high_u32:
+; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
+ ret <2 x i64> %add.i.i
+}
+
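+; vmlsl: as above, but the multiply result is subtracted from the accumulator,
+; selecting smlsl/umlsl (and the "2" forms for the upper halves).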
+define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vmlsl_s8:
+; CHECK: smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c)
+ %sub.i = sub <8 x i16> %a, %vmull.i.i
+ ret <8 x i16> %sub.i
+}
+
+define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
+; CHECK: test_vmlsl_s16:
+; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c)
+ %sub.i = sub <4 x i32> %a, %vmull2.i.i
+ ret <4 x i32> %sub.i
+}
+
+define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK: test_vmlsl_s32:
+; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+ %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c)
+ %sub.i = sub <2 x i64> %a, %vmull2.i.i
+ ret <2 x i64> %sub.i
+}
+
+define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vmlsl_u8:
+; CHECK: umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c)
+ %sub.i = sub <8 x i16> %a, %vmull.i.i
+ ret <8 x i16> %sub.i
+}
+
+define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
+; CHECK: test_vmlsl_u16:
+; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c)
+ %sub.i = sub <4 x i32> %a, %vmull2.i.i
+ ret <4 x i32> %sub.i
+}
+
+define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK: test_vmlsl_u32:
+; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+ %vmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c)
+ %sub.i = sub <2 x i64> %a, %vmull2.i.i
+ ret <2 x i64> %sub.i
+}
+
+define <8 x i16> @test_vmlsl_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK: test_vmlsl_high_s8:
+; CHECK: smlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %sub.i.i = sub <8 x i16> %a, %vmull.i.i.i
+ ret <8 x i16> %sub.i.i
+}
+
+define <4 x i32> @test_vmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK: test_vmlsl_high_s16:
+; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
+ ret <4 x i32> %sub.i.i
+}
+
+define <2 x i64> @test_vmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK: test_vmlsl_high_s32:
+; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
+ ret <2 x i64> %sub.i.i
+}
+
+define <8 x i16> @test_vmlsl_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK: test_vmlsl_high_u8:
+; CHECK: umlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vmull.i.i.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ %sub.i.i = sub <8 x i16> %a, %vmull.i.i.i
+ ret <8 x i16> %sub.i.i
+}
+
+define <4 x i32> @test_vmlsl_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK: test_vmlsl_high_u16:
+; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
+ ret <4 x i32> %sub.i.i
+}
+
+define <2 x i64> @test_vmlsl_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK: test_vmlsl_high_u32:
+; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
+ ret <2 x i64> %sub.i.i
+}
+
+define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vqdmull_s16:
+; CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vqdmull2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %b)
+ ret <4 x i32> %vqdmull2.i
+}
+
+define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vqdmull_s32:
+; CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+ %vqdmull2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %b)
+ ret <2 x i64> %vqdmull2.i
+}
+
+define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
+; CHECK: test_vqdmlal_s16:
+; CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vqdmlal2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
+ %vqdmlal4.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+ ret <4 x i32> %vqdmlal4.i
+}
+
+define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK: test_vqdmlal_s32:
+; CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+ %vqdmlal2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
+ %vqdmlal4.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+ ret <2 x i64> %vqdmlal4.i
+}
+
+define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) {
+; CHECK: test_vqdmlsl_s16:
+; CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vqdmlsl2.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
+ %vqdmlsl4.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+ ret <4 x i32> %vqdmlsl4.i
+}
+
+define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK: test_vqdmlsl_s32:
+; CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+ %vqdmlsl2.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
+ %vqdmlsl4.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+ ret <2 x i64> %vqdmlsl4.i
+}
+
+define <4 x i32> @test_vqdmull_high_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vqdmull_high_s16:
+; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vqdmull2.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ ret <4 x i32> %vqdmull2.i.i
+}
+
+define <2 x i64> @test_vqdmull_high_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vqdmull_high_s32:
+; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %vqdmull2.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ ret <2 x i64> %vqdmull2.i.i
+}
+
+define <4 x i32> @test_vqdmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK: test_vqdmlal_high_s16:
+; CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vqdmlal2.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vqdmlal4.i.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i.i)
+ ret <4 x i32> %vqdmlal4.i.i
+}
+
+define <2 x i64> @test_vqdmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK: test_vqdmlal_high_s32:
+; CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %vqdmlal2.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vqdmlal4.i.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i.i)
+ ret <2 x i64> %vqdmlal4.i.i
+}
+
+define <4 x i32> @test_vqdmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) {
+; CHECK: test_vqdmlsl_high_s16:
+; CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vqdmlsl2.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+ %vqdmlsl4.i.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i.i)
+ ret <4 x i32> %vqdmlsl4.i.i
+}
+
+define <2 x i64> @test_vqdmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK: test_vqdmlsl_high_s32:
+; CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %vqdmlsl2.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+ %vqdmlsl4.i.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i.i)
+ ret <2 x i64> %vqdmlsl4.i.i
+}
+
+define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vmull_p8:
+; CHECK: pmull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vmull.i = tail call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %a, <8 x i8> %b)
+ ret <8 x i16> %vmull.i
+}
+
+define <8 x i16> @test_vmull_high_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vmull_high_p8:
+; CHECK: pmull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vmull.i.i = tail call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+ ret <8 x i16> %vmull.i.i
+}
+
diff --git a/test/CodeGen/AArch64/neon-aba-abd.ll b/test/CodeGen/AArch64/neon-aba-abd.ll
index b423666..5400984 100644
--- a/test/CodeGen/AArch64/neon-aba-abd.ll
+++ b/test/CodeGen/AArch64/neon-aba-abd.ll
@@ -157,6 +157,16 @@ define <2 x i32> @test_sabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
ret <2 x i32> %abd
}
+define <2 x i32> @test_sabd_v2i32_const() {
+; CHECK: test_sabd_v2i32_const:
+; CHECK: movi d1, #0xffffffff0000
+; CHECK-NEXT: sabd v0.2s, v0.2s, v1.2s
+ %1 = tail call <2 x i32> @llvm.arm.neon.vabds.v2i32(
+ <2 x i32> <i32 -2147483648, i32 2147450880>,
+ <2 x i32> <i32 -65536, i32 65535>)
+ ret <2 x i32> %1
+}
+
define <2 x i32> @test_saba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; CHECK: test_saba_v2i32:
%abd = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
@@ -223,4 +233,4 @@ define <2 x double> @test_fabd_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
%abd = call <2 x double> @llvm.arm.neon.vabds.v2f64(<2 x double> %lhs, <2 x double> %rhs)
; CHECK: fabd v0.2d, v0.2d, v1.2d
ret <2 x double> %abd
-}
\ No newline at end of file
+}
diff --git a/test/CodeGen/AArch64/neon-across.ll b/test/CodeGen/AArch64/neon-across.ll
new file mode 100644
index 0000000..733db97
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-across.ll
@@ -0,0 +1,476 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+declare <1 x float> @llvm.aarch64.neon.vminnmv.v1f32.v4f32(<4 x float>)
+
+declare <1 x float> @llvm.aarch64.neon.vmaxnmv.v1f32.v4f32(<4 x float>)
+
+declare <1 x float> @llvm.aarch64.neon.vminv.v1f32.v4f32(<4 x float>)
+
+declare <1 x float> @llvm.aarch64.neon.vmaxv.v1f32.v4f32(<4 x float>)
+
+declare <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v4i32(<4 x i32>)
+
+declare <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v8i16(<8 x i16>)
+
+declare <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v16i8(<16 x i8>)
+
+declare <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v4i16(<4 x i16>)
+
+declare <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v8i8(<8 x i8>)
+
+declare <1 x i32> @llvm.aarch64.neon.uminv.v1i32.v4i32(<4 x i32>)
+
+declare <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v8i16(<8 x i16>)
+
+declare <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v16i8(<16 x i8>)
+
+declare <1 x i32> @llvm.aarch64.neon.sminv.v1i32.v4i32(<4 x i32>)
+
+declare <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v8i16(<8 x i16>)
+
+declare <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v16i8(<16 x i8>)
+
+declare <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v4i16(<4 x i16>)
+
+declare <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v8i8(<8 x i8>)
+
+declare <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v4i16(<4 x i16>)
+
+declare <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v8i8(<8 x i8>)
+
+declare <1 x i32> @llvm.aarch64.neon.umaxv.v1i32.v4i32(<4 x i32>)
+
+declare <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v8i16(<8 x i16>)
+
+declare <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v16i8(<16 x i8>)
+
+declare <1 x i32> @llvm.aarch64.neon.smaxv.v1i32.v4i32(<4 x i32>)
+
+declare <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v8i16(<8 x i16>)
+
+declare <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v16i8(<16 x i8>)
+
+declare <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v4i16(<4 x i16>)
+
+declare <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v8i8(<8 x i8>)
+
+declare <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v4i16(<4 x i16>)
+
+declare <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v8i8(<8 x i8>)
+
+declare <1 x i64> @llvm.aarch64.neon.uaddlv.v1i64.v4i32(<4 x i32>)
+
+declare <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v8i16(<8 x i16>)
+
+declare <1 x i16> @llvm.aarch64.neon.uaddlv.v1i16.v16i8(<16 x i8>)
+
+declare <1 x i64> @llvm.aarch64.neon.saddlv.v1i64.v4i32(<4 x i32>)
+
+declare <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v8i16(<8 x i16>)
+
+declare <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v16i8(<16 x i8>)
+
+declare <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v4i16(<4 x i16>)
+
+declare <1 x i16> @llvm.aarch64.neon.uaddlv.v1i16.v8i8(<8 x i8>)
+
+declare <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v4i16(<4 x i16>)
+
+declare <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v8i8(<8 x i8>)
+
+define i16 @test_vaddlv_s8(<8 x i8> %a) {
+; CHECK: test_vaddlv_s8:
+; CHECK: saddlv h{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %saddlv.i = tail call <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v8i8(<8 x i8> %a)
+ %0 = extractelement <1 x i16> %saddlv.i, i32 0
+ ret i16 %0
+}
+
+define i32 @test_vaddlv_s16(<4 x i16> %a) {
+; CHECK: test_vaddlv_s16:
+; CHECK: saddlv s{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %saddlv.i = tail call <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v4i16(<4 x i16> %a)
+ %0 = extractelement <1 x i32> %saddlv.i, i32 0
+ ret i32 %0
+}
+
+define i16 @test_vaddlv_u8(<8 x i8> %a) {
+; CHECK: test_vaddlv_u8:
+; CHECK: uaddlv h{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %uaddlv.i = tail call <1 x i16> @llvm.aarch64.neon.uaddlv.v1i16.v8i8(<8 x i8> %a)
+ %0 = extractelement <1 x i16> %uaddlv.i, i32 0
+ ret i16 %0
+}
+
+define i32 @test_vaddlv_u16(<4 x i16> %a) {
+; CHECK: test_vaddlv_u16:
+; CHECK: uaddlv s{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %uaddlv.i = tail call <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v4i16(<4 x i16> %a)
+ %0 = extractelement <1 x i32> %uaddlv.i, i32 0
+ ret i32 %0
+}
+
+define i16 @test_vaddlvq_s8(<16 x i8> %a) {
+; CHECK: test_vaddlvq_s8:
+; CHECK: saddlv h{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %saddlv.i = tail call <1 x i16> @llvm.aarch64.neon.saddlv.v1i16.v16i8(<16 x i8> %a)
+ %0 = extractelement <1 x i16> %saddlv.i, i32 0
+ ret i16 %0
+}
+
+define i32 @test_vaddlvq_s16(<8 x i16> %a) {
+; CHECK: test_vaddlvq_s16:
+; CHECK: saddlv s{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %saddlv.i = tail call <1 x i32> @llvm.aarch64.neon.saddlv.v1i32.v8i16(<8 x i16> %a)
+ %0 = extractelement <1 x i32> %saddlv.i, i32 0
+ ret i32 %0
+}
+
+define i64 @test_vaddlvq_s32(<4 x i32> %a) {
+; CHECK: test_vaddlvq_s32:
+; CHECK: saddlv d{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %saddlv.i = tail call <1 x i64> @llvm.aarch64.neon.saddlv.v1i64.v4i32(<4 x i32> %a)
+ %0 = extractelement <1 x i64> %saddlv.i, i32 0
+ ret i64 %0
+}
+
+define i16 @test_vaddlvq_u8(<16 x i8> %a) {
+; CHECK: test_vaddlvq_u8:
+; CHECK: uaddlv h{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %uaddlv.i = tail call <1 x i16> @llvm.aarch64.neon.uaddlv.v1i16.v16i8(<16 x i8> %a)
+ %0 = extractelement <1 x i16> %uaddlv.i, i32 0
+ ret i16 %0
+}
+
+define i32 @test_vaddlvq_u16(<8 x i16> %a) {
+; CHECK: test_vaddlvq_u16:
+; CHECK: uaddlv s{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %uaddlv.i = tail call <1 x i32> @llvm.aarch64.neon.uaddlv.v1i32.v8i16(<8 x i16> %a)
+ %0 = extractelement <1 x i32> %uaddlv.i, i32 0
+ ret i32 %0
+}
+
+define i64 @test_vaddlvq_u32(<4 x i32> %a) {
+; CHECK: test_vaddlvq_u32:
+; CHECK: uaddlv d{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %uaddlv.i = tail call <1 x i64> @llvm.aarch64.neon.uaddlv.v1i64.v4i32(<4 x i32> %a)
+ %0 = extractelement <1 x i64> %uaddlv.i, i32 0
+ ret i64 %0
+}
+
+define i8 @test_vmaxv_s8(<8 x i8> %a) {
+; CHECK: test_vmaxv_s8:
+; CHECK: smaxv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %smaxv.i = tail call <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v8i8(<8 x i8> %a)
+ %0 = extractelement <1 x i8> %smaxv.i, i32 0
+ ret i8 %0
+}
+
+define i16 @test_vmaxv_s16(<4 x i16> %a) {
+; CHECK: test_vmaxv_s16:
+; CHECK: smaxv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %smaxv.i = tail call <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v4i16(<4 x i16> %a)
+ %0 = extractelement <1 x i16> %smaxv.i, i32 0
+ ret i16 %0
+}
+
+define i8 @test_vmaxv_u8(<8 x i8> %a) {
+; CHECK: test_vmaxv_u8:
+; CHECK: umaxv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %umaxv.i = tail call <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v8i8(<8 x i8> %a)
+ %0 = extractelement <1 x i8> %umaxv.i, i32 0
+ ret i8 %0
+}
+
+define i16 @test_vmaxv_u16(<4 x i16> %a) {
+; CHECK: test_vmaxv_u16:
+; CHECK: umaxv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %umaxv.i = tail call <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v4i16(<4 x i16> %a)
+ %0 = extractelement <1 x i16> %umaxv.i, i32 0
+ ret i16 %0
+}
+
+define i8 @test_vmaxvq_s8(<16 x i8> %a) {
+; CHECK: test_vmaxvq_s8:
+; CHECK: smaxv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %smaxv.i = tail call <1 x i8> @llvm.aarch64.neon.smaxv.v1i8.v16i8(<16 x i8> %a)
+ %0 = extractelement <1 x i8> %smaxv.i, i32 0
+ ret i8 %0
+}
+
+define i16 @test_vmaxvq_s16(<8 x i16> %a) {
+; CHECK: test_vmaxvq_s16:
+; CHECK: smaxv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %smaxv.i = tail call <1 x i16> @llvm.aarch64.neon.smaxv.v1i16.v8i16(<8 x i16> %a)
+ %0 = extractelement <1 x i16> %smaxv.i, i32 0
+ ret i16 %0
+}
+
+define i32 @test_vmaxvq_s32(<4 x i32> %a) {
+; CHECK: test_vmaxvq_s32:
+; CHECK: smaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %smaxv.i = tail call <1 x i32> @llvm.aarch64.neon.smaxv.v1i32.v4i32(<4 x i32> %a)
+ %0 = extractelement <1 x i32> %smaxv.i, i32 0
+ ret i32 %0
+}
+
+define i8 @test_vmaxvq_u8(<16 x i8> %a) {
+; CHECK: test_vmaxvq_u8:
+; CHECK: umaxv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %umaxv.i = tail call <1 x i8> @llvm.aarch64.neon.umaxv.v1i8.v16i8(<16 x i8> %a)
+ %0 = extractelement <1 x i8> %umaxv.i, i32 0
+ ret i8 %0
+}
+
+define i16 @test_vmaxvq_u16(<8 x i16> %a) {
+; CHECK: test_vmaxvq_u16:
+; CHECK: umaxv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %umaxv.i = tail call <1 x i16> @llvm.aarch64.neon.umaxv.v1i16.v8i16(<8 x i16> %a)
+ %0 = extractelement <1 x i16> %umaxv.i, i32 0
+ ret i16 %0
+}
+
+define i32 @test_vmaxvq_u32(<4 x i32> %a) {
+; CHECK: test_vmaxvq_u32:
+; CHECK: umaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %umaxv.i = tail call <1 x i32> @llvm.aarch64.neon.umaxv.v1i32.v4i32(<4 x i32> %a)
+ %0 = extractelement <1 x i32> %umaxv.i, i32 0
+ ret i32 %0
+}
+
+define i8 @test_vminv_s8(<8 x i8> %a) {
+; CHECK: test_vminv_s8:
+; CHECK: sminv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %sminv.i = tail call <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v8i8(<8 x i8> %a)
+ %0 = extractelement <1 x i8> %sminv.i, i32 0
+ ret i8 %0
+}
+
+define i16 @test_vminv_s16(<4 x i16> %a) {
+; CHECK: test_vminv_s16:
+; CHECK: sminv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %sminv.i = tail call <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v4i16(<4 x i16> %a)
+ %0 = extractelement <1 x i16> %sminv.i, i32 0
+ ret i16 %0
+}
+
+define i8 @test_vminv_u8(<8 x i8> %a) {
+; CHECK: test_vminv_u8:
+; CHECK: uminv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %uminv.i = tail call <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v8i8(<8 x i8> %a)
+ %0 = extractelement <1 x i8> %uminv.i, i32 0
+ ret i8 %0
+}
+
+define i16 @test_vminv_u16(<4 x i16> %a) {
+; CHECK: test_vminv_u16:
+; CHECK: uminv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %uminv.i = tail call <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v4i16(<4 x i16> %a)
+ %0 = extractelement <1 x i16> %uminv.i, i32 0
+ ret i16 %0
+}
+
+define i8 @test_vminvq_s8(<16 x i8> %a) {
+; CHECK: test_vminvq_s8:
+; CHECK: sminv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %sminv.i = tail call <1 x i8> @llvm.aarch64.neon.sminv.v1i8.v16i8(<16 x i8> %a)
+ %0 = extractelement <1 x i8> %sminv.i, i32 0
+ ret i8 %0
+}
+
+define i16 @test_vminvq_s16(<8 x i16> %a) {
+; CHECK: test_vminvq_s16:
+; CHECK: sminv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %sminv.i = tail call <1 x i16> @llvm.aarch64.neon.sminv.v1i16.v8i16(<8 x i16> %a)
+ %0 = extractelement <1 x i16> %sminv.i, i32 0
+ ret i16 %0
+}
+
+define i32 @test_vminvq_s32(<4 x i32> %a) {
+; CHECK: test_vminvq_s32:
+; CHECK: sminv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %sminv.i = tail call <1 x i32> @llvm.aarch64.neon.sminv.v1i32.v4i32(<4 x i32> %a)
+ %0 = extractelement <1 x i32> %sminv.i, i32 0
+ ret i32 %0
+}
+
+define i8 @test_vminvq_u8(<16 x i8> %a) {
+; CHECK: test_vminvq_u8:
+; CHECK: uminv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %uminv.i = tail call <1 x i8> @llvm.aarch64.neon.uminv.v1i8.v16i8(<16 x i8> %a)
+ %0 = extractelement <1 x i8> %uminv.i, i32 0
+ ret i8 %0
+}
+
+define i16 @test_vminvq_u16(<8 x i16> %a) {
+; CHECK: test_vminvq_u16:
+; CHECK: uminv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %uminv.i = tail call <1 x i16> @llvm.aarch64.neon.uminv.v1i16.v8i16(<8 x i16> %a)
+ %0 = extractelement <1 x i16> %uminv.i, i32 0
+ ret i16 %0
+}
+
+define i32 @test_vminvq_u32(<4 x i32> %a) {
+; CHECK: test_vminvq_u32:
+; CHECK: uminv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %uminv.i = tail call <1 x i32> @llvm.aarch64.neon.uminv.v1i32.v4i32(<4 x i32> %a)
+ %0 = extractelement <1 x i32> %uminv.i, i32 0
+ ret i32 %0
+}
+
+define i8 @test_vaddv_s8(<8 x i8> %a) {
+; CHECK: test_vaddv_s8:
+; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v8i8(<8 x i8> %a)
+ %0 = extractelement <1 x i8> %vaddv.i, i32 0
+ ret i8 %0
+}
+
+define i16 @test_vaddv_s16(<4 x i16> %a) {
+; CHECK: test_vaddv_s16:
+; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %vaddv.i = tail call <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v4i16(<4 x i16> %a)
+ %0 = extractelement <1 x i16> %vaddv.i, i32 0
+ ret i16 %0
+}
+
+define i8 @test_vaddv_u8(<8 x i8> %a) {
+; CHECK: test_vaddv_u8:
+; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+ %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v8i8(<8 x i8> %a)
+ %0 = extractelement <1 x i8> %vaddv.i, i32 0
+ ret i8 %0
+}
+
+define i16 @test_vaddv_u16(<4 x i16> %a) {
+; CHECK: test_vaddv_u16:
+; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+ %vaddv.i = tail call <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v4i16(<4 x i16> %a)
+ %0 = extractelement <1 x i16> %vaddv.i, i32 0
+ ret i16 %0
+}
+
+define i8 @test_vaddvq_s8(<16 x i8> %a) {
+; CHECK: test_vaddvq_s8:
+; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v16i8(<16 x i8> %a)
+ %0 = extractelement <1 x i8> %vaddv.i, i32 0
+ ret i8 %0
+}
+
+define i16 @test_vaddvq_s16(<8 x i16> %a) {
+; CHECK: test_vaddvq_s16:
+; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %vaddv.i = tail call <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v8i16(<8 x i16> %a)
+ %0 = extractelement <1 x i16> %vaddv.i, i32 0
+ ret i16 %0
+}
+
+define i32 @test_vaddvq_s32(<4 x i32> %a) {
+; CHECK: test_vaddvq_s32:
+; CHECK: addv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %vaddv.i = tail call <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v4i32(<4 x i32> %a)
+ %0 = extractelement <1 x i32> %vaddv.i, i32 0
+ ret i32 %0
+}
+
+define i8 @test_vaddvq_u8(<16 x i8> %a) {
+; CHECK: test_vaddvq_u8:
+; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+ %vaddv.i = tail call <1 x i8> @llvm.aarch64.neon.vaddv.v1i8.v16i8(<16 x i8> %a)
+ %0 = extractelement <1 x i8> %vaddv.i, i32 0
+ ret i8 %0
+}
+
+define i16 @test_vaddvq_u16(<8 x i16> %a) {
+; CHECK: test_vaddvq_u16:
+; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+ %vaddv.i = tail call <1 x i16> @llvm.aarch64.neon.vaddv.v1i16.v8i16(<8 x i16> %a)
+ %0 = extractelement <1 x i16> %vaddv.i, i32 0
+ ret i16 %0
+}
+
+define i32 @test_vaddvq_u32(<4 x i32> %a) {
+; CHECK: test_vaddvq_u32:
+; CHECK: addv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %vaddv.i = tail call <1 x i32> @llvm.aarch64.neon.vaddv.v1i32.v4i32(<4 x i32> %a)
+ %0 = extractelement <1 x i32> %vaddv.i, i32 0
+ ret i32 %0
+}
+
+define float @test_vmaxvq_f32(<4 x float> %a) {
+; CHECK: test_vmaxvq_f32:
+; CHECK: fmaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %vmaxv.i = tail call <1 x float> @llvm.aarch64.neon.vmaxv.v1f32.v4f32(<4 x float> %a)
+ %0 = extractelement <1 x float> %vmaxv.i, i32 0
+ ret float %0
+}
+
+define float @test_vminvq_f32(<4 x float> %a) {
+; CHECK: test_vminvq_f32:
+; CHECK: fminv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %vminv.i = tail call <1 x float> @llvm.aarch64.neon.vminv.v1f32.v4f32(<4 x float> %a)
+ %0 = extractelement <1 x float> %vminv.i, i32 0
+ ret float %0
+}
+
+define float @test_vmaxnmvq_f32(<4 x float> %a) {
+; CHECK: test_vmaxnmvq_f32:
+; CHECK: fmaxnmv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %vmaxnmv.i = tail call <1 x float> @llvm.aarch64.neon.vmaxnmv.v1f32.v4f32(<4 x float> %a)
+ %0 = extractelement <1 x float> %vmaxnmv.i, i32 0
+ ret float %0
+}
+
+define float @test_vminnmvq_f32(<4 x float> %a) {
+; CHECK: test_vminnmvq_f32:
+; CHECK: fminnmv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %vminnmv.i = tail call <1 x float> @llvm.aarch64.neon.vminnmv.v1f32.v4f32(<4 x float> %a)
+ %0 = extractelement <1 x float> %vminnmv.i, i32 0
+ ret float %0
+}
+
diff --git a/test/CodeGen/AArch64/neon-add-sub.ll b/test/CodeGen/AArch64/neon-add-sub.ll
index 65ec8a2..078ba14 100644
--- a/test/CodeGen/AArch64/neon-add-sub.ll
+++ b/test/CodeGen/AArch64/neon-add-sub.ll
@@ -118,15 +118,120 @@ define <2 x double> @sub2xdouble(<2 x double> %A, <2 x double> %B) {
ret <2 x double> %tmp3
}
-define <1 x i64> @add1xi64(<1 x i64> %A, <1 x i64> %B) {
-;CHECK: add {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
- %tmp3 = add <1 x i64> %A, %B;
- ret <1 x i64> %tmp3
+define <1 x double> @test_vadd_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_vadd_f64
+; CHECK: fadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = fadd <1 x double> %a, %b
+ ret <1 x double> %1
}
-define <1 x i64> @sub1xi64(<1 x i64> %A, <1 x i64> %B) {
-;CHECK: sub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
- %tmp3 = sub <1 x i64> %A, %B;
- ret <1 x i64> %tmp3
+define <1 x double> @test_vmul_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_vmul_f64
+; CHECK: fmul d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = fmul <1 x double> %a, %b
+ ret <1 x double> %1
}
+define <1 x double> @test_vdiv_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_vdiv_f64
+; CHECK: fdiv d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = fdiv <1 x double> %a, %b
+ ret <1 x double> %1
+}
+
+define <1 x double> @test_vmla_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) {
+; CHECK-LABEL: test_vmla_f64
+; CHECK: fmul d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: fadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = fmul <1 x double> %b, %c
+ %2 = fadd <1 x double> %1, %a
+ ret <1 x double> %2
+}
+
+define <1 x double> @test_vmls_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) {
+; CHECK-LABEL: test_vmls_f64
+; CHECK: fmul d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+; CHECK: fsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = fmul <1 x double> %b, %c
+ %2 = fsub <1 x double> %a, %1
+ ret <1 x double> %2
+}
+
+define <1 x double> @test_vfms_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) {
+; CHECK-LABEL: test_vfms_f64
+; CHECK: fmsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = fsub <1 x double> <double -0.000000e+00>, %b
+ %2 = tail call <1 x double> @llvm.fma.v1f64(<1 x double> %1, <1 x double> %c, <1 x double> %a)
+ ret <1 x double> %2
+}
+
+define <1 x double> @test_vfma_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) {
+; CHECK-LABEL: test_vfma_f64
+; CHECK: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x double> @llvm.fma.v1f64(<1 x double> %b, <1 x double> %c, <1 x double> %a)
+ ret <1 x double> %1
+}
+
+define <1 x double> @test_vsub_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_vsub_f64
+; CHECK: fsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = fsub <1 x double> %a, %b
+ ret <1 x double> %1
+}
+
+define <1 x double> @test_vabd_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_vabd_f64
+; CHECK: fabd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x double> @llvm.arm.neon.vabds.v1f64(<1 x double> %a, <1 x double> %b)
+ ret <1 x double> %1
+}
+
+define <1 x double> @test_vmax_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_vmax_f64
+; CHECK: fmax d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x double> @llvm.arm.neon.vmaxs.v1f64(<1 x double> %a, <1 x double> %b)
+ ret <1 x double> %1
+}
+
+define <1 x double> @test_vmin_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_vmin_f64
+; CHECK: fmin d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x double> @llvm.arm.neon.vmins.v1f64(<1 x double> %a, <1 x double> %b)
+ ret <1 x double> %1
+}
+
+define <1 x double> @test_vmaxnm_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_vmaxnm_f64
+; CHECK: fmaxnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x double> @llvm.aarch64.neon.vmaxnm.v1f64(<1 x double> %a, <1 x double> %b)
+ ret <1 x double> %1
+}
+
+define <1 x double> @test_vminnm_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_vminnm_f64
+; CHECK: fminnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x double> @llvm.aarch64.neon.vminnm.v1f64(<1 x double> %a, <1 x double> %b)
+ ret <1 x double> %1
+}
+
+define <1 x double> @test_vabs_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vabs_f64
+; CHECK: fabs d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x double> @llvm.fabs.v1f64(<1 x double> %a)
+ ret <1 x double> %1
+}
+
+define <1 x double> @test_vneg_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vneg_f64
+; CHECK: fneg d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = fsub <1 x double> <double -0.000000e+00>, %a
+ ret <1 x double> %1
+}
+
+declare <1 x double> @llvm.fabs.v1f64(<1 x double>)
+declare <1 x double> @llvm.aarch64.neon.vminnm.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.aarch64.neon.vmaxnm.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.arm.neon.vmins.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.arm.neon.vmaxs.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.arm.neon.vabds.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.fma.v1f64(<1 x double>, <1 x double>, <1 x double>)
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-bsl.ll b/test/CodeGen/AArch64/neon-bsl.ll
new file mode 100644
index 0000000..6bd923d
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-bsl.ll
@@ -0,0 +1,222 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+declare <2 x double> @llvm.arm.neon.vbsl.v2f64(<2 x double>, <2 x double>, <2 x double>)
+
+declare <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
+
+declare <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float>, <4 x float>, <4 x float>)
+
+declare <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
+
+declare <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+
+declare <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
+
+declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>)
+
+declare <1 x double> @llvm.arm.neon.vbsl.v1f64(<1 x double>, <1 x double>, <1 x double>)
+
+declare <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float>, <2 x float>, <2 x float>)
+
+declare <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64>, <1 x i64>, <1 x i64>)
+
+declare <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
+
+define <8 x i8> @test_vbsl_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
+; CHECK-LABEL: test_vbsl_s8:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3)
+ ret <8 x i8> %vbsl.i
+}
+
+define <8 x i8> @test_vbsl_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) {
+; CHECK-LABEL: test_vbsl_s16:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3)
+ %0 = bitcast <4 x i16> %vbsl3.i to <8 x i8>
+ ret <8 x i8> %0
+}
+
+define <2 x i32> @test_vbsl_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
+; CHECK-LABEL: test_vbsl_s32:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3)
+ ret <2 x i32> %vbsl3.i
+}
+
+define <1 x i64> @test_vbsl_s64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) {
+; CHECK-LABEL: test_vbsl_s64:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3)
+ ret <1 x i64> %vbsl3.i
+}
+
+define <8 x i8> @test_vbsl_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
+; CHECK-LABEL: test_vbsl_u8:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3)
+ ret <8 x i8> %vbsl.i
+}
+
+define <4 x i16> @test_vbsl_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) {
+; CHECK-LABEL: test_vbsl_u16:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3)
+ ret <4 x i16> %vbsl3.i
+}
+
+define <2 x i32> @test_vbsl_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
+; CHECK-LABEL: test_vbsl_u32:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3)
+ ret <2 x i32> %vbsl3.i
+}
+
+define <1 x i64> @test_vbsl_u64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) {
+; CHECK-LABEL: test_vbsl_u64:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3)
+ ret <1 x i64> %vbsl3.i
+}
+
+define <2 x float> @test_vbsl_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) {
+; CHECK-LABEL: test_vbsl_f32:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vbsl3.i = tail call <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3)
+ ret <2 x float> %vbsl3.i
+}
+
+define <1 x double> @test_vbsl_f64(<1 x i64> %v1, <1 x double> %v2, <1 x double> %v3) {
+; CHECK-LABEL: test_vbsl_f64:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vbsl.i = bitcast <1 x i64> %v1 to <1 x double>
+ %vbsl3.i = tail call <1 x double> @llvm.arm.neon.vbsl.v1f64(<1 x double> %vbsl.i, <1 x double> %v2, <1 x double> %v3)
+ ret <1 x double> %vbsl3.i
+}
+
+define <8 x i8> @test_vbsl_p8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
+; CHECK-LABEL: test_vbsl_p8:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3)
+ ret <8 x i8> %vbsl.i
+}
+
+define <4 x i16> @test_vbsl_p16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) {
+; CHECK-LABEL: test_vbsl_p16:
+; CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3)
+ ret <4 x i16> %vbsl3.i
+}
+
+define <16 x i8> @test_vbslq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
+; CHECK-LABEL: test_vbslq_s8:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3)
+ ret <16 x i8> %vbsl.i
+}
+
+define <8 x i16> @test_vbslq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) {
+; CHECK-LABEL: test_vbslq_s16:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3)
+ ret <8 x i16> %vbsl3.i
+}
+
+define <4 x i32> @test_vbslq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
+; CHECK-LABEL: test_vbslq_s32:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3)
+ ret <4 x i32> %vbsl3.i
+}
+
+define <2 x i64> @test_vbslq_s64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) {
+; CHECK-LABEL: test_vbslq_s64:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3)
+ ret <2 x i64> %vbsl3.i
+}
+
+define <16 x i8> @test_vbslq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
+; CHECK-LABEL: test_vbslq_u8:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3)
+ ret <16 x i8> %vbsl.i
+}
+
+define <8 x i16> @test_vbslq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) {
+; CHECK-LABEL: test_vbslq_u16:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3)
+ ret <8 x i16> %vbsl3.i
+}
+
+define <4 x i32> @test_vbslq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
+; CHECK-LABEL: test_vbslq_u32:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3)
+ ret <4 x i32> %vbsl3.i
+}
+
+define <2 x i64> @test_vbslq_u64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) {
+; CHECK-LABEL: test_vbslq_u64:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3)
+ ret <2 x i64> %vbsl3.i
+}
+
+define <4 x float> @test_vbslq_f32(<4 x i32> %v1, <4 x float> %v2, <4 x float> %v3) {
+; CHECK-LABEL: test_vbslq_f32:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %vbsl.i = bitcast <4 x i32> %v1 to <4 x float>
+ %vbsl3.i = tail call <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float> %vbsl.i, <4 x float> %v2, <4 x float> %v3)
+ ret <4 x float> %vbsl3.i
+}
+
+define <16 x i8> @test_vbslq_p8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
+; CHECK-LABEL: test_vbslq_p8:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3)
+ ret <16 x i8> %vbsl.i
+}
+
+define <8 x i16> @test_vbslq_p16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) {
+; CHECK-LABEL: test_vbslq_p16:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3)
+ ret <8 x i16> %vbsl3.i
+}
+
+define <2 x double> @test_vbslq_f64(<2 x i64> %v1, <2 x double> %v2, <2 x double> %v3) {
+; CHECK-LABEL: test_vbslq_f64:
+; CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %vbsl.i = bitcast <2 x i64> %v1 to <2 x double>
+ %vbsl3.i = tail call <2 x double> @llvm.arm.neon.vbsl.v2f64(<2 x double> %vbsl.i, <2 x double> %v2, <2 x double> %v3)
+ ret <2 x double> %vbsl3.i
+}
+
diff --git a/test/CodeGen/AArch64/neon-compare-instructions.ll b/test/CodeGen/AArch64/neon-compare-instructions.ll
index 0848f9b..68f0342 100644
--- a/test/CodeGen/AArch64/neon-compare-instructions.ll
+++ b/test/CodeGen/AArch64/neon-compare-instructions.ll
@@ -51,8 +51,7 @@ define <2 x i64> @cmeq2xi64(<2 x i64> %A, <2 x i64> %B) {
define <8 x i8> @cmne8xi8(<8 x i8> %A, <8 x i8> %B) {
;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <8 x i8> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
@@ -60,8 +59,7 @@ define <8 x i8> @cmne8xi8(<8 x i8> %A, <8 x i8> %B) {
define <16 x i8> @cmne16xi8(<16 x i8> %A, <16 x i8> %B) {
;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <16 x i8> %A, %B;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
@@ -69,8 +67,7 @@ define <16 x i8> @cmne16xi8(<16 x i8> %A, <16 x i8> %B) {
define <4 x i16> @cmne4xi16(<4 x i16> %A, <4 x i16> %B) {
;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <4 x i16> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
@@ -78,8 +75,7 @@ define <4 x i16> @cmne4xi16(<4 x i16> %A, <4 x i16> %B) {
define <8 x i16> @cmne8xi16(<8 x i16> %A, <8 x i16> %B) {
;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <8 x i16> %A, %B;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
@@ -87,8 +83,7 @@ define <8 x i16> @cmne8xi16(<8 x i16> %A, <8 x i16> %B) {
define <2 x i32> @cmne2xi32(<2 x i32> %A, <2 x i32> %B) {
;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <2 x i32> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
@@ -96,8 +91,7 @@ define <2 x i32> @cmne2xi32(<2 x i32> %A, <2 x i32> %B) {
define <4 x i32> @cmne4xi32(<4 x i32> %A, <4 x i32> %B) {
;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <4 x i32> %A, %B;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
@@ -105,8 +99,7 @@ define <4 x i32> @cmne4xi32(<4 x i32> %A, <4 x i32> %B) {
define <2 x i64> @cmne2xi64(<2 x i64> %A, <2 x i64> %B) {
;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <2 x i64> %A, %B;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -867,8 +860,7 @@ define <2 x i64> @cmltz2xi64(<2 x i64> %A) {
define <8 x i8> @cmneqz8xi8(<8 x i8> %A) {
;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <8 x i8> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
ret <8 x i8> %tmp4
@@ -876,8 +868,7 @@ define <8 x i8> @cmneqz8xi8(<8 x i8> %A) {
define <16 x i8> @cmneqz16xi8(<16 x i8> %A) {
;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <16 x i8> %A, zeroinitializer;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
ret <16 x i8> %tmp4
@@ -885,8 +876,7 @@ define <16 x i8> @cmneqz16xi8(<16 x i8> %A) {
define <4 x i16> @cmneqz4xi16(<4 x i16> %A) {
;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <4 x i16> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
ret <4 x i16> %tmp4
@@ -894,8 +884,7 @@ define <4 x i16> @cmneqz4xi16(<4 x i16> %A) {
define <8 x i16> @cmneqz8xi16(<8 x i16> %A) {
;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <8 x i16> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
ret <8 x i16> %tmp4
@@ -903,8 +892,7 @@ define <8 x i16> @cmneqz8xi16(<8 x i16> %A) {
define <2 x i32> @cmneqz2xi32(<2 x i32> %A) {
;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = icmp ne <2 x i32> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
@@ -912,8 +900,7 @@ define <2 x i32> @cmneqz2xi32(<2 x i32> %A) {
define <4 x i32> @cmneqz4xi32(<4 x i32> %A) {
;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <4 x i32> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
@@ -921,8 +908,7 @@ define <4 x i32> @cmneqz4xi32(<4 x i32> %A) {
define <2 x i64> @cmneqz2xi64(<2 x i64> %A) {
;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = icmp ne <2 x i64> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1369,8 +1355,7 @@ define <2 x i32> @fcmuno2xfloat(<2 x float> %A, <2 x float> %B) {
;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp uno <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
@@ -1382,8 +1367,7 @@ define <4 x i32> @fcmuno4xfloat(<4 x float> %A, <4 x float> %B) {
;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s
;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uno <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
@@ -1395,8 +1379,7 @@ define <2 x i64> @fcmuno2xdouble(<2 x double> %A, <2 x double> %B) {
;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d
;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uno <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1408,8 +1391,7 @@ define <2 x i32> @fcmueq2xfloat(<2 x float> %A, <2 x float> %B) {
;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ueq <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
@@ -1421,8 +1403,7 @@ define <4 x i32> @fcmueq4xfloat(<4 x float> %A, <4 x float> %B) {
;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ueq <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
@@ -1434,8 +1415,7 @@ define <2 x i64> @fcmueq2xdouble(<2 x double> %A, <2 x double> %B) {
;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ueq <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1445,8 +1425,7 @@ define <2 x i32> @fcmuge2xfloat(<2 x float> %A, <2 x float> %B) {
; Using registers other than v0, v1 are possible, but would be odd.
; UGE = ULE with swapped operands, ULE implemented as !OGT.
;CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp uge <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
@@ -1456,8 +1435,7 @@ define <4 x i32> @fcmuge4xfloat(<4 x float> %A, <4 x float> %B) {
; Using registers other than v0, v1 are possible, but would be odd.
; UGE = ULE with swapped operands, ULE implemented as !OGT.
;CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uge <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
@@ -1467,8 +1445,7 @@ define <2 x i64> @fcmuge2xdouble(<2 x double> %A, <2 x double> %B) {
; Using registers other than v0, v1 are possible, but would be odd.
; UGE = ULE with swapped operands, ULE implemented as !OGT.
;CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uge <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1478,8 +1455,7 @@ define <2 x i32> @fcmugt2xfloat(<2 x float> %A, <2 x float> %B) {
; Using registers other than v0, v1 are possible, but would be odd.
; UGT = ULT with swapped operands, ULT implemented as !OGE.
;CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ugt <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
@@ -1489,16 +1465,14 @@ define <4 x i32> @fcmugt4xfloat(<4 x float> %A, <4 x float> %B) {
; Using registers other than v0, v1 are possible, but would be odd.
; UGT = ULT with swapped operands, ULT implemented as !OGE.
;CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ugt <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
}
define <2 x i64> @fcmugt2xdouble(<2 x double> %A, <2 x double> %B) {
;CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ugt <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1508,8 +1482,7 @@ define <2 x i32> @fcmule2xfloat(<2 x float> %A, <2 x float> %B) {
; Using registers other than v0, v1 are possible, but would be odd.
; ULE implemented as !OGT.
;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ule <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
@@ -1519,8 +1492,7 @@ define <4 x i32> @fcmule4xfloat(<4 x float> %A, <4 x float> %B) {
; Using registers other than v0, v1 are possible, but would be odd.
; ULE implemented as !OGT.
;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ule <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
@@ -1529,8 +1501,7 @@ define <2 x i64> @fcmule2xdouble(<2 x double> %A, <2 x double> %B) {
; Using registers other than v0, v1 are possible, but would be odd.
; ULE implemented as !OGT.
;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ule <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1540,8 +1511,7 @@ define <2 x i32> @fcmult2xfloat(<2 x float> %A, <2 x float> %B) {
; Using registers other than v0, v1 are possible, but would be odd.
; ULT implemented as !OGE.
;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ult <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
@@ -1551,8 +1521,7 @@ define <4 x i32> @fcmult4xfloat(<4 x float> %A, <4 x float> %B) {
; Using registers other than v0, v1 are possible, but would be odd.
; ULT implemented as !OGE.
;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ult <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
@@ -1561,8 +1530,7 @@ define <2 x i64> @fcmult2xdouble(<2 x double> %A, <2 x double> %B) {
; Using registers other than v0, v1 are possible, but would be odd.
; ULT implemented as !OGE.
;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ult <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1572,8 +1540,7 @@ define <2 x i32> @fcmune2xfloat(<2 x float> %A, <2 x float> %B) {
; Using registers other than v0, v1 are possible, but would be odd.
; UNE = !OEQ.
;CHECK: fcmeq {{v[0-9]+}}.2s, v0.2s, v1.2s
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp une <2 x float> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
@@ -1583,8 +1550,7 @@ define <4 x i32> @fcmune4xfloat(<4 x float> %A, <4 x float> %B) {
; Using registers other than v0, v1 is possible, but would be odd.
; UNE = !OEQ.
;CHECK: fcmeq {{v[0-9]+}}.4s, v0.4s, v1.4s
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp une <4 x float> %A, %B
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
@@ -1593,8 +1559,7 @@ define <2 x i64> @fcmune2xdouble(<2 x double> %A, <2 x double> %B) {
; Using registers other than v0, v1 is possible, but would be odd.
; UNE = !OEQ.
;CHECK: fcmeq {{v[0-9]+}}.2d, v0.2d, v1.2d
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp une <2 x double> %A, %B
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1766,8 +1731,7 @@ define <2 x i32> @fcmueqz2xfloat(<2 x float> %A) {
;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ueq <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
@@ -1778,8 +1742,7 @@ define <4 x i32> @fcmueqz4xfloat(<4 x float> %A) {
;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ueq <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
@@ -1790,8 +1753,7 @@ define <2 x i64> @fcmueqz2xdouble(<2 x double> %A) {
;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ueq <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1800,8 +1762,7 @@ define <2 x i64> @fcmueqz2xdouble(<2 x double> %A) {
define <2 x i32> @fcmugez2xfloat(<2 x float> %A) {
; UGE with zero = !OLT
;CHECK: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp uge <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
@@ -1810,8 +1771,7 @@ define <2 x i32> @fcmugez2xfloat(<2 x float> %A) {
define <4 x i32> @fcmugez4xfloat(<4 x float> %A) {
; UGE with zero = !OLT
;CHECK: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uge <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
@@ -1819,8 +1779,7 @@ define <4 x i32> @fcmugez4xfloat(<4 x float> %A) {
define <2 x i64> @fcmugez2xdouble(<2 x double> %A) {
; UGE with zero = !OLT
;CHECK: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uge <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1829,8 +1788,7 @@ define <2 x i64> @fcmugez2xdouble(<2 x double> %A) {
define <2 x i32> @fcmugtz2xfloat(<2 x float> %A) {
; UGT with zero = !OLE
;CHECK: fcmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ugt <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
@@ -1839,8 +1797,7 @@ define <2 x i32> @fcmugtz2xfloat(<2 x float> %A) {
define <4 x i32> @fcmugtz4xfloat(<4 x float> %A) {
; UGT with zero = !OLE
;CHECK: fcmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ugt <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
@@ -1848,8 +1805,7 @@ define <4 x i32> @fcmugtz4xfloat(<4 x float> %A) {
define <2 x i64> @fcmugtz2xdouble(<2 x double> %A) {
; UGT with zero = !OLE
;CHECK: fcmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ugt <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1858,8 +1814,7 @@ define <2 x i64> @fcmugtz2xdouble(<2 x double> %A) {
define <2 x i32> @fcmultz2xfloat(<2 x float> %A) {
; ULT with zero = !OGE
;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ult <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
@@ -1867,8 +1822,7 @@ define <2 x i32> @fcmultz2xfloat(<2 x float> %A) {
define <4 x i32> @fcmultz4xfloat(<4 x float> %A) {
;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ult <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
@@ -1876,8 +1830,7 @@ define <4 x i32> @fcmultz4xfloat(<4 x float> %A) {
define <2 x i64> @fcmultz2xdouble(<2 x double> %A) {
;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ult <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1887,8 +1840,7 @@ define <2 x i64> @fcmultz2xdouble(<2 x double> %A) {
define <2 x i32> @fcmulez2xfloat(<2 x float> %A) {
; ULE with zero = !OGT
;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp ule <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
@@ -1897,8 +1849,7 @@ define <2 x i32> @fcmulez2xfloat(<2 x float> %A) {
define <4 x i32> @fcmulez4xfloat(<4 x float> %A) {
; ULE with zero = !OGT
;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ule <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
@@ -1907,8 +1858,7 @@ define <4 x i32> @fcmulez4xfloat(<4 x float> %A) {
define <2 x i64> @fcmulez2xdouble(<2 x double> %A) {
; ULE with zero = !OGT
;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp ule <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1917,8 +1867,7 @@ define <2 x i64> @fcmulez2xdouble(<2 x double> %A) {
define <2 x i32> @fcmunez2xfloat(<2 x float> %A) {
; UNE with zero = !OEQ with zero
;CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp une <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
@@ -1927,8 +1876,7 @@ define <2 x i32> @fcmunez2xfloat(<2 x float> %A) {
define <4 x i32> @fcmunez4xfloat(<4 x float> %A) {
; UNE with zero = !OEQ with zero
;CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp une <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
@@ -1936,8 +1884,7 @@ define <4 x i32> @fcmunez4xfloat(<4 x float> %A) {
define <2 x i64> @fcmunez2xdouble(<2 x double> %A) {
; UNE with zero = !OEQ with zero
;CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp une <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
@@ -1949,8 +1896,7 @@ define <2 x i32> @fcmunoz2xfloat(<2 x float> %A) {
;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: not {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
%tmp3 = fcmp uno <2 x float> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
ret <2 x i32> %tmp4
@@ -1961,8 +1907,7 @@ define <4 x i32> @fcmunoz4xfloat(<4 x float> %A) {
;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uno <4 x float> %A, zeroinitializer
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
ret <4 x i32> %tmp4
@@ -1973,8 +1918,7 @@ define <2 x i64> @fcmunoz2xdouble(<2 x double> %A) {
;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
-;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: not {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
%tmp3 = fcmp uno <2 x double> %A, zeroinitializer
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
ret <2 x i64> %tmp4
diff --git a/test/CodeGen/AArch64/neon-copy.ll b/test/CodeGen/AArch64/neon-copy.ll
new file mode 100644
index 0000000..e18530e
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-copy.ll
@@ -0,0 +1,615 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+
+
+define <16 x i8> @ins16bw(<16 x i8> %tmp1, i8 %tmp2) {
+;CHECK: ins {{v[0-9]+}}.b[15], {{w[0-9]+}}
+ %tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 15
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @ins8hw(<8 x i16> %tmp1, i16 %tmp2) {
+;CHECK: ins {{v[0-9]+}}.h[6], {{w[0-9]+}}
+ %tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 6
+ ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @ins4sw(<4 x i32> %tmp1, i32 %tmp2) {
+;CHECK: ins {{v[0-9]+}}.s[2], {{w[0-9]+}}
+ %tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 2
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @ins2dw(<2 x i64> %tmp1, i64 %tmp2) {
+;CHECK: ins {{v[0-9]+}}.d[1], {{x[0-9]+}}
+ %tmp3 = insertelement <2 x i64> %tmp1, i64 %tmp2, i32 1
+ ret <2 x i64> %tmp3
+}
+
+define <8 x i8> @ins8bw(<8 x i8> %tmp1, i8 %tmp2) {
+;CHECK: ins {{v[0-9]+}}.b[5], {{w[0-9]+}}
+ %tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 5
+ ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @ins4hw(<4 x i16> %tmp1, i16 %tmp2) {
+;CHECK: ins {{v[0-9]+}}.h[3], {{w[0-9]+}}
+ %tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 3
+ ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @ins2sw(<2 x i32> %tmp1, i32 %tmp2) {
+;CHECK: ins {{v[0-9]+}}.s[1], {{w[0-9]+}}
+ %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
+ ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @ins16b16(<16 x i8> %tmp1, <16 x i8> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.b[15], {{v[0-9]+}}.b[2]
+ %tmp3 = extractelement <16 x i8> %tmp1, i32 2
+ %tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @ins8h8(<8 x i16> %tmp1, <8 x i16> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.h[7], {{v[0-9]+}}.h[2]
+ %tmp3 = extractelement <8 x i16> %tmp1, i32 2
+ %tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @ins4s4(<4 x i32> %tmp1, <4 x i32> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
+ %tmp3 = extractelement <4 x i32> %tmp1, i32 2
+ %tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @ins2d2(<2 x i64> %tmp1, <2 x i64> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+ %tmp3 = extractelement <2 x i64> %tmp1, i32 0
+ %tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1
+ ret <2 x i64> %tmp4
+}
+
+define <4 x float> @ins4f4(<4 x float> %tmp1, <4 x float> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
+ %tmp3 = extractelement <4 x float> %tmp1, i32 2
+ %tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1
+ ret <4 x float> %tmp4
+}
+
+define <2 x double> @ins2df2(<2 x double> %tmp1, <2 x double> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+ %tmp3 = extractelement <2 x double> %tmp1, i32 0
+ %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
+ ret <2 x double> %tmp4
+}
+
+define <16 x i8> @ins8b16(<8 x i8> %tmp1, <16 x i8> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.b[15], {{v[0-9]+}}.b[2]
+ %tmp3 = extractelement <8 x i8> %tmp1, i32 2
+ %tmp4 = insertelement <16 x i8> %tmp2, i8 %tmp3, i32 15
+ ret <16 x i8> %tmp4
+}
+
+define <8 x i16> @ins4h8(<4 x i16> %tmp1, <8 x i16> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.h[7], {{v[0-9]+}}.h[2]
+ %tmp3 = extractelement <4 x i16> %tmp1, i32 2
+ %tmp4 = insertelement <8 x i16> %tmp2, i16 %tmp3, i32 7
+ ret <8 x i16> %tmp4
+}
+
+define <4 x i32> @ins2s4(<2 x i32> %tmp1, <4 x i32> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[1]
+ %tmp3 = extractelement <2 x i32> %tmp1, i32 1
+ %tmp4 = insertelement <4 x i32> %tmp2, i32 %tmp3, i32 1
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @ins1d2(<1 x i64> %tmp1, <2 x i64> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+ %tmp3 = extractelement <1 x i64> %tmp1, i32 0
+ %tmp4 = insertelement <2 x i64> %tmp2, i64 %tmp3, i32 1
+ ret <2 x i64> %tmp4
+}
+
+define <4 x float> @ins2f4(<2 x float> %tmp1, <4 x float> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[1]
+ %tmp3 = extractelement <2 x float> %tmp1, i32 1
+ %tmp4 = insertelement <4 x float> %tmp2, float %tmp3, i32 1
+ ret <4 x float> %tmp4
+}
+
+define <2 x double> @ins1f2(<1 x double> %tmp1, <2 x double> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+ %tmp3 = extractelement <1 x double> %tmp1, i32 0
+ %tmp4 = insertelement <2 x double> %tmp2, double %tmp3, i32 1
+ ret <2 x double> %tmp4
+}
+
+define <8 x i8> @ins16b8(<16 x i8> %tmp1, <8 x i8> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.b[7], {{v[0-9]+}}.b[2]
+ %tmp3 = extractelement <16 x i8> %tmp1, i32 2
+ %tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 7
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @ins8h4(<8 x i16> %tmp1, <4 x i16> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.h[3], {{v[0-9]+}}.h[2]
+ %tmp3 = extractelement <8 x i16> %tmp1, i32 2
+ %tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @ins4s2(<4 x i32> %tmp1, <2 x i32> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
+ %tmp3 = extractelement <4 x i32> %tmp1, i32 2
+ %tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1
+ ret <2 x i32> %tmp4
+}
+
+define <1 x i64> @ins2d1(<2 x i64> %tmp1, <1 x i64> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0]
+ %tmp3 = extractelement <2 x i64> %tmp1, i32 0
+ %tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0
+ ret <1 x i64> %tmp4
+}
+
+define <2 x float> @ins4f2(<4 x float> %tmp1, <2 x float> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[2]
+ %tmp3 = extractelement <4 x float> %tmp1, i32 2
+ %tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1
+ ret <2 x float> %tmp4
+}
+
+define <1 x double> @ins2f1(<2 x double> %tmp1, <1 x double> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0]
+ %tmp3 = extractelement <2 x double> %tmp1, i32 0
+ %tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0
+ ret <1 x double> %tmp4
+}
+
+define <8 x i8> @ins8b8(<8 x i8> %tmp1, <8 x i8> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.b[4], {{v[0-9]+}}.b[2]
+ %tmp3 = extractelement <8 x i8> %tmp1, i32 2
+ %tmp4 = insertelement <8 x i8> %tmp2, i8 %tmp3, i32 4
+ ret <8 x i8> %tmp4
+}
+
+define <4 x i16> @ins4h4(<4 x i16> %tmp1, <4 x i16> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.h[3], {{v[0-9]+}}.h[2]
+ %tmp3 = extractelement <4 x i16> %tmp1, i32 2
+ %tmp4 = insertelement <4 x i16> %tmp2, i16 %tmp3, i32 3
+ ret <4 x i16> %tmp4
+}
+
+define <2 x i32> @ins2s2(<2 x i32> %tmp1, <2 x i32> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+ %tmp3 = extractelement <2 x i32> %tmp1, i32 0
+ %tmp4 = insertelement <2 x i32> %tmp2, i32 %tmp3, i32 1
+ ret <2 x i32> %tmp4
+}
+
+define <1 x i64> @ins1d1(<1 x i64> %tmp1, <1 x i64> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0]
+ %tmp3 = extractelement <1 x i64> %tmp1, i32 0
+ %tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0
+ ret <1 x i64> %tmp4
+}
+
+define <2 x float> @ins2f2(<2 x float> %tmp1, <2 x float> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+ %tmp3 = extractelement <2 x float> %tmp1, i32 0
+ %tmp4 = insertelement <2 x float> %tmp2, float %tmp3, i32 1
+ ret <2 x float> %tmp4
+}
+
+define <1 x double> @ins1df1(<1 x double> %tmp1, <1 x double> %tmp2) {
+;CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[0]
+ %tmp3 = extractelement <1 x double> %tmp1, i32 0
+ %tmp4 = insertelement <1 x double> %tmp2, double %tmp3, i32 0
+ ret <1 x double> %tmp4
+}
+
+define i32 @umovw16b(<16 x i8> %tmp1) {
+;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.b[8]
+ %tmp3 = extractelement <16 x i8> %tmp1, i32 8
+ %tmp4 = zext i8 %tmp3 to i32
+ ret i32 %tmp4
+}
+
+define i32 @umovw8h(<8 x i16> %tmp1) {
+;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
+ %tmp3 = extractelement <8 x i16> %tmp1, i32 2
+ %tmp4 = zext i16 %tmp3 to i32
+ ret i32 %tmp4
+}
+
+define i32 @umovw4s(<4 x i32> %tmp1) {
+;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.s[2]
+ %tmp3 = extractelement <4 x i32> %tmp1, i32 2
+ ret i32 %tmp3
+}
+
+define i64 @umovx2d(<2 x i64> %tmp1) {
+;CHECK: umov {{x[0-9]+}}, {{v[0-9]+}}.d[0]
+ %tmp3 = extractelement <2 x i64> %tmp1, i32 0
+ ret i64 %tmp3
+}
+
+define i32 @umovw8b(<8 x i8> %tmp1) {
+;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.b[7]
+ %tmp3 = extractelement <8 x i8> %tmp1, i32 7
+ %tmp4 = zext i8 %tmp3 to i32
+ ret i32 %tmp4
+}
+
+define i32 @umovw4h(<4 x i16> %tmp1) {
+;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
+ %tmp3 = extractelement <4 x i16> %tmp1, i32 2
+ %tmp4 = zext i16 %tmp3 to i32
+ ret i32 %tmp4
+}
+
+define i32 @umovw2s(<2 x i32> %tmp1) {
+;CHECK: umov {{w[0-9]+}}, {{v[0-9]+}}.s[1]
+ %tmp3 = extractelement <2 x i32> %tmp1, i32 1
+ ret i32 %tmp3
+}
+
+define i64 @umovx1d(<1 x i64> %tmp1) {
+;CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
+ %tmp3 = extractelement <1 x i64> %tmp1, i32 0
+ ret i64 %tmp3
+}
+
+define i32 @smovw16b(<16 x i8> %tmp1) {
+;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.b[8]
+ %tmp3 = extractelement <16 x i8> %tmp1, i32 8
+ %tmp4 = sext i8 %tmp3 to i32
+ %tmp5 = add i32 5, %tmp4
+ ret i32 %tmp5
+}
+
+define i32 @smovw8h(<8 x i16> %tmp1) {
+;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
+ %tmp3 = extractelement <8 x i16> %tmp1, i32 2
+ %tmp4 = sext i16 %tmp3 to i32
+ %tmp5 = add i32 5, %tmp4
+ ret i32 %tmp5
+}
+
+define i32 @smovx16b(<16 x i8> %tmp1) {
+;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.b[8]
+ %tmp3 = extractelement <16 x i8> %tmp1, i32 8
+ %tmp4 = sext i8 %tmp3 to i32
+ ret i32 %tmp4
+}
+
+define i32 @smovx8h(<8 x i16> %tmp1) {
+;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.h[2]
+ %tmp3 = extractelement <8 x i16> %tmp1, i32 2
+ %tmp4 = sext i16 %tmp3 to i32
+ ret i32 %tmp4
+}
+
+define i64 @smovx4s(<4 x i32> %tmp1) {
+;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.s[2]
+ %tmp3 = extractelement <4 x i32> %tmp1, i32 2
+ %tmp4 = sext i32 %tmp3 to i64
+ ret i64 %tmp4
+}
+
+define i32 @smovw8b(<8 x i8> %tmp1) {
+;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.b[4]
+ %tmp3 = extractelement <8 x i8> %tmp1, i32 4
+ %tmp4 = sext i8 %tmp3 to i32
+ %tmp5 = add i32 5, %tmp4
+ ret i32 %tmp5
+}
+
+define i32 @smovw4h(<4 x i16> %tmp1) {
+;CHECK: smov {{w[0-9]+}}, {{v[0-9]+}}.h[2]
+ %tmp3 = extractelement <4 x i16> %tmp1, i32 2
+ %tmp4 = sext i16 %tmp3 to i32
+ %tmp5 = add i32 5, %tmp4
+ ret i32 %tmp5
+}
+
+define i32 @smovx8b(<8 x i8> %tmp1) {
+;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.b[6]
+ %tmp3 = extractelement <8 x i8> %tmp1, i32 6
+ %tmp4 = sext i8 %tmp3 to i32
+ ret i32 %tmp4
+}
+
+define i32 @smovx4h(<4 x i16> %tmp1) {
+;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.h[2]
+ %tmp3 = extractelement <4 x i16> %tmp1, i32 2
+ %tmp4 = sext i16 %tmp3 to i32
+ ret i32 %tmp4
+}
+
+define i64 @smovx2s(<2 x i32> %tmp1) {
+;CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.s[1]
+ %tmp3 = extractelement <2 x i32> %tmp1, i32 1
+ %tmp4 = sext i32 %tmp3 to i64
+ ret i64 %tmp4
+}
+
+define <8 x i8> @test_vcopy_lane_s8(<8 x i8> %v1, <8 x i8> %v2) {
+;CHECK: ins {{v[0-9]+}}.b[5], {{v[0-9]+}}.b[3]
+ %vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 11, i32 6, i32 7>
+ ret <8 x i8> %vset_lane
+}
+
+define <16 x i8> @test_vcopyq_laneq_s8(<16 x i8> %v1, <16 x i8> %v2) {
+;CHECK: ins {{v[0-9]+}}.b[14], {{v[0-9]+}}.b[6]
+ %vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 22, i32 15>
+ ret <16 x i8> %vset_lane
+}
+
+define <8 x i8> @test_vcopy_lane_swap_s8(<8 x i8> %v1, <8 x i8> %v2) {
+;CHECK: ins {{v[0-9]+}}.b[7], {{v[0-9]+}}.b[0]
+ %vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 0>
+ ret <8 x i8> %vset_lane
+}
+
+define <16 x i8> @test_vcopyq_laneq_swap_s8(<16 x i8> %v1, <16 x i8> %v2) {
+;CHECK: ins {{v[0-9]+}}.b[0], {{v[0-9]+}}.b[15]
+ %vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 15, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i8> %vset_lane
+}
+
+define <8 x i8> @test_vdup_n_u8(i8 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}}
+ %vecinit.i = insertelement <8 x i8> undef, i8 %v1, i32 0
+ %vecinit1.i = insertelement <8 x i8> %vecinit.i, i8 %v1, i32 1
+ %vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 %v1, i32 2
+ %vecinit3.i = insertelement <8 x i8> %vecinit2.i, i8 %v1, i32 3
+ %vecinit4.i = insertelement <8 x i8> %vecinit3.i, i8 %v1, i32 4
+ %vecinit5.i = insertelement <8 x i8> %vecinit4.i, i8 %v1, i32 5
+ %vecinit6.i = insertelement <8 x i8> %vecinit5.i, i8 %v1, i32 6
+ %vecinit7.i = insertelement <8 x i8> %vecinit6.i, i8 %v1, i32 7
+ ret <8 x i8> %vecinit7.i
+}
+
+define <4 x i16> @test_vdup_n_u16(i16 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}}
+ %vecinit.i = insertelement <4 x i16> undef, i16 %v1, i32 0
+ %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %v1, i32 1
+ %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %v1, i32 2
+ %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %v1, i32 3
+ ret <4 x i16> %vecinit3.i
+}
+
+define <2 x i32> @test_vdup_n_u32(i32 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2s, {{w[0-9]+}}
+ %vecinit.i = insertelement <2 x i32> undef, i32 %v1, i32 0
+ %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %v1, i32 1
+ ret <2 x i32> %vecinit1.i
+}
+
+define <1 x i64> @test_vdup_n_u64(i64 %v1) #0 {
+;CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
+ %vecinit.i = insertelement <1 x i64> undef, i64 %v1, i32 0
+ ret <1 x i64> %vecinit.i
+}
+
+define <16 x i8> @test_vdupq_n_u8(i8 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.16b, {{w[0-9]+}}
+ %vecinit.i = insertelement <16 x i8> undef, i8 %v1, i32 0
+ %vecinit1.i = insertelement <16 x i8> %vecinit.i, i8 %v1, i32 1
+ %vecinit2.i = insertelement <16 x i8> %vecinit1.i, i8 %v1, i32 2
+ %vecinit3.i = insertelement <16 x i8> %vecinit2.i, i8 %v1, i32 3
+ %vecinit4.i = insertelement <16 x i8> %vecinit3.i, i8 %v1, i32 4
+ %vecinit5.i = insertelement <16 x i8> %vecinit4.i, i8 %v1, i32 5
+ %vecinit6.i = insertelement <16 x i8> %vecinit5.i, i8 %v1, i32 6
+ %vecinit7.i = insertelement <16 x i8> %vecinit6.i, i8 %v1, i32 7
+ %vecinit8.i = insertelement <16 x i8> %vecinit7.i, i8 %v1, i32 8
+ %vecinit9.i = insertelement <16 x i8> %vecinit8.i, i8 %v1, i32 9
+ %vecinit10.i = insertelement <16 x i8> %vecinit9.i, i8 %v1, i32 10
+ %vecinit11.i = insertelement <16 x i8> %vecinit10.i, i8 %v1, i32 11
+ %vecinit12.i = insertelement <16 x i8> %vecinit11.i, i8 %v1, i32 12
+ %vecinit13.i = insertelement <16 x i8> %vecinit12.i, i8 %v1, i32 13
+ %vecinit14.i = insertelement <16 x i8> %vecinit13.i, i8 %v1, i32 14
+ %vecinit15.i = insertelement <16 x i8> %vecinit14.i, i8 %v1, i32 15
+ ret <16 x i8> %vecinit15.i
+}
+
+define <8 x i16> @test_vdupq_n_u16(i16 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
+ %vecinit.i = insertelement <8 x i16> undef, i16 %v1, i32 0
+ %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %v1, i32 1
+ %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %v1, i32 2
+ %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %v1, i32 3
+ %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %v1, i32 4
+ %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %v1, i32 5
+ %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %v1, i32 6
+ %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %v1, i32 7
+ ret <8 x i16> %vecinit7.i
+}
+
+define <4 x i32> @test_vdupq_n_u32(i32 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}}
+ %vecinit.i = insertelement <4 x i32> undef, i32 %v1, i32 0
+ %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %v1, i32 1
+ %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %v1, i32 2
+ %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %v1, i32 3
+ ret <4 x i32> %vecinit3.i
+}
+
+define <2 x i64> @test_vdupq_n_u64(i64 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}}
+ %vecinit.i = insertelement <2 x i64> undef, i64 %v1, i32 0
+ %vecinit1.i = insertelement <2 x i64> %vecinit.i, i64 %v1, i32 1
+ ret <2 x i64> %vecinit1.i
+}
+
+define <8 x i8> @test_vdup_lane_s8(<8 x i8> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5]
+ %shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ ret <8 x i8> %shuffle
+}
+
+define <4 x i16> @test_vdup_lane_s16(<4 x i16> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2]
+ %shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+ ret <4 x i16> %shuffle
+}
+
+define <2 x i32> @test_vdup_lane_s32(<2 x i32> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+ %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ ret <2 x i32> %shuffle
+}
+
+define <16 x i8> @test_vdupq_lane_s8(<8 x i8> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5]
+ %shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ ret <16 x i8> %shuffle
+}
+
+define <8 x i16> @test_vdupq_lane_s16(<4 x i16> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2]
+ %shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ ret <8 x i16> %shuffle
+}
+
+define <4 x i32> @test_vdupq_lane_s32(<2 x i32> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+ %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %shuffle
+}
+
+define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+ %shuffle = shufflevector <1 x i64> %v1, <1 x i64> undef, <2 x i32> zeroinitializer
+ ret <2 x i64> %shuffle
+}
+
+define <8 x i8> @test_vdup_laneq_s8(<16 x i8> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5]
+ %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ ret <8 x i8> %shuffle
+}
+
+define <4 x i16> @test_vdup_laneq_s16(<8 x i16> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2]
+ %shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+ ret <4 x i16> %shuffle
+}
+
+define <2 x i32> @test_vdup_laneq_s32(<4 x i32> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+ %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
+ ret <2 x i32> %shuffle
+}
+
+define <16 x i8> @test_vdupq_laneq_s8(<16 x i8> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5]
+ %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ ret <16 x i8> %shuffle
+}
+
+define <8 x i16> @test_vdupq_laneq_s16(<8 x i16> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2]
+ %shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ ret <8 x i16> %shuffle
+}
+
+define <4 x i32> @test_vdupq_laneq_s32(<4 x i32> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+ %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %shuffle
+}
+
+define <2 x i64> @test_vdupq_laneq_s64(<2 x i64> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+ %shuffle = shufflevector <2 x i64> %v1, <2 x i64> undef, <2 x i32> zeroinitializer
+ ret <2 x i64> %shuffle
+}
+
+define i64 @test_bitcastv8i8toi64(<8 x i8> %in) {
+; CHECK-LABEL: test_bitcastv8i8toi64:
+ %res = bitcast <8 x i8> %in to i64
+; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
+ ret i64 %res
+}
+
+define i64 @test_bitcastv4i16toi64(<4 x i16> %in) {
+; CHECK-LABEL: test_bitcastv4i16toi64:
+ %res = bitcast <4 x i16> %in to i64
+; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
+ ret i64 %res
+}
+
+define i64 @test_bitcastv2i32toi64(<2 x i32> %in) {
+; CHECK-LABEL: test_bitcastv2i32toi64:
+ %res = bitcast <2 x i32> %in to i64
+; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
+ ret i64 %res
+}
+
+define i64 @test_bitcastv2f32toi64(<2 x float> %in) {
+; CHECK-LABEL: test_bitcastv2f32toi64:
+ %res = bitcast <2 x float> %in to i64
+; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
+ ret i64 %res
+}
+
+define i64 @test_bitcastv1i64toi64(<1 x i64> %in) {
+; CHECK-LABEL: test_bitcastv1i64toi64:
+ %res = bitcast <1 x i64> %in to i64
+; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
+ ret i64 %res
+}
+
+define i64 @test_bitcastv1f64toi64(<1 x double> %in) {
+; CHECK-LABEL: test_bitcastv1f64toi64:
+ %res = bitcast <1 x double> %in to i64
+; CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
+ ret i64 %res
+}
+
+define <8 x i8> @test_bitcasti64tov8i8(i64 %in) {
+; CHECK-LABEL: test_bitcasti64tov8i8:
+ %res = bitcast i64 %in to <8 x i8>
+; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
+ ret <8 x i8> %res
+}
+
+define <4 x i16> @test_bitcasti64tov4i16(i64 %in) {
+; CHECK-LABEL: test_bitcasti64tov4i16:
+ %res = bitcast i64 %in to <4 x i16>
+; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
+ ret <4 x i16> %res
+}
+
+define <2 x i32> @test_bitcasti64tov2i32(i64 %in) {
+; CHECK-LABEL: test_bitcasti64tov2i32:
+ %res = bitcast i64 %in to <2 x i32>
+; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
+ ret <2 x i32> %res
+}
+
+define <2 x float> @test_bitcasti64tov2f32(i64 %in) {
+; CHECK-LABEL: test_bitcasti64tov2f32:
+ %res = bitcast i64 %in to <2 x float>
+; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
+ ret <2 x float> %res
+}
+
+define <1 x i64> @test_bitcasti64tov1i64(i64 %in) {
+; CHECK-LABEL: test_bitcasti64tov1i64:
+ %res = bitcast i64 %in to <1 x i64>
+; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
+ ret <1 x i64> %res
+}
+
+define <1 x double> @test_bitcasti64tov1f64(i64 %in) {
+; CHECK-LABEL: test_bitcasti64tov1f64:
+ %res = bitcast i64 %in to <1 x double>
+; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
+ ret <1 x double> %res
+} \ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-crypto.ll b/test/CodeGen/AArch64/neon-crypto.ll
new file mode 100644
index 0000000..0283e0e
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-crypto.ll
@@ -0,0 +1,149 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -mattr=+crypto | FileCheck %s
+; RUN: not llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon 2>&1 | FileCheck --check-prefix=CHECK-NO-CRYPTO %s
+
+declare <4 x i32> @llvm.arm.neon.sha256su1.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1
+
+declare <4 x i32> @llvm.arm.neon.sha256h2.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1
+
+declare <4 x i32> @llvm.arm.neon.sha256h.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1
+
+declare <4 x i32> @llvm.arm.neon.sha1su0.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) #1
+
+declare <4 x i32> @llvm.aarch64.neon.sha1m(<4 x i32>, <1 x i32>, <4 x i32>) #1
+
+declare <4 x i32> @llvm.aarch64.neon.sha1p(<4 x i32>, <1 x i32>, <4 x i32>) #1
+
+declare <4 x i32> @llvm.aarch64.neon.sha1c(<4 x i32>, <1 x i32>, <4 x i32>) #1
+
+declare <4 x i32> @llvm.arm.neon.sha256su0.v4i32(<4 x i32>, <4 x i32>) #1
+
+declare <4 x i32> @llvm.arm.neon.sha1su1.v4i32(<4 x i32>, <4 x i32>) #1
+
+declare <1 x i32> @llvm.arm.neon.sha1h.v1i32(<1 x i32>) #1
+
+declare <16 x i8> @llvm.arm.neon.aesimc.v16i8(<16 x i8>) #1
+
+declare <16 x i8> @llvm.arm.neon.aesmc.v16i8(<16 x i8>) #1
+
+declare <16 x i8> @llvm.arm.neon.aesd.v16i8(<16 x i8>, <16 x i8>) #1
+
+declare <16 x i8> @llvm.arm.neon.aese.v16i8(<16 x i8>, <16 x i8>) #1
+
+define <16 x i8> @test_vaeseq_u8(<16 x i8> %data, <16 x i8> %key) {
+; CHECK: test_vaeseq_u8:
+; CHECK: aese {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK-NO-CRYPTO: Cannot select: intrinsic %llvm.arm.neon.aese
+entry:
+ %aese.i = tail call <16 x i8> @llvm.arm.neon.aese.v16i8(<16 x i8> %data, <16 x i8> %key)
+ ret <16 x i8> %aese.i
+}
+
+define <16 x i8> @test_vaesdq_u8(<16 x i8> %data, <16 x i8> %key) {
+; CHECK: test_vaesdq_u8:
+; CHECK: aesd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %aesd.i = tail call <16 x i8> @llvm.arm.neon.aesd.v16i8(<16 x i8> %data, <16 x i8> %key)
+ ret <16 x i8> %aesd.i
+}
+
+define <16 x i8> @test_vaesmcq_u8(<16 x i8> %data) {
+; CHECK: test_vaesmcq_u8:
+; CHECK: aesmc {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %aesmc.i = tail call <16 x i8> @llvm.arm.neon.aesmc.v16i8(<16 x i8> %data)
+ ret <16 x i8> %aesmc.i
+}
+
+define <16 x i8> @test_vaesimcq_u8(<16 x i8> %data) {
+; CHECK: test_vaesimcq_u8:
+; CHECK: aesimc {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %aesimc.i = tail call <16 x i8> @llvm.arm.neon.aesimc.v16i8(<16 x i8> %data)
+ ret <16 x i8> %aesimc.i
+}
+
+define i32 @test_vsha1h_u32(i32 %hash_e) {
+; CHECK: test_vsha1h_u32:
+; CHECK: sha1h {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+ %sha1h.i = insertelement <1 x i32> undef, i32 %hash_e, i32 0
+ %sha1h1.i = tail call <1 x i32> @llvm.arm.neon.sha1h.v1i32(<1 x i32> %sha1h.i)
+ %0 = extractelement <1 x i32> %sha1h1.i, i32 0
+ ret i32 %0
+}
+
+define <4 x i32> @test_vsha1su1q_u32(<4 x i32> %tw0_3, <4 x i32> %w12_15) {
+; CHECK: test_vsha1su1q_u32:
+; CHECK: sha1su1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %sha1su12.i = tail call <4 x i32> @llvm.arm.neon.sha1su1.v4i32(<4 x i32> %tw0_3, <4 x i32> %w12_15)
+ ret <4 x i32> %sha1su12.i
+}
+
+define <4 x i32> @test_vsha256su0q_u32(<4 x i32> %w0_3, <4 x i32> %w4_7) {
+; CHECK: test_vsha256su0q_u32:
+; CHECK: sha256su0 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %sha256su02.i = tail call <4 x i32> @llvm.arm.neon.sha256su0.v4i32(<4 x i32> %w0_3, <4 x i32> %w4_7)
+ ret <4 x i32> %sha256su02.i
+}
+
+define <4 x i32> @test_vsha1cq_u32(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
+; CHECK: test_vsha1cq_u32:
+; CHECK: sha1c {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %sha1c.i = insertelement <1 x i32> undef, i32 %hash_e, i32 0
+ %sha1c1.i = tail call <4 x i32> @llvm.aarch64.neon.sha1c(<4 x i32> %hash_abcd, <1 x i32> %sha1c.i, <4 x i32> %wk)
+ ret <4 x i32> %sha1c1.i
+}
+
+define <4 x i32> @test_vsha1pq_u32(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
+; CHECK: test_vsha1pq_u32:
+; CHECK: sha1p {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %sha1p.i = insertelement <1 x i32> undef, i32 %hash_e, i32 0
+ %sha1p1.i = tail call <4 x i32> @llvm.aarch64.neon.sha1p(<4 x i32> %hash_abcd, <1 x i32> %sha1p.i, <4 x i32> %wk)
+ ret <4 x i32> %sha1p1.i
+}
+
+define <4 x i32> @test_vsha1mq_u32(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
+; CHECK: test_vsha1mq_u32:
+; CHECK: sha1m {{q[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %sha1m.i = insertelement <1 x i32> undef, i32 %hash_e, i32 0
+ %sha1m1.i = tail call <4 x i32> @llvm.aarch64.neon.sha1m(<4 x i32> %hash_abcd, <1 x i32> %sha1m.i, <4 x i32> %wk)
+ ret <4 x i32> %sha1m1.i
+}
+
+define <4 x i32> @test_vsha1su0q_u32(<4 x i32> %w0_3, <4 x i32> %w4_7, <4 x i32> %w8_11) {
+; CHECK: test_vsha1su0q_u32:
+; CHECK: sha1su0 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %sha1su03.i = tail call <4 x i32> @llvm.arm.neon.sha1su0.v4i32(<4 x i32> %w0_3, <4 x i32> %w4_7, <4 x i32> %w8_11)
+ ret <4 x i32> %sha1su03.i
+}
+
+define <4 x i32> @test_vsha256hq_u32(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) {
+; CHECK: test_vsha256hq_u32:
+; CHECK: sha256h {{q[0-9]+}}, {{q[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %sha256h3.i = tail call <4 x i32> @llvm.arm.neon.sha256h.v4i32(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk)
+ ret <4 x i32> %sha256h3.i
+}
+
+define <4 x i32> @test_vsha256h2q_u32(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) {
+; CHECK: test_vsha256h2q_u32:
+; CHECK: sha256h2 {{q[0-9]+}}, {{q[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+ %sha256h23.i = tail call <4 x i32> @llvm.arm.neon.sha256h2.v4i32(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk)
+ ret <4 x i32> %sha256h23.i
+}
+
+define <4 x i32> @test_vsha256su1q_u32(<4 x i32> %tw0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) {
+; CHECK: test_vsha256su1q_u32:
+; CHECK: sha256su1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %sha256su13.i = tail call <4 x i32> @llvm.arm.neon.sha256su1.v4i32(<4 x i32> %tw0_3, <4 x i32> %w8_11, <4 x i32> %w12_15)
+ ret <4 x i32> %sha256su13.i
+}
+
diff --git a/test/CodeGen/AArch64/neon-diagnostics.ll b/test/CodeGen/AArch64/neon-diagnostics.ll
new file mode 100644
index 0000000..f546aa7
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-diagnostics.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
+; CHECK: test_vfma_lane_f32:
+; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}]
+; CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+entry:
+ %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+ %mul = fmul <2 x float> %shuffle, %b
+ %add = fadd <2 x float> %mul, %a
+ ret <2 x float> %add
+}
+
+define <4 x i32> @test_vshrn_not_match(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vshrn_not_match
+; CHECK-NOT: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #35
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %2 = ashr <2 x i64> %b, <i64 35, i64 35>
+ %vshrn_n = trunc <2 x i64> %2 to <2 x i32>
+ %3 = bitcast <2 x i32> %vshrn_n to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+ %4 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %4
+} \ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-extract.ll b/test/CodeGen/AArch64/neon-extract.ll
new file mode 100644
index 0000000..5c52cd3
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-extract.ll
@@ -0,0 +1,190 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <8 x i8> @test_vext_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vext_s8:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x2
+entry:
+ %vext = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+ ret <8 x i8> %vext
+}
+
+define <4 x i16> @test_vext_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vext_s16:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x6
+entry:
+ %vext = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+ ret <4 x i16> %vext
+}
+
+define <2 x i32> @test_vext_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vext_s32:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x4
+entry:
+ %vext = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 2>
+ ret <2 x i32> %vext
+}
+
+define <1 x i64> @test_vext_s64(<1 x i64> %a, <1 x i64> %b) {
+; CHECK: test_vext_s64:
+entry:
+ %vext = shufflevector <1 x i64> %a, <1 x i64> %b, <1 x i32> <i32 0>
+ ret <1 x i64> %vext
+}
+
+define <16 x i8> @test_vextq_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vextq_s8:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x2
+entry:
+ %vext = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+ ret <16 x i8> %vext
+}
+
+define <8 x i16> @test_vextq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vextq_s16:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6
+entry:
+ %vext = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+ ret <8 x i16> %vext
+}
+
+define <4 x i32> @test_vextq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vextq_s32:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x4
+entry:
+ %vext = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+ ret <4 x i32> %vext
+}
+
+define <2 x i64> @test_vextq_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vextq_s64:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x8
+entry:
+ %vext = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
+ ret <2 x i64> %vext
+}
+
+define <8 x i8> @test_vext_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vext_u8:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x2
+entry:
+ %vext = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+ ret <8 x i8> %vext
+}
+
+define <4 x i16> @test_vext_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vext_u16:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x6
+entry:
+ %vext = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+ ret <4 x i16> %vext
+}
+
+define <2 x i32> @test_vext_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vext_u32:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x4
+entry:
+ %vext = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 2>
+ ret <2 x i32> %vext
+}
+
+define <1 x i64> @test_vext_u64(<1 x i64> %a, <1 x i64> %b) {
+; CHECK: test_vext_u64:
+entry:
+ %vext = shufflevector <1 x i64> %a, <1 x i64> %b, <1 x i32> <i32 0>
+ ret <1 x i64> %vext
+}
+
+define <16 x i8> @test_vextq_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vextq_u8:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x2
+entry:
+ %vext = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+ ret <16 x i8> %vext
+}
+
+define <8 x i16> @test_vextq_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vextq_u16:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6
+entry:
+ %vext = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+ ret <8 x i16> %vext
+}
+
+define <4 x i32> @test_vextq_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vextq_u32:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x4
+entry:
+ %vext = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+ ret <4 x i32> %vext
+}
+
+define <2 x i64> @test_vextq_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vextq_u64:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x8
+entry:
+ %vext = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
+ ret <2 x i64> %vext
+}
+
+define <2 x float> @test_vext_f32(<2 x float> %a, <2 x float> %b) {
+; CHECK: test_vext_f32:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x4
+entry:
+ %vext = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 2>
+ ret <2 x float> %vext
+}
+
+define <1 x double> @test_vext_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK: test_vext_f64:
+entry:
+ %vext = shufflevector <1 x double> %a, <1 x double> %b, <1 x i32> <i32 0>
+ ret <1 x double> %vext
+}
+
+define <4 x float> @test_vextq_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK: test_vextq_f32:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x4
+entry:
+ %vext = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+ ret <4 x float> %vext
+}
+
+define <2 x double> @test_vextq_f64(<2 x double> %a, <2 x double> %b) {
+; CHECK: test_vextq_f64:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x8
+entry:
+ %vext = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 2>
+ ret <2 x double> %vext
+}
+
+define <8 x i8> @test_vext_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vext_p8:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x2
+entry:
+ %vext = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+ ret <8 x i8> %vext
+}
+
+define <4 x i16> @test_vext_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vext_p16:
+; CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x6
+entry:
+ %vext = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+ ret <4 x i16> %vext
+}
+
+define <16 x i8> @test_vextq_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vextq_p8:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x2
+entry:
+ %vext = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+ ret <16 x i8> %vext
+}
+
+define <8 x i16> @test_vextq_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vextq_p16:
+; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x6
+entry:
+ %vext = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+ ret <8 x i16> %vext
+}
diff --git a/test/CodeGen/AArch64/neon-misc-scalar.ll b/test/CodeGen/AArch64/neon-misc-scalar.ll
new file mode 100644
index 0000000..cca8deb
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-misc-scalar.ll
@@ -0,0 +1,60 @@
+;RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+declare <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64>)
+
+declare <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64>)
+
+declare <1 x i64> @llvm.arm.neon.vabs.v1i64(<1 x i64>)
+
+declare <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64>, <1 x i64>)
+
+declare <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_vuqadd_s64(<1 x i64> %a, <1 x i64> %b) {
+entry:
+ ; CHECK: test_vuqadd_s64
+ %vuqadd2.i = tail call <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64> %a, <1 x i64> %b)
+ ; CHECK: suqadd d{{[0-9]+}}, d{{[0-9]+}}
+ ret <1 x i64> %vuqadd2.i
+}
+
+define <1 x i64> @test_vsqadd_u64(<1 x i64> %a, <1 x i64> %b) {
+entry:
+ ; CHECK: test_vsqadd_u64
+ %vsqadd2.i = tail call <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64> %a, <1 x i64> %b)
+ ; CHECK: usqadd d{{[0-9]+}}, d{{[0-9]+}}
+ ret <1 x i64> %vsqadd2.i
+}
+
+define <1 x i64> @test_vabs_s64(<1 x i64> %a) {
+ ; CHECK: test_vabs_s64
+entry:
+ %vabs1.i = tail call <1 x i64> @llvm.arm.neon.vabs.v1i64(<1 x i64> %a)
+ ; CHECK: abs d{{[0-9]+}}, d{{[0-9]+}}
+ ret <1 x i64> %vabs1.i
+}
+
+define <1 x i64> @test_vqabs_s64(<1 x i64> %a) {
+ ; CHECK: test_vqabs_s64
+entry:
+ %vqabs1.i = tail call <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64> %a)
+ ; CHECK: sqabs d{{[0-9]+}}, d{{[0-9]+}}
+ ret <1 x i64> %vqabs1.i
+}
+
+define <1 x i64> @test_vqneg_s64(<1 x i64> %a) {
+ ; CHECK: test_vqneg_s64
+entry:
+ %vqneg1.i = tail call <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64> %a)
+ ; CHECK: sqneg d{{[0-9]+}}, d{{[0-9]+}}
+ ret <1 x i64> %vqneg1.i
+}
+
+define <1 x i64> @test_vneg_s64(<1 x i64> %a) {
+ ; CHECK: test_vneg_s64
+entry:
+ %sub.i = sub <1 x i64> zeroinitializer, %a
+ ; CHECK: neg d{{[0-9]+}}, d{{[0-9]+}}
+ ret <1 x i64> %sub.i
+}
+
diff --git a/test/CodeGen/AArch64/neon-misc.ll b/test/CodeGen/AArch64/neon-misc.ll
new file mode 100644
index 0000000..9660bf2
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-misc.ll
@@ -0,0 +1,1799 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+
+
+define <8 x i8> @test_vrev16_s8(<8 x i8> %a) #0 {
+; CHECK: rev16 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vrev16q_s8(<16 x i8> %a) #0 {
+; CHECK: rev16 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+ ret <16 x i8> %shuffle.i
+}
+
+define <8 x i8> @test_vrev32_s8(<8 x i8> %a) #0 {
+; CHECK: rev32 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vrev32_s16(<4 x i16> %a) #0 {
+; CHECK: rev32 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ ret <4 x i16> %shuffle.i
+}
+
+define <16 x i8> @test_vrev32q_s8(<16 x i8> %a) #0 {
+; CHECK: rev32 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+ ret <16 x i8> %shuffle.i
+}
+
+define <8 x i16> @test_vrev32q_s16(<8 x i16> %a) #0 {
+; CHECK: rev32 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+ ret <8 x i16> %shuffle.i
+}
+
+define <8 x i8> @test_vrev64_s8(<8 x i8> %a) #0 {
+; CHECK: rev64 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ ret <8 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vrev64_s16(<4 x i16> %a) #0 {
+; CHECK: rev64 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vrev64_s32(<2 x i32> %a) #0 {
+; CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+ ret <2 x i32> %shuffle.i
+}
+
+define <2 x float> @test_vrev64_f32(<2 x float> %a) #0 {
+; CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %shuffle.i = shufflevector <2 x float> %a, <2 x float> undef, <2 x i32> <i32 1, i32 0>
+ ret <2 x float> %shuffle.i
+}
+
+define <16 x i8> @test_vrev64q_s8(<16 x i8> %a) #0 {
+; CHECK: rev64 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+ ret <16 x i8> %shuffle.i
+}
+
+define <8 x i16> @test_vrev64q_s16(<8 x i16> %a) #0 {
+; CHECK: rev64 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_vrev64q_s32(<4 x i32> %a) #0 {
+; CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ ret <4 x i32> %shuffle.i
+}
+
+define <4 x float> @test_vrev64q_f32(<4 x float> %a) #0 {
+; CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ ret <4 x float> %shuffle.i
+}
+
+define <4 x i16> @test_vpaddl_s8(<8 x i8> %a) #0 {
+; CHECK: saddlp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
+ %vpaddl.i = tail call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %a) #4
+ ret <4 x i16> %vpaddl.i
+}
+
+define <2 x i32> @test_vpaddl_s16(<4 x i16> %a) #0 {
+; CHECK: saddlp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
+ %vpaddl1.i = tail call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %a) #4
+ ret <2 x i32> %vpaddl1.i
+}
+
+define <1 x i64> @test_vpaddl_s32(<2 x i32> %a) #0 {
+; CHECK: saddlp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
+ %vpaddl1.i = tail call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %a) #4
+ ret <1 x i64> %vpaddl1.i
+}
+
+define <4 x i16> @test_vpaddl_u8(<8 x i8> %a) #0 {
+; CHECK: uaddlp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
+ %vpaddl.i = tail call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %a) #4
+ ret <4 x i16> %vpaddl.i
+}
+
+define <2 x i32> @test_vpaddl_u16(<4 x i16> %a) #0 {
+; CHECK: uaddlp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
+ %vpaddl1.i = tail call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a) #4
+ ret <2 x i32> %vpaddl1.i
+}
+
+define <1 x i64> @test_vpaddl_u32(<2 x i32> %a) #0 {
+; CHECK: uaddlp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
+ %vpaddl1.i = tail call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %a) #4
+ ret <1 x i64> %vpaddl1.i
+}
+
+define <8 x i16> @test_vpaddlq_s8(<16 x i8> %a) #0 {
+; CHECK: saddlp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
+ %vpaddl.i = tail call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %a) #4
+ ret <8 x i16> %vpaddl.i
+}
+
+define <4 x i32> @test_vpaddlq_s16(<8 x i16> %a) #0 {
+; CHECK: saddlp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
+ %vpaddl1.i = tail call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %a) #4
+ ret <4 x i32> %vpaddl1.i
+}
+
+define <2 x i64> @test_vpaddlq_s32(<4 x i32> %a) #0 {
+; CHECK: saddlp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
+ %vpaddl1.i = tail call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %a) #4
+ ret <2 x i64> %vpaddl1.i
+}
+
+define <8 x i16> @test_vpaddlq_u8(<16 x i8> %a) #0 {
+; CHECK: uaddlp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
+ %vpaddl.i = tail call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %a) #4
+ ret <8 x i16> %vpaddl.i
+}
+
+define <4 x i32> @test_vpaddlq_u16(<8 x i16> %a) #0 {
+; CHECK: uaddlp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
+ %vpaddl1.i = tail call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a) #4
+ ret <4 x i32> %vpaddl1.i
+}
+
+define <2 x i64> @test_vpaddlq_u32(<4 x i32> %a) #0 {
+; CHECK: uaddlp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
+ %vpaddl1.i = tail call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a) #4
+ ret <2 x i64> %vpaddl1.i
+}
+
+define <4 x i16> @test_vpadal_s8(<4 x i16> %a, <8 x i8> %b) #0 {
+; CHECK: sadalp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
+ %vpadal1.i = tail call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b) #4
+ ret <4 x i16> %vpadal1.i
+}
+
+define <2 x i32> @test_vpadal_s16(<2 x i32> %a, <4 x i16> %b) #0 {
+; CHECK: sadalp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
+ %vpadal2.i = tail call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b) #4
+ ret <2 x i32> %vpadal2.i
+}
+
+define <1 x i64> @test_vpadal_s32(<1 x i64> %a, <2 x i32> %b) #0 {
+; CHECK: sadalp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
+ %vpadal2.i = tail call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b) #4
+ ret <1 x i64> %vpadal2.i
+}
+
+define <4 x i16> @test_vpadal_u8(<4 x i16> %a, <8 x i8> %b) #0 {
+; CHECK: uadalp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
+ %vpadal1.i = tail call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b) #4
+ ret <4 x i16> %vpadal1.i
+}
+
+define <2 x i32> @test_vpadal_u16(<2 x i32> %a, <4 x i16> %b) #0 {
+; CHECK: uadalp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
+ %vpadal2.i = tail call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b) #4
+ ret <2 x i32> %vpadal2.i
+}
+
+define <1 x i64> @test_vpadal_u32(<1 x i64> %a, <2 x i32> %b) #0 {
+; CHECK: uadalp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
+ %vpadal2.i = tail call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b) #4
+ ret <1 x i64> %vpadal2.i
+}
+
+define <8 x i16> @test_vpadalq_s8(<8 x i16> %a, <16 x i8> %b) #0 {
+; CHECK: sadalp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
+ %vpadal1.i = tail call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b) #4
+ ret <8 x i16> %vpadal1.i
+}
+
+define <4 x i32> @test_vpadalq_s16(<4 x i32> %a, <8 x i16> %b) #0 {
+; CHECK: sadalp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
+ %vpadal2.i = tail call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b) #4
+ ret <4 x i32> %vpadal2.i
+}
+
+define <2 x i64> @test_vpadalq_s32(<2 x i64> %a, <4 x i32> %b) #0 {
+; CHECK: sadalp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
+ %vpadal2.i = tail call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b) #4
+ ret <2 x i64> %vpadal2.i
+}
+
+define <8 x i16> @test_vpadalq_u8(<8 x i16> %a, <16 x i8> %b) #0 {
+; CHECK: uadalp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
+ %vpadal1.i = tail call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b) #4
+ ret <8 x i16> %vpadal1.i
+}
+
+define <4 x i32> @test_vpadalq_u16(<4 x i32> %a, <8 x i16> %b) #0 {
+; CHECK: uadalp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
+ %vpadal2.i = tail call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b) #4
+ ret <4 x i32> %vpadal2.i
+}
+
+define <2 x i64> @test_vpadalq_u32(<2 x i64> %a, <4 x i32> %b) #0 {
+; CHECK: uadalp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
+ %vpadal2.i = tail call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b) #4
+ ret <2 x i64> %vpadal2.i
+}
+
+define <8 x i8> @test_vqabs_s8(<8 x i8> %a) #0 {
+; CHECK: sqabs v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %vqabs.i = tail call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %a) #4
+ ret <8 x i8> %vqabs.i
+}
+
+define <16 x i8> @test_vqabsq_s8(<16 x i8> %a) #0 {
+; CHECK: sqabs v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+ %vqabs.i = tail call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %a) #4
+ ret <16 x i8> %vqabs.i
+}
+
+define <4 x i16> @test_vqabs_s16(<4 x i16> %a) #0 {
+; CHECK: sqabs v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+ %vqabs1.i = tail call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> %a) #4
+ ret <4 x i16> %vqabs1.i
+}
+
+define <8 x i16> @test_vqabsq_s16(<8 x i16> %a) #0 {
+; CHECK: sqabs v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+ %vqabs1.i = tail call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> %a) #4
+ ret <8 x i16> %vqabs1.i
+}
+
+define <2 x i32> @test_vqabs_s32(<2 x i32> %a) #0 {
+; CHECK: sqabs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vqabs1.i = tail call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> %a) #4
+ ret <2 x i32> %vqabs1.i
+}
+
+define <4 x i32> @test_vqabsq_s32(<4 x i32> %a) #0 {
+; CHECK: sqabs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vqabs1.i = tail call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> %a) #4
+ ret <4 x i32> %vqabs1.i
+}
+
+define <2 x i64> @test_vqabsq_s64(<2 x i64> %a) #0 {
+; CHECK: sqabs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vqabs1.i = tail call <2 x i64> @llvm.arm.neon.vqabs.v2i64(<2 x i64> %a) #4
+ ret <2 x i64> %vqabs1.i
+}
+
+define <8 x i8> @test_vqneg_s8(<8 x i8> %a) #0 {
+; CHECK: sqneg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %vqneg.i = tail call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %a) #4
+ ret <8 x i8> %vqneg.i
+}
+
+define <16 x i8> @test_vqnegq_s8(<16 x i8> %a) #0 {
+; CHECK: sqneg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+ %vqneg.i = tail call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %a) #4
+ ret <16 x i8> %vqneg.i
+}
+
+define <4 x i16> @test_vqneg_s16(<4 x i16> %a) #0 {
+; CHECK: sqneg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+ %vqneg1.i = tail call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> %a) #4
+ ret <4 x i16> %vqneg1.i
+}
+
+define <8 x i16> @test_vqnegq_s16(<8 x i16> %a) #0 {
+; CHECK: sqneg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+ %vqneg1.i = tail call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> %a) #4
+ ret <8 x i16> %vqneg1.i
+}
+
+define <2 x i32> @test_vqneg_s32(<2 x i32> %a) #0 {
+; CHECK: sqneg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vqneg1.i = tail call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> %a) #4
+ ret <2 x i32> %vqneg1.i
+}
+
+define <4 x i32> @test_vqnegq_s32(<4 x i32> %a) #0 {
+; CHECK: sqneg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vqneg1.i = tail call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> %a) #4
+ ret <4 x i32> %vqneg1.i
+}
+
+define <2 x i64> @test_vqnegq_s64(<2 x i64> %a) #0 {
+; CHECK: sqneg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vqneg1.i = tail call <2 x i64> @llvm.arm.neon.vqneg.v2i64(<2 x i64> %a) #4
+ ret <2 x i64> %vqneg1.i
+}
+
+define <8 x i8> @test_vneg_s8(<8 x i8> %a) #0 {
+; CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %sub.i = sub <8 x i8> zeroinitializer, %a
+ ret <8 x i8> %sub.i
+}
+
+define <16 x i8> @test_vnegq_s8(<16 x i8> %a) #0 {
+; CHECK: neg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+ %sub.i = sub <16 x i8> zeroinitializer, %a
+ ret <16 x i8> %sub.i
+}
+
+define <4 x i16> @test_vneg_s16(<4 x i16> %a) #0 {
+; CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+ %sub.i = sub <4 x i16> zeroinitializer, %a
+ ret <4 x i16> %sub.i
+}
+
+define <8 x i16> @test_vnegq_s16(<8 x i16> %a) #0 {
+; CHECK: neg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+ %sub.i = sub <8 x i16> zeroinitializer, %a
+ ret <8 x i16> %sub.i
+}
+
+define <2 x i32> @test_vneg_s32(<2 x i32> %a) #0 {
+; CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %sub.i = sub <2 x i32> zeroinitializer, %a
+ ret <2 x i32> %sub.i
+}
+
+define <4 x i32> @test_vnegq_s32(<4 x i32> %a) #0 {
+; CHECK: neg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %sub.i = sub <4 x i32> zeroinitializer, %a
+ ret <4 x i32> %sub.i
+}
+
+define <2 x i64> @test_vnegq_s64(<2 x i64> %a) #0 {
+; CHECK: neg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %sub.i = sub <2 x i64> zeroinitializer, %a
+ ret <2 x i64> %sub.i
+}
+
+define <2 x float> @test_vneg_f32(<2 x float> %a) #0 {
+; CHECK: fneg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %sub.i = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a
+ ret <2 x float> %sub.i
+}
+
+define <4 x float> @test_vnegq_f32(<4 x float> %a) #0 {
+; CHECK: fneg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
+ ret <4 x float> %sub.i
+}
+
+define <2 x double> @test_vnegq_f64(<2 x double> %a) #0 {
+; CHECK: fneg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
+ ret <2 x double> %sub.i
+}
+
+define <8 x i8> @test_vabs_s8(<8 x i8> %a) #0 {
+; CHECK: abs v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %vabs.i = tail call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %a) #4
+ ret <8 x i8> %vabs.i
+}
+
+define <16 x i8> @test_vabsq_s8(<16 x i8> %a) #0 {
+; CHECK: abs v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+ %vabs.i = tail call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %a) #4
+ ret <16 x i8> %vabs.i
+}
+
+define <4 x i16> @test_vabs_s16(<4 x i16> %a) #0 {
+; CHECK: abs v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+ %vabs1.i = tail call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> %a) #4
+ ret <4 x i16> %vabs1.i
+}
+
+define <8 x i16> @test_vabsq_s16(<8 x i16> %a) #0 {
+; CHECK: abs v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+ %vabs1.i = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %a) #4
+ ret <8 x i16> %vabs1.i
+}
+
+define <2 x i32> @test_vabs_s32(<2 x i32> %a) #0 {
+; CHECK: abs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vabs1.i = tail call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> %a) #4
+ ret <2 x i32> %vabs1.i
+}
+
+define <4 x i32> @test_vabsq_s32(<4 x i32> %a) #0 {
+; CHECK: abs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vabs1.i = tail call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> %a) #4
+ ret <4 x i32> %vabs1.i
+}
+
+define <2 x i64> @test_vabsq_s64(<2 x i64> %a) #0 {
+; CHECK: abs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vabs1.i = tail call <2 x i64> @llvm.arm.neon.vabs.v2i64(<2 x i64> %a) #4
+ ret <2 x i64> %vabs1.i
+}
+
+define <2 x float> @test_vabs_f32(<2 x float> %a) #1 {
+; CHECK: fabs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vabs1.i = tail call <2 x float> @llvm.fabs.v2f32(<2 x float> %a) #4
+ ret <2 x float> %vabs1.i
+}
+
+define <4 x float> @test_vabsq_f32(<4 x float> %a) #1 {
+; CHECK: fabs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vabs1.i = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) #4
+ ret <4 x float> %vabs1.i
+}
+
+define <2 x double> @test_vabsq_f64(<2 x double> %a) #1 {
+; CHECK: fabs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vabs1.i = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> %a) #4
+ ret <2 x double> %vabs1.i
+}
+
+define <8 x i8> @test_vuqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+; CHECK: suqadd v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %vuqadd.i = tail call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+ ret <8 x i8> %vuqadd.i
+}
+
+define <16 x i8> @test_vuqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+; CHECK: suqadd v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+ %vuqadd.i = tail call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+ ret <16 x i8> %vuqadd.i
+}
+
+define <4 x i16> @test_vuqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+; CHECK: suqadd v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+ %vuqadd2.i = tail call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> %a, <4 x i16> %b) #4
+ ret <4 x i16> %vuqadd2.i
+}
+
+define <8 x i16> @test_vuqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+; CHECK: suqadd v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+ %vuqadd2.i = tail call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> %a, <8 x i16> %b) #4
+ ret <8 x i16> %vuqadd2.i
+}
+
+define <2 x i32> @test_vuqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK: suqadd v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vuqadd2.i = tail call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> %a, <2 x i32> %b) #4
+ ret <2 x i32> %vuqadd2.i
+}
+
+define <4 x i32> @test_vuqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+; CHECK: suqadd v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vuqadd2.i = tail call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> %a, <4 x i32> %b) #4
+ ret <4 x i32> %vuqadd2.i
+}
+
+define <2 x i64> @test_vuqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+; CHECK: suqadd v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vuqadd2.i = tail call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> %a, <2 x i64> %b) #4
+ ret <2 x i64> %vuqadd2.i
+}
+
+define <8 x i8> @test_vcls_s8(<8 x i8> %a) #0 {
+; CHECK: cls v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %vcls.i = tail call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a) #4
+ ret <8 x i8> %vcls.i
+}
+
+define <16 x i8> @test_vclsq_s8(<16 x i8> %a) #0 {
+; CHECK: cls v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+ %vcls.i = tail call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a) #4
+ ret <16 x i8> %vcls.i
+}
+
+define <4 x i16> @test_vcls_s16(<4 x i16> %a) #0 {
+; CHECK: cls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+ %vcls1.i = tail call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a) #4
+ ret <4 x i16> %vcls1.i
+}
+
+define <8 x i16> @test_vclsq_s16(<8 x i16> %a) #0 {
+; CHECK: cls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+ %vcls1.i = tail call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a) #4
+ ret <8 x i16> %vcls1.i
+}
+
+define <2 x i32> @test_vcls_s32(<2 x i32> %a) #0 {
+; CHECK: cls v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vcls1.i = tail call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a) #4
+ ret <2 x i32> %vcls1.i
+}
+
+define <4 x i32> @test_vclsq_s32(<4 x i32> %a) #0 {
+; CHECK: cls v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vcls1.i = tail call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a) #4
+ ret <4 x i32> %vcls1.i
+}
+
+define <8 x i8> @test_vclz_s8(<8 x i8> %a) #0 {
+; CHECK: clz v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %vclz.i = tail call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4
+ ret <8 x i8> %vclz.i
+}
+
+define <16 x i8> @test_vclzq_s8(<16 x i8> %a) #0 {
+; CHECK: clz v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+ %vclz.i = tail call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4
+ ret <16 x i8> %vclz.i
+}
+
+define <4 x i16> @test_vclz_s16(<4 x i16> %a) #0 {
+; CHECK: clz v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+ %vclz1.i = tail call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) #4
+ ret <4 x i16> %vclz1.i
+}
+
+define <8 x i16> @test_vclzq_s16(<8 x i16> %a) #0 {
+; CHECK: clz v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+ %vclz1.i = tail call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) #4
+ ret <8 x i16> %vclz1.i
+}
+
+define <2 x i32> @test_vclz_s32(<2 x i32> %a) #0 {
+; CHECK: clz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vclz1.i = tail call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) #4
+ ret <2 x i32> %vclz1.i
+}
+
+define <4 x i32> @test_vclzq_s32(<4 x i32> %a) #0 {
+; CHECK: clz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vclz1.i = tail call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) #4
+ ret <4 x i32> %vclz1.i
+}
+
+define <8 x i8> @test_vcnt_s8(<8 x i8> %a) #0 {
+; CHECK: cnt v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %vctpop.i = tail call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
+ ret <8 x i8> %vctpop.i
+}
+
+define <16 x i8> @test_vcntq_s8(<16 x i8> %a) #0 {
+; CHECK: cnt v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+ %vctpop.i = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
+ ret <16 x i8> %vctpop.i
+}
+
+define <8 x i8> @test_vmvn_s8(<8 x i8> %a) #0 {
+; CHECK: not v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %neg.i = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ ret <8 x i8> %neg.i
+}
+
+define <16 x i8> @test_vmvnq_s8(<16 x i8> %a) #0 {
+; CHECK: not v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+ %neg.i = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ ret <16 x i8> %neg.i
+}
+
+define <4 x i16> @test_vmvn_s16(<4 x i16> %a) #0 {
+; CHECK: not v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %neg.i = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
+ ret <4 x i16> %neg.i
+}
+
+define <8 x i16> @test_vmvnq_s16(<8 x i16> %a) #0 {
+; CHECK: not v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+ %neg.i = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ ret <8 x i16> %neg.i
+}
+
+define <2 x i32> @test_vmvn_s32(<2 x i32> %a) #0 {
+; CHECK: not v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %neg.i = xor <2 x i32> %a, <i32 -1, i32 -1>
+ ret <2 x i32> %neg.i
+}
+
+define <4 x i32> @test_vmvnq_s32(<4 x i32> %a) #0 {
+; CHECK: not v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+ %neg.i = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
+ ret <4 x i32> %neg.i
+}
+
+define <8 x i8> @test_vrbit_s8(<8 x i8> %a) #0 {
+; CHECK: rbit v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+ %vrbit.i = tail call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #4
+ ret <8 x i8> %vrbit.i
+}
+
+define <16 x i8> @test_vrbitq_s8(<16 x i8> %a) #0 {
+; CHECK: rbit v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+ %vrbit.i = tail call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #4
+ ret <16 x i8> %vrbit.i
+}
+
+define <8 x i8> @test_vmovn_s16(<8 x i16> %a) #0 {
+; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
+ %vmovn.i = trunc <8 x i16> %a to <8 x i8>
+ ret <8 x i8> %vmovn.i
+}
+
+define <4 x i16> @test_vmovn_s32(<4 x i32> %a) #0 {
+; CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
+ %vmovn.i = trunc <4 x i32> %a to <4 x i16>
+ ret <4 x i16> %vmovn.i
+}
+
+define <2 x i32> @test_vmovn_s64(<2 x i64> %a) #0 {
+; CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
+ %vmovn.i = trunc <2 x i64> %a to <2 x i32>
+ ret <2 x i32> %vmovn.i
+}
+
+define <16 x i8> @test_vmovn_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+; CHECK: xtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
+ %vmovn.i.i = trunc <8 x i16> %b to <8 x i8>
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vmovn.i.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %shuffle.i
+}
+
+define <8 x i16> @test_vmovn_high_s32(<4 x i16> %a, <4 x i32> %b) #0 {
+; CHECK: xtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
+ %vmovn.i.i = trunc <4 x i32> %b to <4 x i16>
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vmovn.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_vmovn_high_s64(<2 x i32> %a, <2 x i64> %b) #0 {
+; CHECK: xtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
+ %vmovn.i.i = trunc <2 x i64> %b to <2 x i32>
+ %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vmovn.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %shuffle.i
+}
+
+define <8 x i8> @test_vqmovun_s16(<8 x i16> %a) #0 {
+; CHECK: sqxtun v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
+ %vqdmull1.i = tail call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %a) #4
+ ret <8 x i8> %vqdmull1.i
+}
+
+define <4 x i16> @test_vqmovun_s32(<4 x i32> %a) #0 {
+; CHECK: sqxtun v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
+ %vqdmull1.i = tail call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %a) #4
+ ret <4 x i16> %vqdmull1.i
+}
+
+define <2 x i32> @test_vqmovun_s64(<2 x i64> %a) #0 {
+; CHECK: sqxtun v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
+ %vqdmull1.i = tail call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %a) #4
+ ret <2 x i32> %vqdmull1.i
+}
+
+define <16 x i8> @test_vqmovun_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+; CHECK: sqxtun2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
+ %vqdmull1.i.i = tail call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %b) #4
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vqdmull1.i.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %shuffle.i
+}
+
+define <8 x i16> @test_vqmovun_high_s32(<4 x i16> %a, <4 x i32> %b) #0 {
+; CHECK: sqxtun2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
+ %vqdmull1.i.i = tail call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %b) #4
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vqdmull1.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_vqmovun_high_s64(<2 x i32> %a, <2 x i64> %b) #0 {
+; CHECK: sqxtun2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
+ %vqdmull1.i.i = tail call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %b) #4
+ %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vqdmull1.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %shuffle.i
+}
+
+define <8 x i8> @test_vqmovn_s16(<8 x i16> %a) #0 {
+; CHECK: sqxtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
+ %vqmovn1.i = tail call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %a) #4
+ ret <8 x i8> %vqmovn1.i
+}
+
+define <4 x i16> @test_vqmovn_s32(<4 x i32> %a) #0 {
+; CHECK: sqxtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
+ %vqmovn1.i = tail call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %a) #4
+ ret <4 x i16> %vqmovn1.i
+}
+
+define <2 x i32> @test_vqmovn_s64(<2 x i64> %a) #0 {
+; CHECK: sqxtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
+ %vqmovn1.i = tail call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %a) #4
+ ret <2 x i32> %vqmovn1.i
+}
+
+define <16 x i8> @test_vqmovn_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+; CHECK: sqxtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
+ %vqmovn1.i.i = tail call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %b) #4
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vqmovn1.i.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %shuffle.i
+}
+
+define <8 x i16> @test_vqmovn_high_s32(<4 x i16> %a, <4 x i32> %b) #0 {
+; CHECK: test_vqmovn_high_s32
+ %vqmovn1.i.i = tail call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %b) #4
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vqmovn1.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_vqmovn_high_s64(<2 x i32> %a, <2 x i64> %b) #0 {
+; CHECK: test_vqmovn_high_s64
+ %vqmovn1.i.i = tail call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %b) #4
+ %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vqmovn1.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %shuffle.i
+}
+
+define <8 x i8> @test_vqmovn_u16(<8 x i16> %a) #0 {
+; CHECK: uqxtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
+ %vqmovn1.i = tail call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %a) #4
+ ret <8 x i8> %vqmovn1.i
+}
+
+define <4 x i16> @test_vqmovn_u32(<4 x i32> %a) #0 {
+; CHECK: uqxtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
+ %vqmovn1.i = tail call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %a) #4
+ ret <4 x i16> %vqmovn1.i
+}
+
+define <2 x i32> @test_vqmovn_u64(<2 x i64> %a) #0 {
+; CHECK: uqxtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
+ %vqmovn1.i = tail call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %a) #4
+ ret <2 x i32> %vqmovn1.i
+}
+
+define <16 x i8> @test_vqmovn_high_u16(<8 x i8> %a, <8 x i16> %b) #0 {
+; CHECK: uqxtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
+ %vqmovn1.i.i = tail call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %b) #4
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %vqmovn1.i.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i8> %shuffle.i
+}
+
+define <8 x i16> @test_vqmovn_high_u32(<4 x i16> %a, <4 x i32> %b) #0 {
+; CHECK: uqxtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
+ %vqmovn1.i.i = tail call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %b) #4
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vqmovn1.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x i32> @test_vqmovn_high_u64(<2 x i32> %a, <2 x i64> %b) #0 {
+; CHECK: uqxtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
+ %vqmovn1.i.i = tail call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %b) #4
+ %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %vqmovn1.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %shuffle.i
+}
+
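+; The vshll_n tests below shift by exactly the source element width, which is
+; expected to select the SHLL/SHLL2 forms rather than the ordinary widening shifts.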
+define <8 x i16> @test_vshll_n_s8(<8 x i8> %a) #0 {
+; CHECK: shll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #8
+ %1 = sext <8 x i8> %a to <8 x i16>
+ %vshll_n = shl <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ ret <8 x i16> %vshll_n
+}
+
+define <4 x i32> @test_vshll_n_s16(<4 x i16> %a) #0 {
+; CHECK: shll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #16
+ %1 = sext <4 x i16> %a to <4 x i32>
+ %vshll_n = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
+ ret <4 x i32> %vshll_n
+}
+
+define <2 x i64> @test_vshll_n_s32(<2 x i32> %a) #0 {
+; CHECK: shll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #32
+ %1 = sext <2 x i32> %a to <2 x i64>
+ %vshll_n = shl <2 x i64> %1, <i64 32, i64 32>
+ ret <2 x i64> %vshll_n
+}
+
+define <8 x i16> @test_vshll_n_u8(<8 x i8> %a) #0 {
+; CHECK: shll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #8
+ %1 = zext <8 x i8> %a to <8 x i16>
+ %vshll_n = shl <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ ret <8 x i16> %vshll_n
+}
+
+define <4 x i32> @test_vshll_n_u16(<4 x i16> %a) #0 {
+; CHECK: shll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #16
+ %1 = zext <4 x i16> %a to <4 x i32>
+ %vshll_n = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
+ ret <4 x i32> %vshll_n
+}
+
+define <2 x i64> @test_vshll_n_u32(<2 x i32> %a) #0 {
+; CHECK: shll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #32
+ %1 = zext <2 x i32> %a to <2 x i64>
+ %vshll_n = shl <2 x i64> %1, <i64 32, i64 32>
+ ret <2 x i64> %vshll_n
+}
+
+define <8 x i16> @test_vshll_high_n_s8(<16 x i8> %a) #0 {
+; CHECK: shll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #8
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %1 = sext <8 x i8> %shuffle.i to <8 x i16>
+ %vshll_n = shl <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ ret <8 x i16> %vshll_n
+}
+
+define <4 x i32> @test_vshll_high_n_s16(<8 x i16> %a) #0 {
+; CHECK: shll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #16
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1 = sext <4 x i16> %shuffle.i to <4 x i32>
+ %vshll_n = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
+ ret <4 x i32> %vshll_n
+}
+
+define <2 x i64> @test_vshll_high_n_s32(<4 x i32> %a) #0 {
+; CHECK: shll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #32
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %1 = sext <2 x i32> %shuffle.i to <2 x i64>
+ %vshll_n = shl <2 x i64> %1, <i64 32, i64 32>
+ ret <2 x i64> %vshll_n
+}
+
+define <8 x i16> @test_vshll_high_n_u8(<16 x i8> %a) #0 {
+; CHECK: shll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #8
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %1 = zext <8 x i8> %shuffle.i to <8 x i16>
+ %vshll_n = shl <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ ret <8 x i16> %vshll_n
+}
+
+define <4 x i32> @test_vshll_high_n_u16(<8 x i16> %a) #0 {
+; CHECK: shll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #16
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %1 = zext <4 x i16> %shuffle.i to <4 x i32>
+ %vshll_n = shl <4 x i32> %1, <i32 16, i32 16, i32 16, i32 16>
+ ret <4 x i32> %vshll_n
+}
+
+define <2 x i64> @test_vshll_high_n_u32(<4 x i32> %a) #0 {
+; CHECK: shll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #32
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %1 = zext <2 x i32> %shuffle.i to <2 x i64>
+ %vshll_n = shl <2 x i64> %1, <i64 32, i64 32>
+ ret <2 x i64> %vshll_n
+}
+
+define <4 x i16> @test_vcvt_f16_f32(<4 x float> %a) #0 {
+; CHECK: fcvtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
+ %vcvt1.i = tail call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %a) #4
+ ret <4 x i16> %vcvt1.i
+}
+
+define <8 x i16> @test_vcvt_high_f16_f32(<4 x i16> %a, <4 x float> %b) #0 {
+; CHECK: fcvtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
+ %vcvt1.i.i = tail call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %b) #4
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %vcvt1.i.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %shuffle.i
+}
+
+define <4 x float> @test_vcvt_f32_f16(<4 x i16> %a) #0 {
+; CHECK: fcvtl v{{[0-9]+}}.4s, v{{[0-9]+}}.4h
+ %vcvt1.i = tail call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %a) #4
+ ret <4 x float> %vcvt1.i
+}
+
+define <4 x float> @test_vcvt_high_f32_f16(<8 x i16> %a) #0 {
+; CHECK: fcvtl2 v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
+ %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %vcvt1.i.i = tail call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %shuffle.i.i) #4
+ ret <4 x float> %vcvt1.i.i
+}
+
+define <2 x float> @test_vcvt_f32_f64(<2 x double> %a) #0 {
+; CHECK: fcvtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
+ %vcvt.i = fptrunc <2 x double> %a to <2 x float>
+ ret <2 x float> %vcvt.i
+}
+
+define <4 x float> @test_vcvt_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 {
+; CHECK: fcvtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
+ %vcvt.i.i = fptrunc <2 x double> %b to <2 x float>
+ %shuffle.i = shufflevector <2 x float> %a, <2 x float> %vcvt.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x float> %shuffle.i
+}
+
+define <2 x float> @test_vcvtx_f32_f64(<2 x double> %a) #0 {
+; CHECK: fcvtxn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
+ %vcvtx_f32_f641.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %a) #4
+ ret <2 x float> %vcvtx_f32_f641.i
+}
+
+define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 {
+; CHECK: fcvtxn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
+ %vcvtx_f32_f641.i.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %b) #4
+ %shuffle.i = shufflevector <2 x float> %a, <2 x float> %vcvtx_f32_f641.i.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x float> %shuffle.i
+}
+
+define <2 x double> @test_vcvt_f64_f32(<2 x float> %a) #0 {
+; CHECK: fcvtl v{{[0-9]+}}.2d, v{{[0-9]+}}.2s
+ %vcvt.i = fpext <2 x float> %a to <2 x double>
+ ret <2 x double> %vcvt.i
+}
+
+define <2 x double> @test_vcvt_high_f64_f32(<4 x float> %a) #0 {
+; CHECK: fcvtl2 v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
+ %shuffle.i.i = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+ %vcvt.i.i = fpext <2 x float> %shuffle.i.i to <2 x double>
+ ret <2 x double> %vcvt.i.i
+}
+
+define <2 x float> @test_vrndn_f32(<2 x float> %a) #0 {
+; CHECK: frintn v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vrndn1.i = tail call <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float> %a) #4
+ ret <2 x float> %vrndn1.i
+}
+
+define <4 x float> @test_vrndnq_f32(<4 x float> %a) #0 {
+; CHECK: frintn v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vrndn1.i = tail call <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float> %a) #4
+ ret <4 x float> %vrndn1.i
+}
+
+define <2 x double> @test_vrndnq_f64(<2 x double> %a) #0 {
+; CHECK: frintn v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vrndn1.i = tail call <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double> %a) #4
+ ret <2 x double> %vrndn1.i
+}
+
+define <2 x float> @test_vrnda_f32(<2 x float> %a) #0 {
+; CHECK: frinta v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vrnda1.i = tail call <2 x float> @llvm.round.v2f32(<2 x float> %a) #4
+ ret <2 x float> %vrnda1.i
+}
+
+define <4 x float> @test_vrndaq_f32(<4 x float> %a) #0 {
+; CHECK: frinta v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vrnda1.i = tail call <4 x float> @llvm.round.v4f32(<4 x float> %a) #4
+ ret <4 x float> %vrnda1.i
+}
+
+define <2 x double> @test_vrndaq_f64(<2 x double> %a) #0 {
+; CHECK: frinta v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vrnda1.i = tail call <2 x double> @llvm.round.v2f64(<2 x double> %a) #4
+ ret <2 x double> %vrnda1.i
+}
+
+define <2 x float> @test_vrndp_f32(<2 x float> %a) #0 {
+; CHECK: frintp v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vrndp1.i = tail call <2 x float> @llvm.ceil.v2f32(<2 x float> %a) #4
+ ret <2 x float> %vrndp1.i
+}
+
+define <4 x float> @test_vrndpq_f32(<4 x float> %a) #0 {
+; CHECK: frintp v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vrndp1.i = tail call <4 x float> @llvm.ceil.v4f32(<4 x float> %a) #4
+ ret <4 x float> %vrndp1.i
+}
+
+define <2 x double> @test_vrndpq_f64(<2 x double> %a) #0 {
+; CHECK: frintp v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vrndp1.i = tail call <2 x double> @llvm.ceil.v2f64(<2 x double> %a) #4
+ ret <2 x double> %vrndp1.i
+}
+
+define <2 x float> @test_vrndm_f32(<2 x float> %a) #0 {
+; CHECK: frintm v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vrndm1.i = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %a) #4
+ ret <2 x float> %vrndm1.i
+}
+
+define <4 x float> @test_vrndmq_f32(<4 x float> %a) #0 {
+; CHECK: frintm v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vrndm1.i = tail call <4 x float> @llvm.floor.v4f32(<4 x float> %a) #4
+ ret <4 x float> %vrndm1.i
+}
+
+define <2 x double> @test_vrndmq_f64(<2 x double> %a) #0 {
+; CHECK: frintm v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vrndm1.i = tail call <2 x double> @llvm.floor.v2f64(<2 x double> %a) #4
+ ret <2 x double> %vrndm1.i
+}
+
+define <2 x float> @test_vrndx_f32(<2 x float> %a) #0 {
+; CHECK: frintx v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vrndx1.i = tail call <2 x float> @llvm.rint.v2f32(<2 x float> %a) #4
+ ret <2 x float> %vrndx1.i
+}
+
+define <4 x float> @test_vrndxq_f32(<4 x float> %a) #0 {
+; CHECK: frintx v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vrndx1.i = tail call <4 x float> @llvm.rint.v4f32(<4 x float> %a) #4
+ ret <4 x float> %vrndx1.i
+}
+
+define <2 x double> @test_vrndxq_f64(<2 x double> %a) #0 {
+; CHECK: frintx v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vrndx1.i = tail call <2 x double> @llvm.rint.v2f64(<2 x double> %a) #4
+ ret <2 x double> %vrndx1.i
+}
+
+define <2 x float> @test_vrnd_f32(<2 x float> %a) #0 {
+; CHECK: frintz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vrnd1.i = tail call <2 x float> @llvm.trunc.v2f32(<2 x float> %a) #4
+ ret <2 x float> %vrnd1.i
+}
+
+define <4 x float> @test_vrndq_f32(<4 x float> %a) #0 {
+; CHECK: frintz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vrnd1.i = tail call <4 x float> @llvm.trunc.v4f32(<4 x float> %a) #4
+ ret <4 x float> %vrnd1.i
+}
+
+define <2 x double> @test_vrndq_f64(<2 x double> %a) #0 {
+; CHECK: frintz v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vrnd1.i = tail call <2 x double> @llvm.trunc.v2f64(<2 x double> %a) #4
+ ret <2 x double> %vrnd1.i
+}
+
+define <2 x float> @test_vrndi_f32(<2 x float> %a) #0 {
+; CHECK: frinti v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vrndi1.i = tail call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %a) #4
+ ret <2 x float> %vrndi1.i
+}
+
+define <4 x float> @test_vrndiq_f32(<4 x float> %a) #0 {
+; CHECK: frinti v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vrndi1.i = tail call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a) #4
+ ret <4 x float> %vrndi1.i
+}
+
+define <2 x double> @test_vrndiq_f64(<2 x double> %a) #0 {
+; CHECK: frinti v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vrndi1.i = tail call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a) #4
+ ret <2 x double> %vrndi1.i
+}
+
+define <2 x i32> @test_vcvt_s32_f32(<2 x float> %a) #0 {
+; CHECK: fcvtzs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vcvt.i = fptosi <2 x float> %a to <2 x i32>
+ ret <2 x i32> %vcvt.i
+}
+
+define <4 x i32> @test_vcvtq_s32_f32(<4 x float> %a) #0 {
+; CHECK: fcvtzs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vcvt.i = fptosi <4 x float> %a to <4 x i32>
+ ret <4 x i32> %vcvt.i
+}
+
+define <2 x i64> @test_vcvtq_s64_f64(<2 x double> %a) #0 {
+; CHECK: fcvtzs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vcvt.i = fptosi <2 x double> %a to <2 x i64>
+ ret <2 x i64> %vcvt.i
+}
+
+define <2 x i32> @test_vcvt_u32_f32(<2 x float> %a) #0 {
+; CHECK: fcvtzu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vcvt.i = fptoui <2 x float> %a to <2 x i32>
+ ret <2 x i32> %vcvt.i
+}
+
+define <4 x i32> @test_vcvtq_u32_f32(<4 x float> %a) #0 {
+; CHECK: fcvtzu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vcvt.i = fptoui <4 x float> %a to <4 x i32>
+ ret <4 x i32> %vcvt.i
+}
+
+define <2 x i64> @test_vcvtq_u64_f64(<2 x double> %a) #0 {
+; CHECK: fcvtzu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vcvt.i = fptoui <2 x double> %a to <2 x i64>
+ ret <2 x i64> %vcvt.i
+}
+
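+; Conversions with an explicit rounding mode (to-nearest, toward +Inf, toward -Inf,
+; ties-away) are expected to use fcvtns/fcvtnu, fcvtps/fcvtpu, fcvtms/fcvtmu and
+; fcvtas/fcvtau respectively.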
+define <2 x i32> @test_vcvtn_s32_f32(<2 x float> %a) #0 {
+; CHECK: fcvtns v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vcvtns_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> %a) #4
+ ret <2 x i32> %vcvtns_f321.i
+}
+
+define <4 x i32> @test_vcvtnq_s32_f32(<4 x float> %a) #0 {
+; CHECK: fcvtns v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vcvtns_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> %a) #4
+ ret <4 x i32> %vcvtns_f321.i
+}
+
+define <2 x i64> @test_vcvtnq_s64_f64(<2 x double> %a) #0 {
+; CHECK: fcvtns v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vcvtns_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> %a) #4
+ ret <2 x i64> %vcvtns_f641.i
+}
+
+define <2 x i32> @test_vcvtn_u32_f32(<2 x float> %a) #0 {
+; CHECK: fcvtnu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vcvtnu_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> %a) #4
+ ret <2 x i32> %vcvtnu_f321.i
+}
+
+define <4 x i32> @test_vcvtnq_u32_f32(<4 x float> %a) #0 {
+; CHECK: fcvtnu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vcvtnu_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> %a) #4
+ ret <4 x i32> %vcvtnu_f321.i
+}
+
+define <2 x i64> @test_vcvtnq_u64_f64(<2 x double> %a) #0 {
+; CHECK: fcvtnu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vcvtnu_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> %a) #4
+ ret <2 x i64> %vcvtnu_f641.i
+}
+
+define <2 x i32> @test_vcvtp_s32_f32(<2 x float> %a) #0 {
+; CHECK: fcvtps v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vcvtps_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> %a) #4
+ ret <2 x i32> %vcvtps_f321.i
+}
+
+define <4 x i32> @test_vcvtpq_s32_f32(<4 x float> %a) #0 {
+; CHECK: fcvtps v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vcvtps_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> %a) #4
+ ret <4 x i32> %vcvtps_f321.i
+}
+
+define <2 x i64> @test_vcvtpq_s64_f64(<2 x double> %a) #0 {
+; CHECK: fcvtps v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vcvtps_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> %a) #4
+ ret <2 x i64> %vcvtps_f641.i
+}
+
+define <2 x i32> @test_vcvtp_u32_f32(<2 x float> %a) #0 {
+; CHECK: fcvtpu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vcvtpu_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> %a) #4
+ ret <2 x i32> %vcvtpu_f321.i
+}
+
+define <4 x i32> @test_vcvtpq_u32_f32(<4 x float> %a) #0 {
+; CHECK: fcvtpu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vcvtpu_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> %a) #4
+ ret <4 x i32> %vcvtpu_f321.i
+}
+
+define <2 x i64> @test_vcvtpq_u64_f64(<2 x double> %a) #0 {
+; CHECK: fcvtpu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vcvtpu_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> %a) #4
+ ret <2 x i64> %vcvtpu_f641.i
+}
+
+define <2 x i32> @test_vcvtm_s32_f32(<2 x float> %a) #0 {
+; CHECK: fcvtms v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vcvtms_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> %a) #4
+ ret <2 x i32> %vcvtms_f321.i
+}
+
+define <4 x i32> @test_vcvtmq_s32_f32(<4 x float> %a) #0 {
+; CHECK: fcvtms v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vcvtms_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> %a) #4
+ ret <4 x i32> %vcvtms_f321.i
+}
+
+define <2 x i64> @test_vcvtmq_s64_f64(<2 x double> %a) #0 {
+; CHECK: fcvtms v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vcvtms_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> %a) #4
+ ret <2 x i64> %vcvtms_f641.i
+}
+
+define <2 x i32> @test_vcvtm_u32_f32(<2 x float> %a) #0 {
+; CHECK: fcvtmu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vcvtmu_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> %a) #4
+ ret <2 x i32> %vcvtmu_f321.i
+}
+
+define <4 x i32> @test_vcvtmq_u32_f32(<4 x float> %a) #0 {
+; CHECK: fcvtmu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vcvtmu_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> %a) #4
+ ret <4 x i32> %vcvtmu_f321.i
+}
+
+define <2 x i64> @test_vcvtmq_u64_f64(<2 x double> %a) #0 {
+; CHECK: fcvtmu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vcvtmu_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> %a) #4
+ ret <2 x i64> %vcvtmu_f641.i
+}
+
+define <2 x i32> @test_vcvta_s32_f32(<2 x float> %a) #0 {
+; CHECK: fcvtas v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vcvtas_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> %a) #4
+ ret <2 x i32> %vcvtas_f321.i
+}
+
+define <4 x i32> @test_vcvtaq_s32_f32(<4 x float> %a) #0 {
+; CHECK: fcvtas v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vcvtas_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> %a) #4
+ ret <4 x i32> %vcvtas_f321.i
+}
+
+define <2 x i64> @test_vcvtaq_s64_f64(<2 x double> %a) #0 {
+; CHECK: fcvtas v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vcvtas_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> %a) #4
+ ret <2 x i64> %vcvtas_f641.i
+}
+
+define <2 x i32> @test_vcvta_u32_f32(<2 x float> %a) #0 {
+; CHECK: fcvtau v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vcvtau_f321.i = tail call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> %a) #4
+ ret <2 x i32> %vcvtau_f321.i
+}
+
+define <4 x i32> @test_vcvtaq_u32_f32(<4 x float> %a) #0 {
+; CHECK: fcvtau v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vcvtau_f321.i = tail call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> %a) #4
+ ret <4 x i32> %vcvtau_f321.i
+}
+
+define <2 x i64> @test_vcvtaq_u64_f64(<2 x double> %a) #0 {
+; CHECK: fcvtau v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vcvtau_f641.i = tail call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> %a) #4
+ ret <2 x i64> %vcvtau_f641.i
+}
+
+define <2 x float> @test_vrsqrte_f32(<2 x float> %a) #0 {
+; CHECK: frsqrte v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vrsqrte1.i = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %a) #4
+ ret <2 x float> %vrsqrte1.i
+}
+
+define <4 x float> @test_vrsqrteq_f32(<4 x float> %a) #0 {
+; CHECK: frsqrte v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vrsqrte1.i = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %a) #4
+ ret <4 x float> %vrsqrte1.i
+}
+
+define <2 x double> @test_vrsqrteq_f64(<2 x double> %a) #0 {
+; CHECK: frsqrte v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vrsqrte1.i = tail call <2 x double> @llvm.arm.neon.vrsqrte.v2f64(<2 x double> %a) #4
+ ret <2 x double> %vrsqrte1.i
+}
+
+define <2 x float> @test_vrecpe_f32(<2 x float> %a) #0 {
+; CHECK: frecpe v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vrecpe1.i = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %a) #4
+ ret <2 x float> %vrecpe1.i
+}
+
+define <4 x float> @test_vrecpeq_f32(<4 x float> %a) #0 {
+; CHECK: frecpe v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vrecpe1.i = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %a) #4
+ ret <4 x float> %vrecpe1.i
+}
+
+define <2 x double> @test_vrecpeq_f64(<2 x double> %a) #0 {
+; CHECK: frecpe v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vrecpe1.i = tail call <2 x double> @llvm.arm.neon.vrecpe.v2f64(<2 x double> %a) #4
+ ret <2 x double> %vrecpe1.i
+}
+
+define <2 x i32> @test_vrecpe_u32(<2 x i32> %a) #0 {
+; CHECK: urecpe v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vrecpe1.i = tail call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> %a) #4
+ ret <2 x i32> %vrecpe1.i
+}
+
+define <4 x i32> @test_vrecpeq_u32(<4 x i32> %a) #0 {
+; CHECK: urecpe v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vrecpe1.i = tail call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> %a) #4
+ ret <4 x i32> %vrecpe1.i
+}
+
+define <2 x float> @test_vsqrt_f32(<2 x float> %a) #0 {
+; CHECK: fsqrt v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vsqrt1.i = tail call <2 x float> @llvm.sqrt.v2f32(<2 x float> %a) #4
+ ret <2 x float> %vsqrt1.i
+}
+
+define <4 x float> @test_vsqrtq_f32(<4 x float> %a) #0 {
+; CHECK: fsqrt v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vsqrt1.i = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) #4
+ ret <4 x float> %vsqrt1.i
+}
+
+define <2 x double> @test_vsqrtq_f64(<2 x double> %a) #0 {
+; CHECK: fsqrt v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vsqrt1.i = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) #4
+ ret <2 x double> %vsqrt1.i
+}
+
+define <2 x float> @test_vcvt_f32_s32(<2 x i32> %a) #0 {
+; CHECK: scvtf v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vcvt.i = sitofp <2 x i32> %a to <2 x float>
+ ret <2 x float> %vcvt.i
+}
+
+define <2 x float> @test_vcvt_f32_u32(<2 x i32> %a) #0 {
+; CHECK: ucvtf v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+ %vcvt.i = uitofp <2 x i32> %a to <2 x float>
+ ret <2 x float> %vcvt.i
+}
+
+define <4 x float> @test_vcvtq_f32_s32(<4 x i32> %a) #0 {
+; CHECK: scvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vcvt.i = sitofp <4 x i32> %a to <4 x float>
+ ret <4 x float> %vcvt.i
+}
+
+define <4 x float> @test_vcvtq_f32_u32(<4 x i32> %a) #0 {
+; CHECK: ucvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+ %vcvt.i = uitofp <4 x i32> %a to <4 x float>
+ ret <4 x float> %vcvt.i
+}
+
+define <2 x double> @test_vcvtq_f64_s64(<2 x i64> %a) #0 {
+; CHECK: scvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vcvt.i = sitofp <2 x i64> %a to <2 x double>
+ ret <2 x double> %vcvt.i
+}
+
+define <2 x double> @test_vcvtq_f64_u64(<2 x i64> %a) #0 {
+; CHECK: ucvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
+ %vcvt.i = uitofp <2 x i64> %a to <2 x double>
+ ret <2 x double> %vcvt.i
+}
+
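+; Declarations for the intrinsics exercised by the tests above.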
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) #2
+
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) #2
+
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) #2
+
+declare <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32>) #2
+
+declare <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32>) #2
+
+declare <2 x double> @llvm.arm.neon.vrecpe.v2f64(<2 x double>) #2
+
+declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) #2
+
+declare <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float>) #2
+
+declare <2 x double> @llvm.arm.neon.vrsqrte.v2f64(<2 x double>) #2
+
+declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) #2
+
+declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) #2
+
+declare <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double>) #2
+
+declare <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float>) #2
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float>) #2
+
+declare <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double>) #2
+
+declare <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float>) #2
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float>) #2
+
+declare <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double>) #2
+
+declare <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float>) #2
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float>) #2
+
+declare <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double>) #2
+
+declare <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float>) #2
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float>) #2
+
+declare <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double>) #2
+
+declare <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float>) #2
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float>) #2
+
+declare <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double>) #2
+
+declare <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float>) #2
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float>) #2
+
+declare <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double>) #2
+
+declare <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float>) #2
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float>) #2
+
+declare <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double>) #2
+
+declare <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float>) #2
+
+declare <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float>) #2
+
+declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) #3
+
+declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) #3
+
+declare <2 x float> @llvm.nearbyint.v2f32(<2 x float>) #3
+
+declare <2 x double> @llvm.trunc.v2f64(<2 x double>) #3
+
+declare <4 x float> @llvm.trunc.v4f32(<4 x float>) #3
+
+declare <2 x float> @llvm.trunc.v2f32(<2 x float>) #3
+
+declare <2 x double> @llvm.rint.v2f64(<2 x double>) #3
+
+declare <4 x float> @llvm.rint.v4f32(<4 x float>) #3
+
+declare <2 x float> @llvm.rint.v2f32(<2 x float>) #3
+
+declare <2 x double> @llvm.floor.v2f64(<2 x double>) #3
+
+declare <4 x float> @llvm.floor.v4f32(<4 x float>) #3
+
+declare <2 x float> @llvm.floor.v2f32(<2 x float>) #3
+
+declare <2 x double> @llvm.ceil.v2f64(<2 x double>) #3
+
+declare <4 x float> @llvm.ceil.v4f32(<4 x float>) #3
+
+declare <2 x float> @llvm.ceil.v2f32(<2 x float>) #3
+
+declare <2 x double> @llvm.round.v2f64(<2 x double>) #3
+
+declare <4 x float> @llvm.round.v4f32(<4 x float>) #3
+
+declare <2 x float> @llvm.round.v2f32(<2 x float>) #3
+
+declare <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double>) #2
+
+declare <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float>) #2
+
+declare <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float>) #2
+
+declare <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double>) #2
+
+declare <2 x float> @llvm.aarch64.neon.fcvtn.v2f32.v2f64(<2 x double>) #2
+
+declare <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64>) #2
+
+declare <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32>) #2
+
+declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) #2
+
+declare <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64>) #2
+
+declare <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32>) #2
+
+declare <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16>) #2
+
+declare <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64>) #2
+
+declare <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32>) #2
+
+declare <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16>) #2
+
+declare <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8>) #2
+
+declare <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8>) #2
+
+declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) #2
+
+declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>) #2
+
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) #2
+
+declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) #2
+
+declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1) #2
+
+declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) #2
+
+declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1) #2
+
+declare <8 x i8> @llvm.ctlz.v8i8(<8 x i8>, i1) #2
+
+declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) #2
+
+declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) #2
+
+declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) #2
+
+declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) #2
+
+declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) #2
+
+declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) #2
+
+declare <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64>, <2 x i64>) #2
+
+declare <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32>, <4 x i32>) #2
+
+declare <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32>, <2 x i32>) #2
+
+declare <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16>, <8 x i16>) #2
+
+declare <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16>, <4 x i16>) #2
+
+declare <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8>, <16 x i8>) #2
+
+declare <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8>, <8 x i8>) #2
+
+declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #3
+
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #3
+
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>) #3
+
+declare <2 x i64> @llvm.arm.neon.vabs.v2i64(<2 x i64>) #2
+
+declare <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32>) #2
+
+declare <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32>) #2
+
+declare <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16>) #2
+
+declare <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16>) #2
+
+declare <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8>) #2
+
+declare <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8>) #2
+
+declare <2 x i64> @llvm.arm.neon.vqneg.v2i64(<2 x i64>) #2
+
+declare <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32>) #2
+
+declare <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32>) #2
+
+declare <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16>) #2
+
+declare <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16>) #2
+
+declare <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8>) #2
+
+declare <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8>) #2
+
+declare <2 x i64> @llvm.arm.neon.vqabs.v2i64(<2 x i64>) #2
+
+declare <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32>) #2
+
+declare <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32>) #2
+
+declare <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16>) #2
+
+declare <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16>) #2
+
+declare <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8>) #2
+
+declare <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8>) #2
+
+declare <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64>, <4 x i32>) #2
+
+declare <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32>, <8 x i16>) #2
+
+declare <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16>, <16 x i8>) #2
+
+declare <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64>, <4 x i32>) #2
+
+declare <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32>, <8 x i16>) #2
+
+declare <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16>, <16 x i8>) #2
+
+declare <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64>, <2 x i32>) #2
+
+declare <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32>, <4 x i16>) #2
+
+declare <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16>, <8 x i8>) #2
+
+declare <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64>, <2 x i32>) #2
+
+declare <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32>, <4 x i16>) #2
+
+declare <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16>, <8 x i8>) #2
+
+declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) #2
+
+declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) #2
+
+declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) #2
+
+declare <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32>) #2
+
+declare <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16>) #2
+
+declare <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8>) #2
+
+declare <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32>) #2
+
+declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) #2
+
+declare <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8>) #2
+
+declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) #2
+
+declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) #2
+
+declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) #2
+
+declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) #2
+
+declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) #2
+
+
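+; The <1 x i64>/<1 x double> variants below are expected to use the scalar
+; D-register forms of the conversion, rounding and estimate instructions.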
+define <1 x i64> @test_vcvt_s64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvt_s64_f64
+; CHECK: fcvtzs d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = fptosi <1 x double> %a to <1 x i64>
+ ret <1 x i64> %1
+}
+
+define <1 x i64> @test_vcvt_u64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvt_u64_f64
+; CHECK: fcvtzu d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = fptoui <1 x double> %a to <1 x i64>
+ ret <1 x i64> %1
+}
+
+define <1 x i64> @test_vcvtn_s64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvtn_s64_f64
+; CHECK: fcvtns d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> %a)
+ ret <1 x i64> %1
+}
+
+define <1 x i64> @test_vcvtn_u64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvtn_u64_f64
+; CHECK: fcvtnu d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> %a)
+ ret <1 x i64> %1
+}
+
+define <1 x i64> @test_vcvtp_s64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvtp_s64_f64
+; CHECK: fcvtps d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> %a)
+ ret <1 x i64> %1
+}
+
+define <1 x i64> @test_vcvtp_u64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvtp_u64_f64
+; CHECK: fcvtpu d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> %a)
+ ret <1 x i64> %1
+}
+
+define <1 x i64> @test_vcvtm_s64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvtm_s64_f64
+; CHECK: fcvtms d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> %a)
+ ret <1 x i64> %1
+}
+
+define <1 x i64> @test_vcvtm_u64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvtm_u64_f64
+; CHECK: fcvtmu d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> %a)
+ ret <1 x i64> %1
+}
+
+define <1 x i64> @test_vcvta_s64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvta_s64_f64
+; CHECK: fcvtas d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> %a)
+ ret <1 x i64> %1
+}
+
+define <1 x i64> @test_vcvta_u64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvta_u64_f64
+; CHECK: fcvtau d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> %a)
+ ret <1 x i64> %1
+}
+
+define <1 x double> @test_vcvt_f64_s64(<1 x i64> %a) {
+; CHECK-LABEL: test_vcvt_f64_s64
+; CHECK: scvtf d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = sitofp <1 x i64> %a to <1 x double>
+ ret <1 x double> %1
+}
+
+define <1 x double> @test_vcvt_f64_u64(<1 x i64> %a) {
+; CHECK-LABEL: test_vcvt_f64_u64
+; CHECK: ucvtf d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = uitofp <1 x i64> %a to <1 x double>
+ ret <1 x double> %1
+}
+
+declare <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double>)
+
+define <1 x double> @test_vrndn_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vrndn_f64
+; CHECK: frintn d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x double> @llvm.aarch64.neon.frintn.v1f64(<1 x double> %a)
+ ret <1 x double> %1
+}
+
+define <1 x double> @test_vrnda_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vrnda_f64
+; CHECK: frinta d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x double> @llvm.round.v1f64(<1 x double> %a)
+ ret <1 x double> %1
+}
+
+define <1 x double> @test_vrndp_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vrndp_f64
+; CHECK: frintp d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x double> @llvm.ceil.v1f64(<1 x double> %a)
+ ret <1 x double> %1
+}
+
+define <1 x double> @test_vrndm_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vrndm_f64
+; CHECK: frintm d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x double> @llvm.floor.v1f64(<1 x double> %a)
+ ret <1 x double> %1
+}
+
+define <1 x double> @test_vrndx_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vrndx_f64
+; CHECK: frintx d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x double> @llvm.rint.v1f64(<1 x double> %a)
+ ret <1 x double> %1
+}
+
+define <1 x double> @test_vrnd_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vrnd_f64
+; CHECK: frintz d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x double> @llvm.trunc.v1f64(<1 x double> %a)
+ ret <1 x double> %1
+}
+
+define <1 x double> @test_vrndi_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vrndi_f64
+; CHECK: frinti d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %a)
+ ret <1 x double> %1
+}
+
+declare <1 x double> @llvm.nearbyint.v1f64(<1 x double>)
+declare <1 x double> @llvm.trunc.v1f64(<1 x double>)
+declare <1 x double> @llvm.rint.v1f64(<1 x double>)
+declare <1 x double> @llvm.floor.v1f64(<1 x double>)
+declare <1 x double> @llvm.ceil.v1f64(<1 x double>)
+declare <1 x double> @llvm.round.v1f64(<1 x double>)
+declare <1 x double> @llvm.aarch64.neon.frintn.v1f64(<1 x double>)
+
+define <1 x double> @test_vrsqrte_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vrsqrte_f64
+; CHECK: frsqrte d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double> %a)
+ ret <1 x double> %1
+}
+
+define <1 x double> @test_vrecpe_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vrecpe_f64
+; CHECK: frecpe d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double> %a)
+ ret <1 x double> %1
+}
+
+define <1 x double> @test_vsqrt_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vsqrt_f64
+; CHECK: fsqrt d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x double> @llvm.sqrt.v1f64(<1 x double> %a)
+ ret <1 x double> %1
+}
+
+define <1 x double> @test_vrecps_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_vrecps_f64
+; CHECK: frecps d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double> %a, <1 x double> %b)
+ ret <1 x double> %1
+}
+
+define <1 x double> @test_vrsqrts_f64(<1 x double> %a, <1 x double> %b) {
+; CHECK-LABEL: test_vrsqrts_f64
+; CHECK: frsqrts d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+ %1 = tail call <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double> %a, <1 x double> %b)
+ ret <1 x double> %1
+}
+
+declare <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.sqrt.v1f64(<1 x double>)
+declare <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double>)
+declare <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double>)
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-mov.ll b/test/CodeGen/AArch64/neon-mov.ll
index 42f6a89..60b13b8 100644
--- a/test/CodeGen/AArch64/neon-mov.ll
+++ b/test/CodeGen/AArch64/neon-mov.ll
@@ -202,4 +202,16 @@ define <2 x double> @fmov2d() {
ret <2 x double> < double -1.2e1, double -1.2e1>
}
+define <2 x i32> @movi1d_1() {
+; CHECK: movi d0, #0xffffffff0000
+ ret <2 x i32> < i32 -65536, i32 65535>
+}
+
+
+declare <2 x i32> @test_movi1d(<2 x i32>, <2 x i32>)
+define <2 x i32> @movi1d() {
+; CHECK: movi d1, #0xffffffff0000
+ %1 = tail call <2 x i32> @test_movi1d(<2 x i32> <i32 -2147483648, i32 2147450880>, <2 x i32> <i32 -65536, i32 65535>)
+ ret <2 x i32> %1
+}
diff --git a/test/CodeGen/AArch64/neon-perm.ll b/test/CodeGen/AArch64/neon-perm.ll
new file mode 100644
index 0000000..fa4d54d
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-perm.ll
@@ -0,0 +1,1693 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+%struct.int8x8x2_t = type { [2 x <8 x i8>] }
+%struct.int16x4x2_t = type { [2 x <4 x i16>] }
+%struct.int32x2x2_t = type { [2 x <2 x i32>] }
+%struct.uint8x8x2_t = type { [2 x <8 x i8>] }
+%struct.uint16x4x2_t = type { [2 x <4 x i16>] }
+%struct.uint32x2x2_t = type { [2 x <2 x i32>] }
+%struct.float32x2x2_t = type { [2 x <2 x float>] }
+%struct.poly8x8x2_t = type { [2 x <8 x i8>] }
+%struct.poly16x4x2_t = type { [2 x <4 x i16>] }
+%struct.int8x16x2_t = type { [2 x <16 x i8>] }
+%struct.int16x8x2_t = type { [2 x <8 x i16>] }
+%struct.int32x4x2_t = type { [2 x <4 x i32>] }
+%struct.uint8x16x2_t = type { [2 x <16 x i8>] }
+%struct.uint16x8x2_t = type { [2 x <8 x i16>] }
+%struct.uint32x4x2_t = type { [2 x <4 x i32>] }
+%struct.float32x4x2_t = type { [2 x <4 x float>] }
+%struct.poly8x16x2_t = type { [2 x <16 x i8>] }
+%struct.poly16x8x2_t = type { [2 x <8 x i16>] }
+
+define <8 x i8> @test_vuzp1_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vuzp1_s8:
+; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vuzp1q_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vuzp1q_s8:
+; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vuzp1_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vuzp1_s16:
+; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vuzp1q_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vuzp1q_s16:
+; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vuzp1_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vuzp1_s32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+ ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vuzp1q_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vuzp1q_s32:
+; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vuzp1q_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vuzp1q_s64:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+ ret <2 x i64> %shuffle.i
+}
+
+define <8 x i8> @test_vuzp1_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vuzp1_u8:
+; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vuzp1q_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vuzp1q_u8:
+; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vuzp1_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vuzp1_u16:
+; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vuzp1q_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vuzp1q_u16:
+; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vuzp1_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vuzp1_u32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+ ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vuzp1q_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vuzp1q_u32:
+; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vuzp1q_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vuzp1q_u64:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+ ret <2 x i64> %shuffle.i
+}
+
+define <2 x float> @test_vuzp1_f32(<2 x float> %a, <2 x float> %b) {
+; CHECK: test_vuzp1_f32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+ ret <2 x float> %shuffle.i
+}
+
+define <4 x float> @test_vuzp1q_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK: test_vuzp1q_f32:
+; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x float> %shuffle.i
+}
+
+define <2 x double> @test_vuzp1q_f64(<2 x double> %a, <2 x double> %b) {
+; CHECK: test_vuzp1q_f64:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
+ ret <2 x double> %shuffle.i
+}
+
+define <8 x i8> @test_vuzp1_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vuzp1_p8:
+; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vuzp1q_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vuzp1q_p8:
+; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vuzp1_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vuzp1_p16:
+; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vuzp1q_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vuzp1q_p16:
+; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ ret <8 x i16> %shuffle.i
+}
+
+define <8 x i8> @test_vuzp2_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vuzp2_s8:
+; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vuzp2q_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vuzp2q_s8:
+; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vuzp2_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vuzp2_s16:
+; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vuzp2q_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vuzp2q_s16:
+; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vuzp2_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vuzp2_s32:
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+ %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+ ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vuzp2q_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vuzp2q_s32:
+; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vuzp2q_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vuzp2q_s64:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+ ret <2 x i64> %shuffle.i
+}
+
+define <8 x i8> @test_vuzp2_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vuzp2_u8:
+; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vuzp2q_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vuzp2q_u8:
+; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vuzp2_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vuzp2_u16:
+; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vuzp2q_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vuzp2q_u16:
+; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vuzp2_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vuzp2_u32:
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+ %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+ ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vuzp2q_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vuzp2q_u32:
+; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vuzp2q_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vuzp2q_u64:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+ ret <2 x i64> %shuffle.i
+}
+
+define <2 x float> @test_vuzp2_f32(<2 x float> %a, <2 x float> %b) {
+; CHECK: test_vuzp2_f32:
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+ %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+ ret <2 x float> %shuffle.i
+}
+
+define <4 x float> @test_vuzp2q_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK: test_vuzp2q_f32:
+; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ ret <4 x float> %shuffle.i
+}
+
+define <2 x double> @test_vuzp2q_f64(<2 x double> %a, <2 x double> %b) {
+; CHECK: test_vuzp2q_f64:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+; CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
+ ret <2 x double> %shuffle.i
+}
+
+define <8 x i8> @test_vuzp2_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vuzp2_p8:
+; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vuzp2q_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vuzp2q_p8:
+; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vuzp2_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vuzp2_p16:
+; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vuzp2q_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vuzp2q_p16:
+; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <8 x i8> @test_vzip1_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vzip1_s8:
+; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vzip1q_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vzip1q_s8:
+; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vzip1_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vzip1_s16:
+; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vzip1q_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vzip1q_s16:
+; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vzip1_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vzip1_s32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+ ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vzip1q_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vzip1q_s32:
+; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vzip1q_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vzip1q_s64:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+ ret <2 x i64> %shuffle.i
+}
+
+define <8 x i8> @test_vzip1_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vzip1_u8:
+; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vzip1q_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vzip1q_u8:
+; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vzip1_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vzip1_u16:
+; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vzip1q_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vzip1q_u16:
+; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vzip1_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vzip1_u32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+ ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vzip1q_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vzip1q_u32:
+; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vzip1q_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vzip1q_u64:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+ ret <2 x i64> %shuffle.i
+}
+
+define <2 x float> @test_vzip1_f32(<2 x float> %a, <2 x float> %b) {
+; CHECK: test_vzip1_f32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+ ret <2 x float> %shuffle.i
+}
+
+define <4 x float> @test_vzip1q_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK: test_vzip1q_f32:
+; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x float> %shuffle.i
+}
+
+define <2 x double> @test_vzip1q_f64(<2 x double> %a, <2 x double> %b) {
+; CHECK: test_vzip1q_f64:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
+ ret <2 x double> %shuffle.i
+}
+
+define <8 x i8> @test_vzip1_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vzip1_p8:
+; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vzip1q_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vzip1q_p8:
+; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vzip1_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vzip1_p16:
+; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vzip1q_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vzip1q_p16:
+; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ret <8 x i16> %shuffle.i
+}
+
+define <8 x i8> @test_vzip2_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vzip2_s8:
+; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vzip2q_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vzip2q_s8:
+; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vzip2_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vzip2_s16:
+; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vzip2q_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vzip2q_s16:
+; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vzip2_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vzip2_s32:
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+ %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+ ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vzip2q_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vzip2q_s32:
+; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vzip2q_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vzip2q_s64:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+entry:
+ %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+ ret <2 x i64> %shuffle.i
+}
+
+define <8 x i8> @test_vzip2_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vzip2_u8:
+; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vzip2q_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vzip2q_u8:
+; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vzip2_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vzip2_u16:
+; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vzip2q_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vzip2q_u16:
+; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vzip2_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vzip2_u32:
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+ %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+ ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vzip2q_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vzip2q_u32:
+; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vzip2q_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vzip2q_u64:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+entry:
+ %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+ ret <2 x i64> %shuffle.i
+}
+
+define <2 x float> @test_vzip2_f32(<2 x float> %a, <2 x float> %b) {
+; CHECK: test_vzip2_f32:
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+ %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+ ret <2 x float> %shuffle.i
+}
+
+define <4 x float> @test_vzip2q_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK: test_vzip2q_f32:
+; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x float> %shuffle.i
+}
+
+define <2 x double> @test_vzip2q_f64(<2 x double> %a, <2 x double> %b) {
+; CHECK: test_vzip2q_f64:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+entry:
+ %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
+ ret <2 x double> %shuffle.i
+}
+
+define <8 x i8> @test_vzip2_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vzip2_p8:
+; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vzip2q_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vzip2q_p8:
+; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vzip2_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vzip2_p16:
+; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vzip2q_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vzip2q_p16:
+; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <8 x i8> @test_vtrn1_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtrn1_s8:
+; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vtrn1q_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vtrn1q_s8:
+; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vtrn1_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vtrn1_s16:
+; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vtrn1q_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vtrn1q_s16:
+; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vtrn1_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vtrn1_s32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+ ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vtrn1q_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vtrn1q_s32:
+; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vtrn1q_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vtrn1q_s64:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+ ret <2 x i64> %shuffle.i
+}
+
+define <8 x i8> @test_vtrn1_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtrn1_u8:
+; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vtrn1q_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vtrn1q_u8:
+; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vtrn1_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vtrn1_u16:
+; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vtrn1q_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vtrn1q_u16:
+; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vtrn1_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vtrn1_u32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+ ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vtrn1q_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vtrn1q_u32:
+; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vtrn1q_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vtrn1q_u64:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+ ret <2 x i64> %shuffle.i
+}
+
+define <2 x float> @test_vtrn1_f32(<2 x float> %a, <2 x float> %b) {
+; CHECK: test_vtrn1_f32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+entry:
+ %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+ ret <2 x float> %shuffle.i
+}
+
+define <4 x float> @test_vtrn1q_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK: test_vtrn1q_f32:
+; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x float> %shuffle.i
+}
+
+define <2 x double> @test_vtrn1q_f64(<2 x double> %a, <2 x double> %b) {
+; CHECK: test_vtrn1q_f64:
+; CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
+entry:
+ %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
+ ret <2 x double> %shuffle.i
+}
+
+define <8 x i8> @test_vtrn1_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtrn1_p8:
+; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vtrn1q_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vtrn1q_p8:
+; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vtrn1_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vtrn1_p16:
+; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vtrn1q_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vtrn1q_p16:
+; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ret <8 x i16> %shuffle.i
+}
+
+define <8 x i8> @test_vtrn2_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtrn2_s8:
+; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vtrn2q_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vtrn2q_s8:
+; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vtrn2_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vtrn2_s16:
+; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vtrn2q_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vtrn2q_s16:
+; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vtrn2_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vtrn2_s32:
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+ %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+ ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vtrn2q_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vtrn2q_s32:
+; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vtrn2q_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vtrn2q_s64:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+entry:
+ %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+ ret <2 x i64> %shuffle.i
+}
+
+define <8 x i8> @test_vtrn2_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtrn2_u8:
+; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vtrn2q_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vtrn2q_u8:
+; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vtrn2_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vtrn2_u16:
+; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vtrn2q_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vtrn2q_u16:
+; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vtrn2_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vtrn2_u32:
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+ %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+ ret <2 x i32> %shuffle.i
+}
+
+define <4 x i32> @test_vtrn2q_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vtrn2q_u32:
+; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x i32> %shuffle.i
+}
+
+define <2 x i64> @test_vtrn2q_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vtrn2q_u64:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+entry:
+ %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+ ret <2 x i64> %shuffle.i
+}
+
+define <2 x float> @test_vtrn2_f32(<2 x float> %a, <2 x float> %b) {
+; CHECK: test_vtrn2_f32:
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+ %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+ ret <2 x float> %shuffle.i
+}
+
+define <4 x float> @test_vtrn2q_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK: test_vtrn2q_f32:
+; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %shuffle.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x float> %shuffle.i
+}
+
+define <2 x double> @test_vtrn2q_f64(<2 x double> %a, <2 x double> %b) {
+; CHECK: test_vtrn2q_f64:
+; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+entry:
+ %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
+ ret <2 x double> %shuffle.i
+}
+
+define <8 x i8> @test_vtrn2_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtrn2_p8:
+; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %shuffle.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <16 x i8> @test_vtrn2q_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vtrn2q_p8:
+; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+ ret <16 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vtrn2_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vtrn2_p16:
+; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %shuffle.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <8 x i16> @test_vtrn2q_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vtrn2q_p16:
+; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ret <8 x i16> %shuffle.i
+}
+
+define %struct.int8x8x2_t @test_vuzp_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vuzp_s8:
+; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vuzp.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %vuzp1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
+ ret %struct.int8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x4x2_t @test_vuzp_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vuzp_s16:
+; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vuzp.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %vuzp1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vuzp.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vuzp1.i, 0, 1
+ ret %struct.int16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x2x2_t @test_vuzp_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vuzp_s32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+ %vuzp.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+ %vuzp1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+ %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vuzp.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vuzp1.i, 0, 1
+ ret %struct.int32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.uint8x8x2_t @test_vuzp_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vuzp_u8:
+; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vuzp.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %vuzp1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
+ ret %struct.uint8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.uint16x4x2_t @test_vuzp_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vuzp_u16:
+; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vuzp.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %vuzp1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %.fca.0.0.insert = insertvalue %struct.uint16x4x2_t undef, <4 x i16> %vuzp.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.uint16x4x2_t %.fca.0.0.insert, <4 x i16> %vuzp1.i, 0, 1
+ ret %struct.uint16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.uint32x2x2_t @test_vuzp_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vuzp_u32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+ %vuzp.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+ %vuzp1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+ %.fca.0.0.insert = insertvalue %struct.uint32x2x2_t undef, <2 x i32> %vuzp.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.uint32x2x2_t %.fca.0.0.insert, <2 x i32> %vuzp1.i, 0, 1
+ ret %struct.uint32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x2x2_t @test_vuzp_f32(<2 x float> %a, <2 x float> %b) {
+; CHECK: test_vuzp_f32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+ %vuzp.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+ %vuzp1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+ %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vuzp.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vuzp1.i, 0, 1
+ ret %struct.float32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.poly8x8x2_t @test_vuzp_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vuzp_p8:
+; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vuzp.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %vuzp1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %.fca.0.0.insert = insertvalue %struct.poly8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.poly8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
+ ret %struct.poly8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.poly16x4x2_t @test_vuzp_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vuzp_p16:
+; CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vuzp.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %vuzp1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %.fca.0.0.insert = insertvalue %struct.poly16x4x2_t undef, <4 x i16> %vuzp.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.poly16x4x2_t %.fca.0.0.insert, <4 x i16> %vuzp1.i, 0, 1
+ ret %struct.poly16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x16x2_t @test_vuzpq_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vuzpq_s8:
+; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %vuzp.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ %vuzp1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vuzp.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vuzp1.i, 0, 1
+ ret %struct.int8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x8x2_t @test_vuzpq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vuzpq_s16:
+; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vuzp.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %vuzp1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vuzp.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vuzp1.i, 0, 1
+ ret %struct.int16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x4x2_t @test_vuzpq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vuzpq_s32:
+; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vuzp.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %vuzp1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vuzp.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vuzp1.i, 0, 1
+ ret %struct.int32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.uint8x16x2_t @test_vuzpq_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vuzpq_u8:
+; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %vuzp.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ %vuzp1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ %.fca.0.0.insert = insertvalue %struct.uint8x16x2_t undef, <16 x i8> %vuzp.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.uint8x16x2_t %.fca.0.0.insert, <16 x i8> %vuzp1.i, 0, 1
+ ret %struct.uint8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.uint16x8x2_t @test_vuzpq_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vuzpq_u16:
+; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vuzp.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %vuzp1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %.fca.0.0.insert = insertvalue %struct.uint16x8x2_t undef, <8 x i16> %vuzp.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.uint16x8x2_t %.fca.0.0.insert, <8 x i16> %vuzp1.i, 0, 1
+ ret %struct.uint16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.uint32x4x2_t @test_vuzpq_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vuzpq_u32:
+; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vuzp.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %vuzp1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %.fca.0.0.insert = insertvalue %struct.uint32x4x2_t undef, <4 x i32> %vuzp.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.uint32x4x2_t %.fca.0.0.insert, <4 x i32> %vuzp1.i, 0, 1
+ ret %struct.uint32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x4x2_t @test_vuzpq_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK: test_vuzpq_f32:
+; CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vuzp.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %vuzp1.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vuzp.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vuzp1.i, 0, 1
+ ret %struct.float32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.poly8x16x2_t @test_vuzpq_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vuzpq_p8:
+; CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %vuzp.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ %vuzp1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ %.fca.0.0.insert = insertvalue %struct.poly8x16x2_t undef, <16 x i8> %vuzp.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.poly8x16x2_t %.fca.0.0.insert, <16 x i8> %vuzp1.i, 0, 1
+ ret %struct.poly8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.poly16x8x2_t @test_vuzpq_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vuzpq_p16:
+; CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vuzp.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %vuzp1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %.fca.0.0.insert = insertvalue %struct.poly16x8x2_t undef, <8 x i16> %vuzp.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.poly16x8x2_t %.fca.0.0.insert, <8 x i16> %vuzp1.i, 0, 1
+ ret %struct.poly16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x8x2_t @test_vzip_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vzip_s8:
+; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vzip.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ %vzip1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vzip.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vzip1.i, 0, 1
+ ret %struct.int8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x4x2_t @test_vzip_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vzip_s16:
+; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vzip.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %vzip1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vzip.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vzip1.i, 0, 1
+ ret %struct.int16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x2x2_t @test_vzip_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vzip_s32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+ %vzip.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+ %vzip1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+ %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vzip.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vzip1.i, 0, 1
+ ret %struct.int32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.uint8x8x2_t @test_vzip_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vzip_u8:
+; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vzip.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ %vzip1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vzip.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vzip1.i, 0, 1
+ ret %struct.uint8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.uint16x4x2_t @test_vzip_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vzip_u16:
+; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vzip.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %vzip1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %.fca.0.0.insert = insertvalue %struct.uint16x4x2_t undef, <4 x i16> %vzip.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.uint16x4x2_t %.fca.0.0.insert, <4 x i16> %vzip1.i, 0, 1
+ ret %struct.uint16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.uint32x2x2_t @test_vzip_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vzip_u32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+ %vzip.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+ %vzip1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+ %.fca.0.0.insert = insertvalue %struct.uint32x2x2_t undef, <2 x i32> %vzip.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.uint32x2x2_t %.fca.0.0.insert, <2 x i32> %vzip1.i, 0, 1
+ ret %struct.uint32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x2x2_t @test_vzip_f32(<2 x float> %a, <2 x float> %b) {
+; CHECK: test_vzip_f32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+ %vzip.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+ %vzip1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+ %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vzip.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vzip1.i, 0, 1
+ ret %struct.float32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.poly8x8x2_t @test_vzip_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vzip_p8:
+; CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vzip.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ %vzip1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ %.fca.0.0.insert = insertvalue %struct.poly8x8x2_t undef, <8 x i8> %vzip.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.poly8x8x2_t %.fca.0.0.insert, <8 x i8> %vzip1.i, 0, 1
+ ret %struct.poly8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.poly16x4x2_t @test_vzip_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vzip_p16:
+; CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vzip.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %vzip1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %.fca.0.0.insert = insertvalue %struct.poly16x4x2_t undef, <4 x i16> %vzip.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.poly16x4x2_t %.fca.0.0.insert, <4 x i16> %vzip1.i, 0, 1
+ ret %struct.poly16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x16x2_t @test_vzipq_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vzipq_s8:
+; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %vzip.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ %vzip1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vzip.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vzip1.i, 0, 1
+ ret %struct.int8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x8x2_t @test_vzipq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vzipq_s16:
+; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vzip.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ %vzip1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vzip.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vzip1.i, 0, 1
+ ret %struct.int16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x4x2_t @test_vzipq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vzipq_s32:
+; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vzip.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %vzip1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vzip.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vzip1.i, 0, 1
+ ret %struct.int32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.uint8x16x2_t @test_vzipq_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vzipq_u8:
+; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %vzip.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ %vzip1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ %.fca.0.0.insert = insertvalue %struct.uint8x16x2_t undef, <16 x i8> %vzip.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.uint8x16x2_t %.fca.0.0.insert, <16 x i8> %vzip1.i, 0, 1
+ ret %struct.uint8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.uint16x8x2_t @test_vzipq_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vzipq_u16:
+; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vzip.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ %vzip1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ %.fca.0.0.insert = insertvalue %struct.uint16x8x2_t undef, <8 x i16> %vzip.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.uint16x8x2_t %.fca.0.0.insert, <8 x i16> %vzip1.i, 0, 1
+ ret %struct.uint16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.uint32x4x2_t @test_vzipq_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vzipq_u32:
+; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vzip.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %vzip1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %.fca.0.0.insert = insertvalue %struct.uint32x4x2_t undef, <4 x i32> %vzip.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.uint32x4x2_t %.fca.0.0.insert, <4 x i32> %vzip1.i, 0, 1
+ ret %struct.uint32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x4x2_t @test_vzipq_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK: test_vzipq_f32:
+; CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vzip.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ %vzip1.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vzip.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vzip1.i, 0, 1
+ ret %struct.float32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.poly8x16x2_t @test_vzipq_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vzipq_p8:
+; CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %vzip.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ %vzip1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ %.fca.0.0.insert = insertvalue %struct.poly8x16x2_t undef, <16 x i8> %vzip.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.poly8x16x2_t %.fca.0.0.insert, <16 x i8> %vzip1.i, 0, 1
+ ret %struct.poly8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.poly16x8x2_t @test_vzipq_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vzipq_p16:
+; CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vzip.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ %vzip1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ %.fca.0.0.insert = insertvalue %struct.poly16x8x2_t undef, <8 x i16> %vzip.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.poly16x8x2_t %.fca.0.0.insert, <8 x i16> %vzip1.i, 0, 1
+ ret %struct.poly16x8x2_t %.fca.0.1.insert
+}
+
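+; vtrn: trn1 selects the even-indexed elements of each input pair and trn2
+; the odd-indexed elements. As with vuzp/vzip, the two-lane 32-bit cases
+; lower to INS element copies.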
+define %struct.int8x8x2_t @test_vtrn_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtrn_s8:
+; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vtrn.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %vtrn1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vtrn.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vtrn1.i, 0, 1
+ ret %struct.int8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x4x2_t @test_vtrn_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vtrn_s16:
+; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vtrn.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %vtrn1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vtrn.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vtrn1.i, 0, 1
+ ret %struct.int16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x2x2_t @test_vtrn_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vtrn_s32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+ %vtrn.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+ %vtrn1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+ %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vtrn.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vtrn1.i, 0, 1
+ ret %struct.int32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.uint8x8x2_t @test_vtrn_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtrn_u8:
+; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vtrn.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %vtrn1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vtrn.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vtrn1.i, 0, 1
+ ret %struct.uint8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.uint16x4x2_t @test_vtrn_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vtrn_u16:
+; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vtrn.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %vtrn1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %.fca.0.0.insert = insertvalue %struct.uint16x4x2_t undef, <4 x i16> %vtrn.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.uint16x4x2_t %.fca.0.0.insert, <4 x i16> %vtrn1.i, 0, 1
+ ret %struct.uint16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.uint32x2x2_t @test_vtrn_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vtrn_u32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+ %vtrn.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+ %vtrn1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+ %.fca.0.0.insert = insertvalue %struct.uint32x2x2_t undef, <2 x i32> %vtrn.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.uint32x2x2_t %.fca.0.0.insert, <2 x i32> %vtrn1.i, 0, 1
+ ret %struct.uint32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x2x2_t @test_vtrn_f32(<2 x float> %a, <2 x float> %b) {
+; CHECK: test_vtrn_f32:
+; CHECK: ins {{v[0-9]+}}.s[1], {{v[0-9]+}}.s[0]
+; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+entry:
+ %vtrn.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+ %vtrn1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+ %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vtrn.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vtrn1.i, 0, 1
+ ret %struct.float32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.poly8x8x2_t @test_vtrn_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtrn_p8:
+; CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+entry:
+ %vtrn.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %vtrn1.i = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %.fca.0.0.insert = insertvalue %struct.poly8x8x2_t undef, <8 x i8> %vtrn.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.poly8x8x2_t %.fca.0.0.insert, <8 x i8> %vtrn1.i, 0, 1
+ ret %struct.poly8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.poly16x4x2_t @test_vtrn_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vtrn_p16:
+; CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+entry:
+ %vtrn.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %vtrn1.i = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %.fca.0.0.insert = insertvalue %struct.poly16x4x2_t undef, <4 x i16> %vtrn.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.poly16x4x2_t %.fca.0.0.insert, <4 x i16> %vtrn1.i, 0, 1
+ ret %struct.poly16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x16x2_t @test_vtrnq_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vtrnq_s8:
+; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %vtrn.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+ %vtrn1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+ %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vtrn.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vtrn1.i, 0, 1
+ ret %struct.int8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x8x2_t @test_vtrnq_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vtrnq_s16:
+; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vtrn.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %vtrn1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vtrn.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vtrn1.i, 0, 1
+ ret %struct.int16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x4x2_t @test_vtrnq_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vtrnq_s32:
+; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vtrn.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %vtrn1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vtrn.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vtrn1.i, 0, 1
+ ret %struct.int32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.uint8x16x2_t @test_vtrnq_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vtrnq_u8:
+; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %vtrn.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+ %vtrn1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+ %.fca.0.0.insert = insertvalue %struct.uint8x16x2_t undef, <16 x i8> %vtrn.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.uint8x16x2_t %.fca.0.0.insert, <16 x i8> %vtrn1.i, 0, 1
+ ret %struct.uint8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.uint16x8x2_t @test_vtrnq_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vtrnq_u16:
+; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vtrn.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %vtrn1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %.fca.0.0.insert = insertvalue %struct.uint16x8x2_t undef, <8 x i16> %vtrn.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.uint16x8x2_t %.fca.0.0.insert, <8 x i16> %vtrn1.i, 0, 1
+ ret %struct.uint16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.uint32x4x2_t @test_vtrnq_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vtrnq_u32:
+; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vtrn.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %vtrn1.i = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %.fca.0.0.insert = insertvalue %struct.uint32x4x2_t undef, <4 x i32> %vtrn.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.uint32x4x2_t %.fca.0.0.insert, <4 x i32> %vtrn1.i, 0, 1
+ ret %struct.uint32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x4x2_t @test_vtrnq_f32(<4 x float> %a, <4 x float> %b) {
+; CHECK: test_vtrnq_f32:
+; CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+entry:
+ %vtrn.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ %vtrn1.i = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vtrn.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vtrn1.i, 0, 1
+ ret %struct.float32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.poly8x16x2_t @test_vtrnq_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vtrnq_p8:
+; CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+entry:
+ %vtrn.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+ %vtrn1.i = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+ %.fca.0.0.insert = insertvalue %struct.poly8x16x2_t undef, <16 x i8> %vtrn.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.poly8x16x2_t %.fca.0.0.insert, <16 x i8> %vtrn1.i, 0, 1
+ ret %struct.poly8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.poly16x8x2_t @test_vtrnq_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vtrnq_p16:
+; CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+entry:
+ %vtrn.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ %vtrn1.i = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ %.fca.0.0.insert = insertvalue %struct.poly16x8x2_t undef, <8 x i16> %vtrn.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.poly16x8x2_t %.fca.0.0.insert, <8 x i16> %vtrn1.i, 0, 1
+ ret %struct.poly16x8x2_t %.fca.0.1.insert
+}
+
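+; Single-operand case: the high half of the 128-bit input is moved into its
+; own d register with a DUP of lane d[1], then uzp1/uzp2 separate the even-
+; and odd-indexed bytes.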
+define %struct.uint8x8x2_t @test_uzp(<16 x i8> %y) {
+; CHECK: test_uzp:
+
+ %vuzp.i = shufflevector <16 x i8> %y, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %vuzp1.i = shufflevector <16 x i8> %y, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
+ ret %struct.uint8x8x2_t %.fca.0.1.insert
+
+; CHECK: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-NEXT: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}
diff --git a/test/CodeGen/AArch64/neon-rounding-shift.ll b/test/CodeGen/AArch64/neon-rounding-shift.ll
index 404e491..5b4ec28 100644
--- a/test/CodeGen/AArch64/neon-rounding-shift.ll
+++ b/test/CodeGen/AArch64/neon-rounding-shift.ll
@@ -102,23 +102,6 @@ define <4 x i32> @test_srshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
ret <4 x i32> %tmp1
 }

-declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_urshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_urshl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: urshl d0, d0, d1
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_srshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_srshl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: srshl d0, d0, d1
- ret <1 x i64> %tmp1
-}
-
declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>)
declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>)
diff --git a/test/CodeGen/AArch64/neon-saturating-add-sub.ll b/test/CodeGen/AArch64/neon-saturating-add-sub.ll
index b2fac1f..fc60d90 100644
--- a/test/CodeGen/AArch64/neon-saturating-add-sub.ll
+++ b/test/CodeGen/AArch64/neon-saturating-add-sub.ll
@@ -102,22 +102,7 @@ define <4 x i32> @test_sqadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
ret <4 x i32> %tmp1
 }

-declare <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_uqadd_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_uqadd_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: uqadd d0, d0, d1
- ret <1 x i64> %tmp1
-}
-define <1 x i64> @test_sqadd_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sqadd_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: sqadd d0, d0, d1
- ret <1 x i64> %tmp1
-}
declare <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64>, <2 x i64>)
declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>)
@@ -254,21 +239,3 @@ define <2 x i64> @test_sqsub_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; CHECK: sqsub v0.2d, v0.2d, v1.2d
ret <2 x i64> %tmp1
}
-
-declare <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_uqsub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_uqsub_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: uqsub d0, d0, d1
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sqsub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sqsub_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: sqsub d0, d0, d1
- ret <1 x i64> %tmp1
-}
-
diff --git a/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll b/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll
index 05d8dfe..d89262c 100644
--- a/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll
+++ b/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll
@@ -102,23 +102,6 @@ define <4 x i32> @test_sqrshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
ret <4 x i32> %tmp1
 }

-declare <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_uqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_uqrshl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: uqrshl d0, d0, d1
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sqrshl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: sqrshl d0, d0, d1
- ret <1 x i64> %tmp1
-}
-
declare <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64>, <2 x i64>)
declare <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64>, <2 x i64>)
diff --git a/test/CodeGen/AArch64/neon-saturating-shift.ll b/test/CodeGen/AArch64/neon-saturating-shift.ll
index 3b7f78c..11009fb 100644
--- a/test/CodeGen/AArch64/neon-saturating-shift.ll
+++ b/test/CodeGen/AArch64/neon-saturating-shift.ll
@@ -102,23 +102,6 @@ define <4 x i32> @test_sqshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
ret <4 x i32> %tmp1
 }

-declare <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_uqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_uqshl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: uqshl d0, d0, d1
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sqshl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: sqshl d0, d0, d1
- ret <1 x i64> %tmp1
-}
-
declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>)
declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>)
diff --git a/test/CodeGen/AArch64/neon-scalar-abs.ll b/test/CodeGen/AArch64/neon-scalar-abs.ll
new file mode 100644
index 0000000..03a89e04
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-abs.ll
@@ -0,0 +1,61 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
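+; Scalar ABS/SQABS: each test wraps the integer in a one-element vector,
+; calls the absolute-value intrinsic and extracts lane 0, expecting the
+; scalar b/h/s/d register form of the instruction.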
+define i64 @test_vabsd_s64(i64 %a) {
+; CHECK: test_vabsd_s64
+; CHECK: abs {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+ %vabs.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vabs1.i = tail call <1 x i64> @llvm.aarch64.neon.vabs(<1 x i64> %vabs.i)
+ %0 = extractelement <1 x i64> %vabs1.i, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vabs(<1 x i64>)
+
+define i8 @test_vqabsb_s8(i8 %a) {
+; CHECK: test_vqabsb_s8
+; CHECK: sqabs {{b[0-9]+}}, {{b[0-9]+}}
+entry:
+ %vqabs.i = insertelement <1 x i8> undef, i8 %a, i32 0
+ %vqabs1.i = call <1 x i8> @llvm.arm.neon.vqabs.v1i8(<1 x i8> %vqabs.i)
+ %0 = extractelement <1 x i8> %vqabs1.i, i32 0
+ ret i8 %0
+}
+
+declare <1 x i8> @llvm.arm.neon.vqabs.v1i8(<1 x i8>)
+
+define i16 @test_vqabsh_s16(i16 %a) {
+; CHECK: test_vqabsh_s16
+; CHECK: sqabs {{h[0-9]+}}, {{h[0-9]+}}
+entry:
+ %vqabs.i = insertelement <1 x i16> undef, i16 %a, i32 0
+ %vqabs1.i = call <1 x i16> @llvm.arm.neon.vqabs.v1i16(<1 x i16> %vqabs.i)
+ %0 = extractelement <1 x i16> %vqabs1.i, i32 0
+ ret i16 %0
+}
+
+declare <1 x i16> @llvm.arm.neon.vqabs.v1i16(<1 x i16>)
+
+define i32 @test_vqabss_s32(i32 %a) {
+; CHECK: test_vqabss_s32
+; CHECK: sqabs {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+ %vqabs.i = insertelement <1 x i32> undef, i32 %a, i32 0
+ %vqabs1.i = call <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32> %vqabs.i)
+ %0 = extractelement <1 x i32> %vqabs1.i, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.arm.neon.vqabs.v1i32(<1 x i32>)
+
+define i64 @test_vqabsd_s64(i64 %a) {
+; CHECK: test_vqabsd_s64
+; CHECK: sqabs {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+ %vqabs.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vqabs1.i = call <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64> %vqabs.i)
+ %0 = extractelement <1 x i64> %vqabs1.i, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.arm.neon.vqabs.v1i64(<1 x i64>)
diff --git a/test/CodeGen/AArch64/neon-scalar-add-sub.ll b/test/CodeGen/AArch64/neon-scalar-add-sub.ll
new file mode 100644
index 0000000..09ca880
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-add-sub.ll
@@ -0,0 +1,50 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
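+; Scalar ADD/SUB on the d registers: both plain IR add/sub on <1 x i64> and
+; the vaddds/vadddu/vsubds/vsubdu intrinsics should select the same scalar
+; instructions.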
+define <1 x i64> @add1xi64(<1 x i64> %A, <1 x i64> %B) {
+;CHECK: add {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+ %tmp3 = add <1 x i64> %A, %B;
+ ret <1 x i64> %tmp3
+}
+
+define <1 x i64> @sub1xi64(<1 x i64> %A, <1 x i64> %B) {
+;CHECK: sub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+ %tmp3 = sub <1 x i64> %A, %B;
+ ret <1 x i64> %tmp3
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vaddds(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vadddu(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_add_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_add_v1i64:
+ %tmp1 = call <1 x i64> @llvm.aarch64.neon.vaddds(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: add {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_uadd_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_uadd_v1i64:
+ %tmp1 = call <1 x i64> @llvm.aarch64.neon.vadddu(<1 x i64> %lhs, <1 x i64> %rhs)
+;CHECK: add {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+ ret <1 x i64> %tmp1
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vsubds(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vsubdu(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_sub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sub_v1i64:
+ %tmp1 = call <1 x i64> @llvm.aarch64.neon.vsubds(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: sub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_usub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_usub_v1i64:
+ %tmp1 = call <1 x i64> @llvm.aarch64.neon.vsubdu(<1 x i64> %lhs, <1 x i64> %rhs)
+;CHECK: sub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+ ret <1 x i64> %tmp1
+}
+
+
+
diff --git a/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll b/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
new file mode 100644
index 0000000..8ce42de
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
@@ -0,0 +1,108 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+
+declare float @llvm.fma.f32(float, float, float)
+declare double @llvm.fma.f64(double, double, double)
+
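+; By-element FMLA/FMLS: each test feeds one extracted vector lane to
+; llvm.fma (negating an operand for the FMLS cases) and expects the indexed
+; form; the _swap variants check that operand order does not defeat the match.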
+define float @test_fmla_ss4S(float %a, float %b, <4 x float> %v) {
+ ; CHECK: test_fmla_ss4S
+ ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[3]
+ %tmp1 = extractelement <4 x float> %v, i32 3
+ %tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a)
+ ret float %tmp2
+}
+
+define float @test_fmla_ss4S_swap(float %a, float %b, <4 x float> %v) {
+ ; CHECK: test_fmla_ss4S_swap
+ ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[3]
+ %tmp1 = extractelement <4 x float> %v, i32 3
+ %tmp2 = call float @llvm.fma.f32(float %tmp1, float %a, float %a)
+ ret float %tmp2
+}
+
+define float @test_fmla_ss2S(float %a, float %b, <2 x float> %v) {
+ ; CHECK: test_fmla_ss2S
+ ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[1]
+ %tmp1 = extractelement <2 x float> %v, i32 1
+ %tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a)
+ ret float %tmp2
+}
+
+define double @test_fmla_ddD(double %a, double %b, <1 x double> %v) {
+ ; CHECK: test_fmla_ddD
+ ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[0]
+ %tmp1 = extractelement <1 x double> %v, i32 0
+ %tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a)
+ ret double %tmp2
+}
+
+define double @test_fmla_dd2D(double %a, double %b, <2 x double> %v) {
+ ; CHECK: test_fmla_dd2D
+ ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[1]
+ %tmp1 = extractelement <2 x double> %v, i32 1
+ %tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a)
+ ret double %tmp2
+}
+
+define double @test_fmla_dd2D_swap(double %a, double %b, <2 x double> %v) {
+ ; CHECK: test_fmla_dd2D_swap
+ ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[1]
+ %tmp1 = extractelement <2 x double> %v, i32 1
+ %tmp2 = call double @llvm.fma.f64(double %tmp1, double %b, double %a)
+ ret double %tmp2
+}
+
+define float @test_fmls_ss4S(float %a, float %b, <4 x float> %v) {
+ ; CHECK: test_fmls_ss4S
+ ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[3]
+ %tmp1 = extractelement <4 x float> %v, i32 3
+ %tmp2 = fsub float -0.0, %tmp1
+ %tmp3 = call float @llvm.fma.f32(float %tmp2, float %tmp1, float %a)
+ ret float %tmp3
+}
+
+define float @test_fmls_ss4S_swap(float %a, float %b, <4 x float> %v) {
+ ; CHECK: test_fmls_ss4S_swap
+ ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[3]
+ %tmp1 = extractelement <4 x float> %v, i32 3
+ %tmp2 = fsub float -0.0, %tmp1
+ %tmp3 = call float @llvm.fma.f32(float %tmp1, float %tmp2, float %a)
+ ret float %tmp3
+}
+
+
+define float @test_fmls_ss2S(float %a, float %b, <2 x float> %v) {
+ ; CHECK: test_fmls_ss2S
+ ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-31]+}}.s[1]
+ %tmp1 = extractelement <2 x float> %v, i32 1
+ %tmp2 = fsub float -0.0, %tmp1
+ %tmp3 = call float @llvm.fma.f32(float %tmp2, float %tmp1, float %a)
+ ret float %tmp3
+}
+
+define double @test_fmls_ddD(double %a, double %b, <1 x double> %v) {
+ ; CHECK: test_fmls_ddD
+ ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[0]
+ %tmp1 = extractelement <1 x double> %v, i32 0
+ %tmp2 = fsub double -0.0, %tmp1
+ %tmp3 = call double @llvm.fma.f64(double %tmp2, double %tmp1, double %a)
+ ret double %tmp3
+}
+
+define double @test_fmls_dd2D(double %a, double %b, <2 x double> %v) {
+ ; CHECK: test_fmls_dd2D
+ ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[1]
+ %tmp1 = extractelement <2 x double> %v, i32 1
+ %tmp2 = fsub double -0.0, %tmp1
+ %tmp3 = call double @llvm.fma.f64(double %tmp2, double %tmp1, double %a)
+ ret double %tmp3
+}
+
+define double @test_fmls_dd2D_swap(double %a, double %b, <2 x double> %v) {
+ ; CHECK: test_fmls_dd2D_swap
+ ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-31]+}}.d[1]
+ %tmp1 = extractelement <2 x double> %v, i32 1
+ %tmp2 = fsub double -0.0, %tmp1
+ %tmp3 = call double @llvm.fma.f64(double %tmp1, double %tmp2, double %a)
+ ret double %tmp3
+}
+
diff --git a/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll b/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll
new file mode 100644
index 0000000..968ad3e
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll
@@ -0,0 +1,124 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+
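+; By-element FMUL/FMULX: a scalar is multiplied by one extracted vector lane
+; and the indexed form is expected; the _swap variants check the commuted
+; operand order.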
+define float @test_fmul_lane_ss2S(float %a, <2 x float> %v) {
+ ; CHECK: test_fmul_lane_ss2S
+ ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[1]
+ %tmp1 = extractelement <2 x float> %v, i32 1
+ %tmp2 = fmul float %a, %tmp1;
+ ret float %tmp2;
+}
+
+define float @test_fmul_lane_ss2S_swap(float %a, <2 x float> %v) {
+ ; CHECK: test_fmul_lane_ss2S_swap
+ ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[1]
+ %tmp1 = extractelement <2 x float> %v, i32 1
+ %tmp2 = fmul float %tmp1, %a;
+ ret float %tmp2;
+}
+
+
+define float @test_fmul_lane_ss4S(float %a, <4 x float> %v) {
+ ; CHECK: test_fmul_lane_ss4S
+ ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3]
+ %tmp1 = extractelement <4 x float> %v, i32 3
+ %tmp2 = fmul float %a, %tmp1;
+ ret float %tmp2;
+}
+
+define float @test_fmul_lane_ss4S_swap(float %a, <4 x float> %v) {
+ ; CHECK: test_fmul_lane_ss4S_swap
+ ; CHECK: fmul {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3]
+ %tmp1 = extractelement <4 x float> %v, i32 3
+ %tmp2 = fmul float %tmp1, %a;
+ ret float %tmp2;
+}
+
+
+define double @test_fmul_lane_ddD(double %a, <1 x double> %v) {
+ ; CHECK: test_fmul_lane_ddD
+ ; CHECK: fmul {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[0]
+ %tmp1 = extractelement <1 x double> %v, i32 0
+ %tmp2 = fmul double %a, %tmp1;
+ ret double %tmp2;
+}
+
+
+
+define double @test_fmul_lane_dd2D(double %a, <2 x double> %v) {
+ ; CHECK: test_fmul_lane_dd2D
+ ; CHECK: fmul {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+ %tmp1 = extractelement <2 x double> %v, i32 1
+ %tmp2 = fmul double %a, %tmp1;
+ ret double %tmp2;
+}
+
+
+define double @test_fmul_lane_dd2D_swap(double %a, <2 x double> %v) {
+ ; CHECK: test_fmul_lane_dd2D_swap
+ ; CHECK: fmul {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+ %tmp1 = extractelement <2 x double> %v, i32 1
+ %tmp2 = fmul double %tmp1, %a;
+ ret double %tmp2;
+}
+
+declare float @llvm.aarch64.neon.vmulx.f32(float, float)
+
+define float @test_fmulx_lane_f32(float %a, <2 x float> %v) {
+ ; CHECK: test_fmulx_lane_f32
+ ; CHECK: fmulx {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[1]
+ %tmp1 = extractelement <2 x float> %v, i32 1
+ %tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %a, float %tmp1)
+ ret float %tmp2;
+}
+
+define float @test_fmulx_laneq_f32(float %a, <4 x float> %v) {
+ ; CHECK: test_fmulx_laneq_f32
+ ; CHECK: fmulx {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3]
+ %tmp1 = extractelement <4 x float> %v, i32 3
+ %tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %a, float %tmp1)
+ ret float %tmp2;
+}
+
+define float @test_fmulx_laneq_f32_swap(float %a, <4 x float> %v) {
+ ; CHECK: test_fmulx_laneq_f32_swap
+ ; CHECK: fmulx {{s[0-31]+}}, {{s[0-31]+}}, {{v[0-31]+}}.s[3]
+ %tmp1 = extractelement <4 x float> %v, i32 3
+ %tmp2 = call float @llvm.aarch64.neon.vmulx.f32(float %tmp1, float %a)
+ ret float %tmp2;
+}
+
+declare double @llvm.aarch64.neon.vmulx.f64(double, double)
+
+define double @test_fmulx_lane_f64(double %a, <1 x double> %v) {
+ ; CHECK: test_fmulx_lane_f64
+ ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[0]
+ %tmp1 = extractelement <1 x double> %v, i32 0
+ %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1)
+ ret double %tmp2;
+}
+
+define double @test_fmulx_laneq_f64_0(double %a, <2 x double> %v) {
+ ; CHECK: test_fmulx_laneq_f64_0
+ ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[0]
+ %tmp1 = extractelement <2 x double> %v, i32 0
+ %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1)
+ ret double %tmp2;
+}
+
+
+define double @test_fmulx_laneq_f64_1(double %a, <2 x double> %v) {
+ ; CHECK: test_fmulx_laneq_f64_1
+ ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+ %tmp1 = extractelement <2 x double> %v, i32 1
+ %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %tmp1)
+ ret double %tmp2;
+}
+
+define double @test_fmulx_laneq_f64_1_swap(double %a, <2 x double> %v) {
+ ; CHECK: test_fmulx_laneq_f64_1_swap
+ ; CHECK: fmulx {{d[0-31]+}}, {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+ %tmp1 = extractelement <2 x double> %v, i32 1
+ %tmp2 = call double @llvm.aarch64.neon.vmulx.f64(double %tmp1, double %a)
+ ret double %tmp2;
+}
+
diff --git a/test/CodeGen/AArch64/neon-scalar-compare.ll b/test/CodeGen/AArch64/neon-scalar-compare.ll
new file mode 100644
index 0000000..5f10cbb
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-compare.ll
@@ -0,0 +1,343 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+;; Scalar Integer Compare
+
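+; There are no register-register CMLE/CMLT forms, so the less-than(-or-equal)
+; variants are expected to reuse CMGE/CMGT/CMHS/CMHI with the operands
+; swapped; the compare-against-zero variants use the immediate #0 forms.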
+define i64 @test_vceqd(i64 %a, i64 %b) {
+; CHECK: test_vceqd
+; CHECK: cmeq {{d[0-9]+}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vceq.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vceq1.i = insertelement <1 x i64> undef, i64 %b, i32 0
+ %vceq2.i = call <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1i64.v1i64(<1 x i64> %vceq.i, <1 x i64> %vceq1.i)
+ %0 = extractelement <1 x i64> %vceq2.i, i32 0
+ ret i64 %0
+}
+
+define i64 @test_vceqzd(i64 %a) {
+; CHECK: test_vceqzd
+; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0
+entry:
+ %vceqz.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vceqz1.i = call <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1i64.v1i64(<1 x i64> %vceqz.i, <1 x i64> zeroinitializer)
+ %0 = extractelement <1 x i64> %vceqz1.i, i32 0
+ ret i64 %0
+}
+
+define i64 @test_vcged(i64 %a, i64 %b) {
+; CHECK: test_vcged
+; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vcge.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vcge1.i = insertelement <1 x i64> undef, i64 %b, i32 0
+ %vcge2.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64> %vcge.i, <1 x i64> %vcge1.i)
+ %0 = extractelement <1 x i64> %vcge2.i, i32 0
+ ret i64 %0
+}
+
+define i64 @test_vcgezd(i64 %a) {
+; CHECK: test_vcgezd
+; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, #0x0
+entry:
+ %vcgez.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vcgez1.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64> %vcgez.i, <1 x i64> zeroinitializer)
+ %0 = extractelement <1 x i64> %vcgez1.i, i32 0
+ ret i64 %0
+}
+
+define i64 @test_vcgtd(i64 %a, i64 %b) {
+; CHECK: test_vcgtd
+; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vcgt.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vcgt1.i = insertelement <1 x i64> undef, i64 %b, i32 0
+ %vcgt2.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64> %vcgt.i, <1 x i64> %vcgt1.i)
+ %0 = extractelement <1 x i64> %vcgt2.i, i32 0
+ ret i64 %0
+}
+
+define i64 @test_vcgtzd(i64 %a) {
+; CHECK: test_vcgtzd
+; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, #0x0
+entry:
+ %vcgtz.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vcgtz1.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64> %vcgtz.i, <1 x i64> zeroinitializer)
+ %0 = extractelement <1 x i64> %vcgtz1.i, i32 0
+ ret i64 %0
+}
+
+define i64 @test_vcled(i64 %a, i64 %b) {
+; CHECK: test_vcled
+; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vcgt.i = insertelement <1 x i64> undef, i64 %b, i32 0
+ %vcgt1.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vcgt2.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64> %vcgt.i, <1 x i64> %vcgt1.i)
+ %0 = extractelement <1 x i64> %vcgt2.i, i32 0
+ ret i64 %0
+}
+
+define i64 @test_vclezd(i64 %a) {
+; CHECK: test_vclezd
+; CHECK: cmle {{d[0-9]}}, {{d[0-9]}}, #0x0
+entry:
+ %vclez.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vclez1.i = call <1 x i64> @llvm.aarch64.neon.vclez.v1i64.v1i64.v1i64(<1 x i64> %vclez.i, <1 x i64> zeroinitializer)
+ %0 = extractelement <1 x i64> %vclez1.i, i32 0
+ ret i64 %0
+}
+
+define i64 @test_vcltd(i64 %a, i64 %b) {
+; CHECK: test_vcltd
+; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vcge.i = insertelement <1 x i64> undef, i64 %b, i32 0
+ %vcge1.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vcge2.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64> %vcge.i, <1 x i64> %vcge1.i)
+ %0 = extractelement <1 x i64> %vcge2.i, i32 0
+ ret i64 %0
+}
+
+define i64 @test_vcltzd(i64 %a) {
+; CHECK: test_vcltzd
+; CHECK: cmlt {{d[0-9]}}, {{d[0-9]}}, #0x0
+entry:
+ %vcltz.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vcltz1.i = call <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1i64.v1i64(<1 x i64> %vcltz.i, <1 x i64> zeroinitializer)
+ %0 = extractelement <1 x i64> %vcltz1.i, i32 0
+ ret i64 %0
+}
+
+define i64 @test_vtstd(i64 %a, i64 %b) {
+; CHECK: test_vtstd
+; CHECK: cmtst {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vtst.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vtst1.i = insertelement <1 x i64> undef, i64 %b, i32 0
+ %vtst2.i = call <1 x i64> @llvm.aarch64.neon.vtstd.v1i64.v1i64.v1i64(<1 x i64> %vtst.i, <1 x i64> %vtst1.i)
+ %0 = extractelement <1 x i64> %vtst2.i, i32 0
+ ret i64 %0
+}
+
+
+define <1 x i64> @test_vcage_f64(<1 x double> %a, <1 x double> %b) #0 {
+; CHECK: test_vcage_f64
+; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+ %vcage2.i = tail call <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double> %a, <1 x double> %b) #2
+ ret <1 x i64> %vcage2.i
+}
+
+define <1 x i64> @test_vcagt_f64(<1 x double> %a, <1 x double> %b) #0 {
+; CHECK: test_vcagt_f64
+; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+ %vcagt2.i = tail call <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double> %a, <1 x double> %b) #2
+ ret <1 x i64> %vcagt2.i
+}
+
+define <1 x i64> @test_vcale_f64(<1 x double> %a, <1 x double> %b) #0 {
+; CHECK: test_vcale_f64
+; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+ %vcage2.i = tail call <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double> %b, <1 x double> %a) #2
+ ret <1 x i64> %vcage2.i
+}
+
+define <1 x i64> @test_vcalt_f64(<1 x double> %a, <1 x double> %b) #0 {
+; CHECK: test_vcalt_f64
+; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+ %vcagt2.i = tail call <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double> %b, <1 x double> %a) #2
+ ret <1 x i64> %vcagt2.i
+}
+
+define <1 x i64> @test_vceq_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK: test_vceq_s64
+; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+ %cmp.i = icmp eq <1 x i64> %a, %b
+ %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+ ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vceq_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK: test_vceq_u64
+; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+ %cmp.i = icmp eq <1 x i64> %a, %b
+ %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+ ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vceq_f64(<1 x double> %a, <1 x double> %b) #0 {
+; CHECK: test_vceq_f64
+; CHECK: fcmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+ %cmp.i = fcmp oeq <1 x double> %a, %b
+ %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+ ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vcge_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK: test_vcge_s64
+; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+ %cmp.i = icmp sge <1 x i64> %a, %b
+ %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+ ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vcge_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK: test_vcge_u64
+; CHECK: cmhs {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+ %cmp.i = icmp uge <1 x i64> %a, %b
+ %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+ ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vcge_f64(<1 x double> %a, <1 x double> %b) #0 {
+; CHECK: test_vcge_f64
+; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+ %cmp.i = fcmp oge <1 x double> %a, %b
+ %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+ ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vcle_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK: test_vcle_s64
+; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+ %cmp.i = icmp sle <1 x i64> %a, %b
+ %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+ ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vcle_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK: test_vcle_u64
+; CHECK: cmhs {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+ %cmp.i = icmp ule <1 x i64> %a, %b
+ %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+ ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vcle_f64(<1 x double> %a, <1 x double> %b) #0 {
+; CHECK: test_vcle_f64
+; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+ %cmp.i = fcmp ole <1 x double> %a, %b
+ %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+ ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vcgt_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK: test_vcgt_s64
+; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+ %cmp.i = icmp sgt <1 x i64> %a, %b
+ %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+ ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vcgt_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK: test_vcgt_u64
+; CHECK: cmhi {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+ %cmp.i = icmp ugt <1 x i64> %a, %b
+ %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+ ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vcgt_f64(<1 x double> %a, <1 x double> %b) #0 {
+; CHECK: test_vcgt_f64
+; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+ %cmp.i = fcmp ogt <1 x double> %a, %b
+ %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+ ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vclt_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK: test_vclt_s64
+; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+ %cmp.i = icmp slt <1 x i64> %a, %b
+ %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+ ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vclt_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+; CHECK: test_vclt_u64
+; CHECK: cmhi {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+ %cmp.i = icmp ult <1 x i64> %a, %b
+ %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+ ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vclt_f64(<1 x double> %a, <1 x double> %b) #0 {
+; CHECK: test_vclt_f64
+; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+ %cmp.i = fcmp olt <1 x double> %a, %b
+ %sext.i = sext <1 x i1> %cmp.i to <1 x i64>
+ ret <1 x i64> %sext.i
+}
+
+define <1 x i64> @test_vceqz_s64(<1 x i64> %a) #0 {
+; CHECK: test_vceqz_s64
+; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0
+ %1 = icmp eq <1 x i64> %a, zeroinitializer
+ %vceqz.i = zext <1 x i1> %1 to <1 x i64>
+ ret <1 x i64> %vceqz.i
+}
+
+define <1 x i64> @test_vceqz_u64(<1 x i64> %a) #0 {
+; CHECK: test_vceqz_u64
+; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0
+ %1 = icmp eq <1 x i64> %a, zeroinitializer
+ %vceqz.i = zext <1 x i1> %1 to <1 x i64>
+ ret <1 x i64> %vceqz.i
+}
+
+define <1 x i64> @test_vceqz_p64(<1 x i64> %a) #0 {
+; CHECK: test_vceqz_p64
+; CHECK: cmeq {{d[0-9]}}, {{d[0-9]}}, #0x0
+ %1 = icmp eq <1 x i64> %a, zeroinitializer
+ %vceqz.i = zext <1 x i1> %1 to <1 x i64>
+ ret <1 x i64> %vceqz.i
+}
+
+define <2 x i64> @test_vceqzq_p64(<2 x i64> %a) #0 {
+; CHECK: test_vceqzq_p64
+; CHECK: cmeq {{v[0-9]}}.2d, {{v[0-9]}}.2d, #0
+ %1 = icmp eq <2 x i64> %a, zeroinitializer
+ %vceqz.i = zext <2 x i1> %1 to <2 x i64>
+ ret <2 x i64> %vceqz.i
+}
+
+define <1 x i64> @test_vcgez_s64(<1 x i64> %a) #0 {
+; CHECK: test_vcgez_s64
+; CHECK: cmge {{d[0-9]}}, {{d[0-9]}}, #0x0
+ %1 = icmp sge <1 x i64> %a, zeroinitializer
+ %vcgez.i = zext <1 x i1> %1 to <1 x i64>
+ ret <1 x i64> %vcgez.i
+}
+
+define <1 x i64> @test_vclez_s64(<1 x i64> %a) #0 {
+; CHECK: test_vclez_s64
+; CHECK: cmle {{d[0-9]}}, {{d[0-9]}}, #0x0
+ %1 = icmp sle <1 x i64> %a, zeroinitializer
+ %vclez.i = zext <1 x i1> %1 to <1 x i64>
+ ret <1 x i64> %vclez.i
+}
+
+define <1 x i64> @test_vcgtz_s64(<1 x i64> %a) #0 {
+; CHECK: test_vcgtz_s64
+; CHECK: cmgt {{d[0-9]}}, {{d[0-9]}}, #0x0
+ %1 = icmp sgt <1 x i64> %a, zeroinitializer
+ %vcgtz.i = zext <1 x i1> %1 to <1 x i64>
+ ret <1 x i64> %vcgtz.i
+}
+
+define <1 x i64> @test_vcltz_s64(<1 x i64> %a) #0 {
+; CHECK: test_vcltz_s64
+; CHECK: cmlt {{d[0-9]}}, {{d[0-9]}}, #0
+ %1 = icmp slt <1 x i64> %a, zeroinitializer
+ %vcltz.i = zext <1 x i1> %1 to <1 x i64>
+ ret <1 x i64> %vcltz.i
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double>, <1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double>, <1 x double>)
+declare <1 x i64> @llvm.aarch64.neon.vtstd.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vchs.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vclez.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vchi.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1i64.v1i64(<1 x i64>, <1 x i64>)
diff --git a/test/CodeGen/AArch64/neon-scalar-copy.ll b/test/CodeGen/AArch64/neon-scalar-copy.ll
new file mode 100644
index 0000000..d433ff5
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-copy.ll
@@ -0,0 +1,88 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
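+; Scalar copies from vector lanes should use the element DUP forms; lane 0 of
+; a <1 x double> is already in the right register, so no DUP is expected
+; there (checked with CHECK-NOT).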
+define float @test_dup_sv2S(<2 x float> %v) {
+ ;CHECK: test_dup_sv2S
+ ;CHECK: dup {{s[0-31]+}}, {{v[0-31]+}}.s[1]
+ %tmp1 = extractelement <2 x float> %v, i32 1
+ ret float %tmp1
+}
+
+define float @test_dup_sv4S(<4 x float> %v) {
+ ;CHECK: test_dup_sv4S
+ ;CHECK: dup {{s[0-31]+}}, {{v[0-31]+}}.s[0]
+ %tmp1 = extractelement <4 x float> %v, i32 0
+ ret float %tmp1
+}
+
+define double @test_dup_dvD(<1 x double> %v) {
+ ;CHECK: test_dup_dvD
+ ;CHECK-NOT: dup {{d[0-31]+}}, {{v[0-31]+}}.d[0]
+ ;CHECK: ret
+ %tmp1 = extractelement <1 x double> %v, i32 0
+ ret double %tmp1
+}
+
+define double @test_dup_dv2D(<2 x double> %v) {
+ ;CHECK: test_dup_dv2D
+ ;CHECK: dup {{d[0-31]+}}, {{v[0-31]+}}.d[1]
+ %tmp1 = extractelement <2 x double> %v, i32 1
+ ret double %tmp1
+}
+
+define <1 x i8> @test_vector_dup_bv16B(<16 x i8> %v1) {
+ ;CHECK: test_vector_dup_bv16B
+ ;CHECK: dup {{b[0-31]+}}, {{v[0-31]+}}.b[14]
+ %shuffle.i = shufflevector <16 x i8> %v1, <16 x i8> undef, <1 x i32> <i32 14>
+ ret <1 x i8> %shuffle.i
+}
+
+define <1 x i8> @test_vector_dup_bv8B(<8 x i8> %v1) {
+ ;CHECK: test_vector_dup_bv8B
+ ;CHECK: dup {{b[0-9]+}}, {{v[0-9]+}}.b[7]
+ %shuffle.i = shufflevector <8 x i8> %v1, <8 x i8> undef, <1 x i32> <i32 7>
+ ret <1 x i8> %shuffle.i
+}
+
+define <1 x i16> @test_vector_dup_hv8H(<8 x i16> %v1) {
+ ;CHECK: test_vector_dup_hv8H
+ ;CHECK: dup {{h[0-9]+}}, {{v[0-9]+}}.h[7]
+ %shuffle.i = shufflevector <8 x i16> %v1, <8 x i16> undef, <1 x i32> <i32 7>
+ ret <1 x i16> %shuffle.i
+}
+
+define <1 x i16> @test_vector_dup_hv4H(<4 x i16> %v1) {
+ ;CHECK: test_vector_dup_hv4H
+ ;CHECK: dup {{h[0-9]+}}, {{v[0-9]+}}.h[3]
+ %shuffle.i = shufflevector <4 x i16> %v1, <4 x i16> undef, <1 x i32> <i32 3>
+ ret <1 x i16> %shuffle.i
+}
+
+define <1 x i32> @test_vector_dup_sv4S(<4 x i32> %v1) {
+ ;CHECK: test_vector_dup_sv4S
+ ;CHECK: dup {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+ %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <1 x i32> <i32 3>
+ ret <1 x i32> %shuffle
+}
+
+define <1 x i32> @test_vector_dup_sv2S(<2 x i32> %v1) {
+ ;CHECK: test_vector_dup_sv2S
+ ;CHECK: dup {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+ %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <1 x i32> <i32 1>
+ ret <1 x i32> %shuffle
+}
+
+define <1 x i64> @test_vector_dup_dv2D(<2 x i64> %v1) {
+ ;CHECK: test_vector_dup_dv2D
+ ;CHECK: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+ %shuffle.i = shufflevector <2 x i64> %v1, <2 x i64> undef, <1 x i32> <i32 1>
+ ret <1 x i64> %shuffle.i
+}
+
+define <1 x i64> @test_vector_copy_dup_dv2D(<1 x i64> %a, <2 x i64> %c) {
+ ;CHECK: test_vector_copy_dup_dv2D
+ ;CHECK: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+ %vget_lane = extractelement <2 x i64> %c, i32 1
+ %vset_lane = insertelement <1 x i64> undef, i64 %vget_lane, i32 0
+ ret <1 x i64> %vset_lane
+}
+
diff --git a/test/CodeGen/AArch64/neon-scalar-cvt.ll b/test/CodeGen/AArch64/neon-scalar-cvt.ll
new file mode 100644
index 0000000..a06d5d6
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-cvt.ll
@@ -0,0 +1,137 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+define float @test_vcvts_f32_s32(i32 %a) {
+; CHECK: test_vcvts_f32_s32
+; CHECK: scvtf {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+ %vcvtf.i = insertelement <1 x i32> undef, i32 %a, i32 0
+ %0 = call float @llvm.aarch64.neon.vcvtf32.s32(<1 x i32> %vcvtf.i)
+ ret float %0
+}
+
+declare float @llvm.aarch64.neon.vcvtf32.s32(<1 x i32>)
+
+define double @test_vcvtd_f64_s64(i64 %a) {
+; CHECK: test_vcvtd_f64_s64
+; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+ %vcvtf.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %0 = call double @llvm.aarch64.neon.vcvtf64.s64(<1 x i64> %vcvtf.i)
+ ret double %0
+}
+
+declare double @llvm.aarch64.neon.vcvtf64.s64(<1 x i64>)
+
+define float @test_vcvts_f32_u32(i32 %a) {
+; CHECK: test_vcvts_f32_u32
+; CHECK: ucvtf {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+ %vcvtf.i = insertelement <1 x i32> undef, i32 %a, i32 0
+ %0 = call float @llvm.aarch64.neon.vcvtf32.u32(<1 x i32> %vcvtf.i)
+ ret float %0
+}
+
+declare float @llvm.aarch64.neon.vcvtf32.u32(<1 x i32>)
+
+define double @test_vcvtd_f64_u64(i64 %a) {
+; CHECK: test_vcvtd_f64_u64
+; CHECK: ucvtf {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+ %vcvtf.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %0 = call double @llvm.aarch64.neon.vcvtf64.u64(<1 x i64> %vcvtf.i)
+ ret double %0
+}
+
+declare double @llvm.aarch64.neon.vcvtf64.u64(<1 x i64>)
+
+define float @test_vcvts_n_f32_s32(i32 %a) {
+; CHECK: test_vcvts_n_f32_s32
+; CHECK: scvtf {{s[0-9]+}}, {{s[0-9]+}}, #1
+entry:
+ %vcvtf = insertelement <1 x i32> undef, i32 %a, i32 0
+ %0 = call float @llvm.aarch64.neon.vcvtf32.n.s32(<1 x i32> %vcvtf, i32 1)
+ ret float %0
+}
+
+declare float @llvm.aarch64.neon.vcvtf32.n.s32(<1 x i32>, i32)
+
+define double @test_vcvtd_n_f64_s64(i64 %a) {
+; CHECK: test_vcvtd_n_f64_s64
+; CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}, #1
+entry:
+ %vcvtf = insertelement <1 x i64> undef, i64 %a, i32 0
+ %0 = call double @llvm.aarch64.neon.vcvtf64.n.s64(<1 x i64> %vcvtf, i32 1)
+ ret double %0
+}
+
+declare double @llvm.aarch64.neon.vcvtf64.n.s64(<1 x i64>, i32)
+
+define float @test_vcvts_n_f32_u32(i32 %a) {
+; CHECK: test_vcvts_n_f32_u32
+; CHECK: ucvtf {{s[0-9]+}}, {{s[0-9]+}}, #1
+entry:
+ %vcvtf = insertelement <1 x i32> undef, i32 %a, i32 0
+ %0 = call float @llvm.aarch64.neon.vcvtf32.n.u32(<1 x i32> %vcvtf, i32 1)
+ ret float %0
+}
+
+declare float @llvm.aarch64.neon.vcvtf32.n.u32(<1 x i32>, i32)
+
+define double @test_vcvtd_n_f64_u64(i64 %a) {
+; CHECK: test_vcvtd_n_f64_u64
+; CHECK: ucvtf {{d[0-9]+}}, {{d[0-9]+}}, #1
+entry:
+ %vcvtf = insertelement <1 x i64> undef, i64 %a, i32 0
+ %0 = call double @llvm.aarch64.neon.vcvtf64.n.u64(<1 x i64> %vcvtf, i32 1)
+ ret double %0
+}
+
+declare double @llvm.aarch64.neon.vcvtf64.n.u64(<1 x i64>, i32)
+
+define i32 @test_vcvts_n_s32_f32(float %a) {
+; CHECK: test_vcvts_n_s32_f32
+; CHECK: fcvtzs {{s[0-9]+}}, {{s[0-9]+}}, #1
+entry:
+ %fcvtzs = insertelement <1 x float> undef, float %a, i32 0
+ %fcvtzs1 = call <1 x i32> @llvm.aarch64.neon.vcvts.n.s32.f32(<1 x float> %fcvtzs, i32 1)
+ %0 = extractelement <1 x i32> %fcvtzs1, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vcvts.n.s32.f32(<1 x float>, i32)
+
+define i64 @test_vcvtd_n_s64_f64(double %a) {
+; CHECK: test_vcvtd_n_s64_f64
+; CHECK: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}, #1
+entry:
+ %fcvtzs = insertelement <1 x double> undef, double %a, i32 0
+ %fcvtzs1 = call <1 x i64> @llvm.aarch64.neon.vcvtd.n.s64.f64(<1 x double> %fcvtzs, i32 1)
+ %0 = extractelement <1 x i64> %fcvtzs1, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vcvtd.n.s64.f64(<1 x double>, i32)
+
+define i32 @test_vcvts_n_u32_f32(float %a) {
+; CHECK: test_vcvts_n_u32_f32
+; CHECK: fcvtzu {{s[0-9]+}}, {{s[0-9]+}}, #32
+entry:
+ %fcvtzu = insertelement <1 x float> undef, float %a, i32 0
+ %fcvtzu1 = call <1 x i32> @llvm.aarch64.neon.vcvts.n.u32.f32(<1 x float> %fcvtzu, i32 32)
+ %0 = extractelement <1 x i32> %fcvtzu1, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vcvts.n.u32.f32(<1 x float>, i32)
+
+define i64 @test_vcvtd_n_u64_f64(double %a) {
+; CHECK: test_vcvtd_n_u64_f64
+; CHECK: fcvtzu {{d[0-9]+}}, {{d[0-9]+}}, #64
+entry:
+ %fcvtzu = insertelement <1 x double> undef, double %a, i32 0
+ %fcvtzu1 = tail call <1 x i64> @llvm.aarch64.neon.vcvtd.n.u64.f64(<1 x double> %fcvtzu, i32 64)
+ %0 = extractelement <1 x i64> %fcvtzu1, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vcvtd.n.u64.f64(<1 x double>, i32)
diff --git a/test/CodeGen/AArch64/neon-scalar-extract-narrow.ll b/test/CodeGen/AArch64/neon-scalar-extract-narrow.ll
new file mode 100644
index 0000000..faf521b
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-extract-narrow.ll
@@ -0,0 +1,104 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+define i8 @test_vqmovunh_s16(i16 %a) {
+; CHECK: test_vqmovunh_s16
+; CHECK: sqxtun {{b[0-9]+}}, {{h[0-9]+}}
+entry:
+ %vqmovun.i = insertelement <1 x i16> undef, i16 %a, i32 0
+ %vqmovun1.i = call <1 x i8> @llvm.arm.neon.vqmovnsu.v1i8(<1 x i16> %vqmovun.i)
+ %0 = extractelement <1 x i8> %vqmovun1.i, i32 0
+ ret i8 %0
+}
+
+define i16 @test_vqmovuns_s32(i32 %a) {
+; CHECK: test_vqmovuns_s32
+; CHECK: sqxtun {{h[0-9]+}}, {{s[0-9]+}}
+entry:
+ %vqmovun.i = insertelement <1 x i32> undef, i32 %a, i32 0
+ %vqmovun1.i = call <1 x i16> @llvm.arm.neon.vqmovnsu.v1i16(<1 x i32> %vqmovun.i)
+ %0 = extractelement <1 x i16> %vqmovun1.i, i32 0
+ ret i16 %0
+}
+
+define i32 @test_vqmovund_s64(i64 %a) {
+; CHECK: test_vqmovund_s64
+; CHECK: sqxtun {{s[0-9]+}}, {{d[0-9]+}}
+entry:
+ %vqmovun.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vqmovun1.i = call <1 x i32> @llvm.arm.neon.vqmovnsu.v1i32(<1 x i64> %vqmovun.i)
+ %0 = extractelement <1 x i32> %vqmovun1.i, i32 0
+ ret i32 %0
+}
+
+declare <1 x i8> @llvm.arm.neon.vqmovnsu.v1i8(<1 x i16>)
+declare <1 x i16> @llvm.arm.neon.vqmovnsu.v1i16(<1 x i32>)
+declare <1 x i32> @llvm.arm.neon.vqmovnsu.v1i32(<1 x i64>)
+
+define i8 @test_vqmovnh_s16(i16 %a) {
+; CHECK: test_vqmovnh_s16
+; CHECK: sqxtn {{b[0-9]+}}, {{h[0-9]+}}
+entry:
+ %vqmovn.i = insertelement <1 x i16> undef, i16 %a, i32 0
+ %vqmovn1.i = call <1 x i8> @llvm.arm.neon.vqmovns.v1i8(<1 x i16> %vqmovn.i)
+ %0 = extractelement <1 x i8> %vqmovn1.i, i32 0
+ ret i8 %0
+}
+
+define i16 @test_vqmovns_s32(i32 %a) {
+; CHECK: test_vqmovns_s32
+; CHECK: sqxtn {{h[0-9]+}}, {{s[0-9]+}}
+entry:
+ %vqmovn.i = insertelement <1 x i32> undef, i32 %a, i32 0
+ %vqmovn1.i = call <1 x i16> @llvm.arm.neon.vqmovns.v1i16(<1 x i32> %vqmovn.i)
+ %0 = extractelement <1 x i16> %vqmovn1.i, i32 0
+ ret i16 %0
+}
+
+define i32 @test_vqmovnd_s64(i64 %a) {
+; CHECK: test_vqmovnd_s64
+; CHECK: sqxtn {{s[0-9]+}}, {{d[0-9]+}}
+entry:
+ %vqmovn.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vqmovn1.i = call <1 x i32> @llvm.arm.neon.vqmovns.v1i32(<1 x i64> %vqmovn.i)
+ %0 = extractelement <1 x i32> %vqmovn1.i, i32 0
+ ret i32 %0
+}
+
+declare <1 x i8> @llvm.arm.neon.vqmovns.v1i8(<1 x i16>)
+declare <1 x i16> @llvm.arm.neon.vqmovns.v1i16(<1 x i32>)
+declare <1 x i32> @llvm.arm.neon.vqmovns.v1i32(<1 x i64>)
+
+define i8 @test_vqmovnh_u16(i16 %a) {
+; CHECK: test_vqmovnh_u16
+; CHECK: uqxtn {{b[0-9]+}}, {{h[0-9]+}}
+entry:
+ %vqmovn.i = insertelement <1 x i16> undef, i16 %a, i32 0
+ %vqmovn1.i = call <1 x i8> @llvm.arm.neon.vqmovnu.v1i8(<1 x i16> %vqmovn.i)
+ %0 = extractelement <1 x i8> %vqmovn1.i, i32 0
+ ret i8 %0
+}
+
+
+define i16 @test_vqmovns_u32(i32 %a) {
+; CHECK: test_vqmovns_u32
+; CHECK: uqxtn {{h[0-9]+}}, {{s[0-9]+}}
+entry:
+ %vqmovn.i = insertelement <1 x i32> undef, i32 %a, i32 0
+ %vqmovn1.i = call <1 x i16> @llvm.arm.neon.vqmovnu.v1i16(<1 x i32> %vqmovn.i)
+ %0 = extractelement <1 x i16> %vqmovn1.i, i32 0
+ ret i16 %0
+}
+
+define i32 @test_vqmovnd_u64(i64 %a) {
+; CHECK: test_vqmovnd_u64
+; CHECK: uqxtn {{s[0-9]+}}, {{d[0-9]+}}
+entry:
+ %vqmovn.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vqmovn1.i = call <1 x i32> @llvm.arm.neon.vqmovnu.v1i32(<1 x i64> %vqmovn.i)
+ %0 = extractelement <1 x i32> %vqmovn1.i, i32 0
+ ret i32 %0
+}
+
+declare <1 x i8> @llvm.arm.neon.vqmovnu.v1i8(<1 x i16>)
+declare <1 x i16> @llvm.arm.neon.vqmovnu.v1i16(<1 x i32>)
+declare <1 x i32> @llvm.arm.neon.vqmovnu.v1i32(<1 x i64>)
diff --git a/test/CodeGen/AArch64/neon-scalar-fabd.ll b/test/CodeGen/AArch64/neon-scalar-fabd.ll
new file mode 100644
index 0000000..75686d3
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-fabd.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define float @test_vabds_f32(float %a, float %b) {
+; CHECK-LABEL: test_vabds_f32
+; CHECK: fabd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+ %vabd.i = insertelement <1 x float> undef, float %a, i32 0
+ %vabd1.i = insertelement <1 x float> undef, float %b, i32 0
+ %vabd2.i = call <1 x float> @llvm.aarch64.neon.vabd.v1f32(<1 x float> %vabd.i, <1 x float> %vabd1.i)
+ %0 = extractelement <1 x float> %vabd2.i, i32 0
+ ret float %0
+}
+
+define double @test_vabdd_f64(double %a, double %b) {
+; CHECK-LABEL: test_vabdd_f64
+; CHECK: fabd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+ %vabd.i = insertelement <1 x double> undef, double %a, i32 0
+ %vabd1.i = insertelement <1 x double> undef, double %b, i32 0
+ %vabd2.i = call <1 x double> @llvm.aarch64.neon.vabd.v1f64(<1 x double> %vabd.i, <1 x double> %vabd1.i)
+ %0 = extractelement <1 x double> %vabd2.i, i32 0
+ ret double %0
+}
+
+declare <1 x double> @llvm.aarch64.neon.vabd.v1f64(<1 x double>, <1 x double>)
+declare <1 x float> @llvm.aarch64.neon.vabd.v1f32(<1 x float>, <1 x float>)
diff --git a/test/CodeGen/AArch64/neon-scalar-fcvt.ll b/test/CodeGen/AArch64/neon-scalar-fcvt.ll
new file mode 100644
index 0000000..d7b84fa
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-fcvt.ll
@@ -0,0 +1,255 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+;; Scalar Floating-point Convert
+
+define float @test_vcvtxn(double %a) {
+; CHECK: test_vcvtxn
+; CHECK: fcvtxn {{s[0-9]}}, {{d[0-9]}}
+entry:
+ %vcvtf.i = insertelement <1 x double> undef, double %a, i32 0
+ %vcvtf1.i = tail call <1 x float> @llvm.aarch64.neon.fcvtxn.v1f32.v1f64(<1 x double> %vcvtf.i)
+ %0 = extractelement <1 x float> %vcvtf1.i, i32 0
+ ret float %0
+}
+
+declare <1 x float> @llvm.aarch64.neon.fcvtxn.v1f32.v1f64(<1 x double>)
+
+define i32 @test_vcvtass(float %a) {
+; CHECK: test_vcvtass
+; CHECK: fcvtas {{s[0-9]}}, {{s[0-9]}}
+entry:
+ %vcvtas.i = insertelement <1 x float> undef, float %a, i32 0
+ %vcvtas1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtas.v1i32.v1f32(<1 x float> %vcvtas.i)
+ %0 = extractelement <1 x i32> %vcvtas1.i, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.fcvtas.v1i32.v1f32(<1 x float>)
+
+define i64 @test_vcvtasd(double %a) {
+; CHECK: test_vcvtasd
+; CHECK: fcvtas {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vcvtas.i = insertelement <1 x double> undef, double %a, i32 0
+ %vcvtas1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> %vcvtas.i)
+ %0 = extractelement <1 x i64> %vcvtas1.i, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double>)
+
+define i32 @test_vcvtaus(float %a) {
+; CHECK: test_vcvtaus
+; CHECK: fcvtau {{s[0-9]}}, {{s[0-9]}}
+entry:
+ %vcvtau.i = insertelement <1 x float> undef, float %a, i32 0
+ %vcvtau1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtau.v1i32.v1f32(<1 x float> %vcvtau.i)
+ %0 = extractelement <1 x i32> %vcvtau1.i, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.fcvtau.v1i32.v1f32(<1 x float>)
+
+define i64 @test_vcvtaud(double %a) {
+; CHECK: test_vcvtaud
+; CHECK: fcvtau {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vcvtau.i = insertelement <1 x double> undef, double %a, i32 0
+ %vcvtau1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> %vcvtau.i)
+ %0 = extractelement <1 x i64> %vcvtau1.i, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double>)
+
+define i32 @test_vcvtmss(float %a) {
+; CHECK: test_vcvtmss
+; CHECK: fcvtms {{s[0-9]}}, {{s[0-9]}}
+entry:
+ %vcvtms.i = insertelement <1 x float> undef, float %a, i32 0
+ %vcvtms1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtms.v1i32.v1f32(<1 x float> %vcvtms.i)
+ %0 = extractelement <1 x i32> %vcvtms1.i, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.fcvtms.v1i32.v1f32(<1 x float>)
+
+define i64 @test_vcvtmd_s64_f64(double %a) {
+; CHECK: test_vcvtmd_s64_f64
+; CHECK: fcvtms {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vcvtms.i = insertelement <1 x double> undef, double %a, i32 0
+ %vcvtms1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> %vcvtms.i)
+ %0 = extractelement <1 x i64> %vcvtms1.i, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double>)
+
+define i32 @test_vcvtmus(float %a) {
+; CHECK: test_vcvtmus
+; CHECK: fcvtmu {{s[0-9]}}, {{s[0-9]}}
+entry:
+ %vcvtmu.i = insertelement <1 x float> undef, float %a, i32 0
+ %vcvtmu1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtmu.v1i32.v1f32(<1 x float> %vcvtmu.i)
+ %0 = extractelement <1 x i32> %vcvtmu1.i, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.fcvtmu.v1i32.v1f32(<1 x float>)
+
+define i64 @test_vcvtmud(double %a) {
+; CHECK: test_vcvtmud
+; CHECK: fcvtmu {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vcvtmu.i = insertelement <1 x double> undef, double %a, i32 0
+ %vcvtmu1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> %vcvtmu.i)
+ %0 = extractelement <1 x i64> %vcvtmu1.i, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double>)
+
+define i32 @test_vcvtnss(float %a) {
+; CHECK: test_vcvtnss
+; CHECK: fcvtns {{s[0-9]}}, {{s[0-9]}}
+entry:
+ %vcvtns.i = insertelement <1 x float> undef, float %a, i32 0
+ %vcvtns1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtns.v1i32.v1f32(<1 x float> %vcvtns.i)
+ %0 = extractelement <1 x i32> %vcvtns1.i, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.fcvtns.v1i32.v1f32(<1 x float>)
+
+define i64 @test_vcvtnd_s64_f64(double %a) {
+; CHECK: test_vcvtnd_s64_f64
+; CHECK: fcvtns {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vcvtns.i = insertelement <1 x double> undef, double %a, i32 0
+ %vcvtns1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> %vcvtns.i)
+ %0 = extractelement <1 x i64> %vcvtns1.i, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double>)
+
+define i32 @test_vcvtnus(float %a) {
+; CHECK: test_vcvtnus
+; CHECK: fcvtnu {{s[0-9]}}, {{s[0-9]}}
+entry:
+ %vcvtnu.i = insertelement <1 x float> undef, float %a, i32 0
+ %vcvtnu1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtnu.v1i32.v1f32(<1 x float> %vcvtnu.i)
+ %0 = extractelement <1 x i32> %vcvtnu1.i, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.fcvtnu.v1i32.v1f32(<1 x float>)
+
+define i64 @test_vcvtnud(double %a) {
+; CHECK: test_vcvtnud
+; CHECK: fcvtnu {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vcvtnu.i = insertelement <1 x double> undef, double %a, i32 0
+ %vcvtnu1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> %vcvtnu.i)
+ %0 = extractelement <1 x i64> %vcvtnu1.i, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double>)
+
+define i32 @test_vcvtpss(float %a) {
+; CHECK: test_vcvtpss
+; CHECK: fcvtps {{s[0-9]}}, {{s[0-9]}}
+entry:
+ %vcvtps.i = insertelement <1 x float> undef, float %a, i32 0
+ %vcvtps1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtps.v1i32.v1f32(<1 x float> %vcvtps.i)
+ %0 = extractelement <1 x i32> %vcvtps1.i, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.fcvtps.v1i32.v1f32(<1 x float>)
+
+define i64 @test_vcvtpd_s64_f64(double %a) {
+; CHECK: test_vcvtpd_s64_f64
+; CHECK: fcvtps {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vcvtps.i = insertelement <1 x double> undef, double %a, i32 0
+ %vcvtps1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> %vcvtps.i)
+ %0 = extractelement <1 x i64> %vcvtps1.i, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double>)
+
+define i32 @test_vcvtpus(float %a) {
+; CHECK: test_vcvtpus
+; CHECK: fcvtpu {{s[0-9]}}, {{s[0-9]}}
+entry:
+ %vcvtpu.i = insertelement <1 x float> undef, float %a, i32 0
+ %vcvtpu1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtpu.v1i32.v1f32(<1 x float> %vcvtpu.i)
+ %0 = extractelement <1 x i32> %vcvtpu1.i, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.fcvtpu.v1i32.v1f32(<1 x float>)
+
+define i64 @test_vcvtpud(double %a) {
+; CHECK: test_vcvtpud
+; CHECK: fcvtpu {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vcvtpu.i = insertelement <1 x double> undef, double %a, i32 0
+ %vcvtpu1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> %vcvtpu.i)
+ %0 = extractelement <1 x i64> %vcvtpu1.i, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double>)
+
+define i32 @test_vcvtss(float %a) {
+; CHECK: test_vcvtss
+; CHECK: fcvtzs {{s[0-9]}}, {{s[0-9]}}
+entry:
+ %vcvtzs.i = insertelement <1 x float> undef, float %a, i32 0
+ %vcvtzs1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtzs.v1i32.v1f32(<1 x float> %vcvtzs.i)
+ %0 = extractelement <1 x i32> %vcvtzs1.i, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.fcvtzs.v1i32.v1f32(<1 x float>)
+
+define i64 @test_vcvtd_s64_f64(double %a) {
+; CHECK: test_vcvtd_s64_f64
+; CHECK: fcvtzs {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vcvzs.i = insertelement <1 x double> undef, double %a, i32 0
+ %vcvzs1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> %vcvzs.i)
+ %0 = extractelement <1 x i64> %vcvzs1.i, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double>)
+
+define i32 @test_vcvtus(float %a) {
+; CHECK: test_vcvtus
+; CHECK: fcvtzu {{s[0-9]}}, {{s[0-9]}}
+entry:
+ %vcvtzu.i = insertelement <1 x float> undef, float %a, i32 0
+ %vcvtzu1.i = tail call <1 x i32> @llvm.aarch64.neon.fcvtzu.v1i32.v1f32(<1 x float> %vcvtzu.i)
+ %0 = extractelement <1 x i32> %vcvtzu1.i, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.fcvtzu.v1i32.v1f32(<1 x float>)
+
+define i64 @test_vcvtud(double %a) {
+; CHECK: test_vcvtud
+; CHECK: fcvtzu {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vcvtzu.i = insertelement <1 x double> undef, double %a, i32 0
+ %vcvtzu1.i = tail call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> %vcvtzu.i)
+ %0 = extractelement <1 x i64> %vcvtzu1.i, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double>)
diff --git a/test/CodeGen/AArch64/neon-scalar-fp-compare.ll b/test/CodeGen/AArch64/neon-scalar-fp-compare.ll
new file mode 100644
index 0000000..a6e5859
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-fp-compare.ll
@@ -0,0 +1,328 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+;; Scalar Floating-point Compare
+
+define i32 @test_vceqs_f32(float %a, float %b) {
+; CHECK: test_vceqs_f32
+; CHECK: fcmeq {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
+entry:
+ %vceq.i = insertelement <1 x float> undef, float %a, i32 0
+ %vceq1.i = insertelement <1 x float> undef, float %b, i32 0
+ %vceq2.i = call <1 x i32> @llvm.aarch64.neon.vceq.v1i32.v1f32.v1f32(<1 x float> %vceq.i, <1 x float> %vceq1.i)
+ %0 = extractelement <1 x i32> %vceq2.i, i32 0
+ ret i32 %0
+}
+
+define i64 @test_vceqd_f64(double %a, double %b) {
+; CHECK: test_vceqd_f64
+; CHECK: fcmeq {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vceq.i = insertelement <1 x double> undef, double %a, i32 0
+ %vceq1.i = insertelement <1 x double> undef, double %b, i32 0
+ %vceq2.i = call <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1f64.v1f64(<1 x double> %vceq.i, <1 x double> %vceq1.i)
+ %0 = extractelement <1 x i64> %vceq2.i, i32 0
+ ret i64 %0
+}
+
+define <1 x i64> @test_vceqz_f64(<1 x double> %a) #0 {
+; CHECK: test_vceqz_f64
+; CHECK: fcmeq {{d[0-9]+}}, {{d[0-9]+}}, #0.0
+entry:
+ %0 = fcmp oeq <1 x double> %a, zeroinitializer
+ %vceqz.i = zext <1 x i1> %0 to <1 x i64>
+ ret <1 x i64> %vceqz.i
+}
+
+define i32 @test_vceqzs_f32(float %a) {
+; CHECK: test_vceqzs_f32
+; CHECK: fcmeq {{s[0-9]}}, {{s[0-9]}}, #0.0
+entry:
+ %vceq.i = insertelement <1 x float> undef, float %a, i32 0
+ %vceq1.i = call <1 x i32> @llvm.aarch64.neon.vceq.v1i32.v1f32.v1f32(<1 x float> %vceq.i, <1 x float> zeroinitializer)
+ %0 = extractelement <1 x i32> %vceq1.i, i32 0
+ ret i32 %0
+}
+
+define i64 @test_vceqzd_f64(double %a) {
+; CHECK: test_vceqzd_f64
+; CHECK: fcmeq {{d[0-9]}}, {{d[0-9]}}, #0.0
+entry:
+ %vceq.i = insertelement <1 x double> undef, double %a, i32 0
+ %vceq1.i = tail call <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1f64.v1f32(<1 x double> %vceq.i, <1 x float> zeroinitializer) #5
+ %0 = extractelement <1 x i64> %vceq1.i, i32 0
+ ret i64 %0
+}
+
+define i32 @test_vcges_f32(float %a, float %b) {
+; CHECK: test_vcges_f32
+; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
+entry:
+ %vcge.i = insertelement <1 x float> undef, float %a, i32 0
+ %vcge1.i = insertelement <1 x float> undef, float %b, i32 0
+ %vcge2.i = call <1 x i32> @llvm.aarch64.neon.vcge.v1i32.v1f32.v1f32(<1 x float> %vcge.i, <1 x float> %vcge1.i)
+ %0 = extractelement <1 x i32> %vcge2.i, i32 0
+ ret i32 %0
+}
+
+define i64 @test_vcged_f64(double %a, double %b) {
+; CHECK: test_vcged_f64
+; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vcge.i = insertelement <1 x double> undef, double %a, i32 0
+ %vcge1.i = insertelement <1 x double> undef, double %b, i32 0
+ %vcge2.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f64(<1 x double> %vcge.i, <1 x double> %vcge1.i)
+ %0 = extractelement <1 x i64> %vcge2.i, i32 0
+ ret i64 %0
+}
+
+define i32 @test_vcgezs_f32(float %a) {
+; CHECK: test_vcgezs_f32
+; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, #0.0
+entry:
+ %vcge.i = insertelement <1 x float> undef, float %a, i32 0
+ %vcge1.i = call <1 x i32> @llvm.aarch64.neon.vcge.v1i32.v1f32.v1f32(<1 x float> %vcge.i, <1 x float> zeroinitializer)
+ %0 = extractelement <1 x i32> %vcge1.i, i32 0
+ ret i32 %0
+}
+
+define i64 @test_vcgezd_f64(double %a) {
+; CHECK: test_vcgezd_f64
+; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, #0.0
+entry:
+ %vcge.i = insertelement <1 x double> undef, double %a, i32 0
+ %vcge1.i = tail call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f32(<1 x double> %vcge.i, <1 x float> zeroinitializer) #5
+ %0 = extractelement <1 x i64> %vcge1.i, i32 0
+ ret i64 %0
+}
+
+define i32 @test_vcgts_f32(float %a, float %b) {
+; CHECK: test_vcgts_f32
+; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
+entry:
+ %vcgt.i = insertelement <1 x float> undef, float %a, i32 0
+ %vcgt1.i = insertelement <1 x float> undef, float %b, i32 0
+ %vcgt2.i = call <1 x i32> @llvm.aarch64.neon.vcgt.v1i32.v1f32.v1f32(<1 x float> %vcgt.i, <1 x float> %vcgt1.i)
+ %0 = extractelement <1 x i32> %vcgt2.i, i32 0
+ ret i32 %0
+}
+
+define i64 @test_vcgtd_f64(double %a, double %b) {
+; CHECK: test_vcgtd_f64
+; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vcgt.i = insertelement <1 x double> undef, double %a, i32 0
+ %vcgt1.i = insertelement <1 x double> undef, double %b, i32 0
+ %vcgt2.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f64(<1 x double> %vcgt.i, <1 x double> %vcgt1.i)
+ %0 = extractelement <1 x i64> %vcgt2.i, i32 0
+ ret i64 %0
+}
+
+define i32 @test_vcgtzs_f32(float %a) {
+; CHECK: test_vcgtzs_f32
+; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, #0.0
+entry:
+ %vcgt.i = insertelement <1 x float> undef, float %a, i32 0
+ %vcgt1.i = call <1 x i32> @llvm.aarch64.neon.vcgt.v1i32.v1f32.v1f32(<1 x float> %vcgt.i, <1 x float> zeroinitializer)
+ %0 = extractelement <1 x i32> %vcgt1.i, i32 0
+ ret i32 %0
+}
+
+define i64 @test_vcgtzd_f64(double %a) {
+; CHECK: test_vcgtzd_f64
+; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, #0.0
+entry:
+ %vcgt.i = insertelement <1 x double> undef, double %a, i32 0
+ %vcgt1.i = tail call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f32(<1 x double> %vcgt.i, <1 x float> zeroinitializer) #5
+ %0 = extractelement <1 x i64> %vcgt1.i, i32 0
+ ret i64 %0
+}
+
+define i32 @test_vcles_f32(float %a, float %b) {
+; CHECK: test_vcles_f32
+; CHECK: fcmge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
+entry:
+ %vcge.i = insertelement <1 x float> undef, float %a, i32 0
+ %vcge1.i = insertelement <1 x float> undef, float %b, i32 0
+ %vcge2.i = call <1 x i32> @llvm.aarch64.neon.vcge.v1i32.v1f32.v1f32(<1 x float> %vcge.i, <1 x float> %vcge1.i)
+ %0 = extractelement <1 x i32> %vcge2.i, i32 0
+ ret i32 %0
+}
+
+define i64 @test_vcled_f64(double %a, double %b) {
+; CHECK: test_vcled_f64
+; CHECK: fcmge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vcge.i = insertelement <1 x double> undef, double %a, i32 0
+ %vcge1.i = insertelement <1 x double> undef, double %b, i32 0
+ %vcge2.i = call <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f64(<1 x double> %vcge.i, <1 x double> %vcge1.i)
+ %0 = extractelement <1 x i64> %vcge2.i, i32 0
+ ret i64 %0
+}
+
+define i32 @test_vclezs_f32(float %a) {
+; CHECK: test_vclezs_f32
+; CHECK: fcmle {{s[0-9]}}, {{s[0-9]}}, #0.0
+entry:
+ %vcle.i = insertelement <1 x float> undef, float %a, i32 0
+ %vcle1.i = call <1 x i32> @llvm.aarch64.neon.vclez.v1i32.v1f32.v1f32(<1 x float> %vcle.i, <1 x float> zeroinitializer)
+ %0 = extractelement <1 x i32> %vcle1.i, i32 0
+ ret i32 %0
+}
+
+define i64 @test_vclezd_f64(double %a) {
+; CHECK: test_vclezd_f64
+; CHECK: fcmle {{d[0-9]}}, {{d[0-9]}}, #0.0
+entry:
+ %vcle.i = insertelement <1 x double> undef, double %a, i32 0
+ %vcle1.i = tail call <1 x i64> @llvm.aarch64.neon.vclez.v1i64.v1f64.v1f32(<1 x double> %vcle.i, <1 x float> zeroinitializer) #5
+ %0 = extractelement <1 x i64> %vcle1.i, i32 0
+ ret i64 %0
+}
+
+define i32 @test_vclts_f32(float %a, float %b) {
+; CHECK: test_vclts_f32
+; CHECK: fcmgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
+entry:
+ %vcgt.i = insertelement <1 x float> undef, float %b, i32 0
+ %vcgt1.i = insertelement <1 x float> undef, float %a, i32 0
+ %vcgt2.i = call <1 x i32> @llvm.aarch64.neon.vcgt.v1i32.v1f32.v1f32(<1 x float> %vcgt.i, <1 x float> %vcgt1.i)
+ %0 = extractelement <1 x i32> %vcgt2.i, i32 0
+ ret i32 %0
+}
+
+define i64 @test_vcltd_f64(double %a, double %b) {
+; CHECK: test_vcltd_f64
+; CHECK: fcmgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vcgt.i = insertelement <1 x double> undef, double %b, i32 0
+ %vcgt1.i = insertelement <1 x double> undef, double %a, i32 0
+ %vcgt2.i = call <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f64(<1 x double> %vcgt.i, <1 x double> %vcgt1.i)
+ %0 = extractelement <1 x i64> %vcgt2.i, i32 0
+ ret i64 %0
+}
+
+define i32 @test_vcltzs_f32(float %a) {
+; CHECK: test_vcltzs_f32
+; CHECK: fcmlt {{s[0-9]}}, {{s[0-9]}}, #0.0
+entry:
+ %vclt.i = insertelement <1 x float> undef, float %a, i32 0
+ %vclt1.i = call <1 x i32> @llvm.aarch64.neon.vcltz.v1i32.v1f32.v1f32(<1 x float> %vclt.i, <1 x float> zeroinitializer)
+ %0 = extractelement <1 x i32> %vclt1.i, i32 0
+ ret i32 %0
+}
+
+define i64 @test_vcltzd_f64(double %a) {
+; CHECK: test_vcltzd_f64
+; CHECK: fcmlt {{d[0-9]}}, {{d[0-9]}}, #0.0
+entry:
+ %vclt.i = insertelement <1 x double> undef, double %a, i32 0
+ %vclt1.i = tail call <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1f64.v1f32(<1 x double> %vclt.i, <1 x float> zeroinitializer) #5
+ %0 = extractelement <1 x i64> %vclt1.i, i32 0
+ ret i64 %0
+}
+
+define i32 @test_vcages_f32(float %a, float %b) {
+; CHECK: test_vcages_f32
+; CHECK: facge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
+entry:
+ %vcage.i = insertelement <1 x float> undef, float %a, i32 0
+ %vcage1.i = insertelement <1 x float> undef, float %b, i32 0
+ %vcage2.i = call <1 x i32> @llvm.aarch64.neon.vcage.v1i32.v1f32.v1f32(<1 x float> %vcage.i, <1 x float> %vcage1.i)
+ %0 = extractelement <1 x i32> %vcage2.i, i32 0
+ ret i32 %0
+}
+
+define i64 @test_vcaged_f64(double %a, double %b) {
+; CHECK: test_vcaged_f64
+; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vcage.i = insertelement <1 x double> undef, double %a, i32 0
+ %vcage1.i = insertelement <1 x double> undef, double %b, i32 0
+ %vcage2.i = call <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double> %vcage.i, <1 x double> %vcage1.i)
+ %0 = extractelement <1 x i64> %vcage2.i, i32 0
+ ret i64 %0
+}
+
+define i32 @test_vcagts_f32(float %a, float %b) {
+; CHECK: test_vcagts_f32
+; CHECK: facgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
+entry:
+ %vcagt.i = insertelement <1 x float> undef, float %a, i32 0
+ %vcagt1.i = insertelement <1 x float> undef, float %b, i32 0
+ %vcagt2.i = call <1 x i32> @llvm.aarch64.neon.vcagt.v1i32.v1f32.v1f32(<1 x float> %vcagt.i, <1 x float> %vcagt1.i)
+ %0 = extractelement <1 x i32> %vcagt2.i, i32 0
+ ret i32 %0
+}
+
+define i64 @test_vcagtd_f64(double %a, double %b) {
+; CHECK: test_vcagtd_f64
+; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vcagt.i = insertelement <1 x double> undef, double %a, i32 0
+ %vcagt1.i = insertelement <1 x double> undef, double %b, i32 0
+ %vcagt2.i = call <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double> %vcagt.i, <1 x double> %vcagt1.i)
+ %0 = extractelement <1 x i64> %vcagt2.i, i32 0
+ ret i64 %0
+}
+
+define i32 @test_vcales_f32(float %a, float %b) {
+; CHECK: test_vcales_f32
+; CHECK: facge {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
+entry:
+ %vcage.i = insertelement <1 x float> undef, float %b, i32 0
+ %vcage1.i = insertelement <1 x float> undef, float %a, i32 0
+ %vcage2.i = call <1 x i32> @llvm.aarch64.neon.vcage.v1i32.v1f32.v1f32(<1 x float> %vcage.i, <1 x float> %vcage1.i)
+ %0 = extractelement <1 x i32> %vcage2.i, i32 0
+ ret i32 %0
+}
+
+define i64 @test_vcaled_f64(double %a, double %b) {
+; CHECK: test_vcaled_f64
+; CHECK: facge {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vcage.i = insertelement <1 x double> undef, double %b, i32 0
+ %vcage1.i = insertelement <1 x double> undef, double %a, i32 0
+ %vcage2.i = call <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double> %vcage.i, <1 x double> %vcage1.i)
+ %0 = extractelement <1 x i64> %vcage2.i, i32 0
+ ret i64 %0
+}
+
+define i32 @test_vcalts_f32(float %a, float %b) {
+; CHECK: test_vcalts_f32
+; CHECK: facgt {{s[0-9]}}, {{s[0-9]}}, {{s[0-9]}}
+entry:
+ %vcalt.i = insertelement <1 x float> undef, float %b, i32 0
+ %vcalt1.i = insertelement <1 x float> undef, float %a, i32 0
+ %vcalt2.i = call <1 x i32> @llvm.aarch64.neon.vcagt.v1i32.v1f32.v1f32(<1 x float> %vcalt.i, <1 x float> %vcalt1.i)
+ %0 = extractelement <1 x i32> %vcalt2.i, i32 0
+ ret i32 %0
+}
+
+define i64 @test_vcaltd_f64(double %a, double %b) {
+; CHECK: test_vcaltd_f64
+; CHECK: facgt {{d[0-9]}}, {{d[0-9]}}, {{d[0-9]}}
+entry:
+ %vcalt.i = insertelement <1 x double> undef, double %b, i32 0
+ %vcalt1.i = insertelement <1 x double> undef, double %a, i32 0
+ %vcalt2.i = call <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double> %vcalt.i, <1 x double> %vcalt1.i)
+ %0 = extractelement <1 x i64> %vcalt2.i, i32 0
+ ret i64 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vceq.v1i32.v1f32.v1f32(<1 x float>, <1 x float>)
+declare <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1f64.v1f32(<1 x double>, <1 x float>)
+declare <1 x i64> @llvm.aarch64.neon.vceq.v1i64.v1f64.v1f64(<1 x double>, <1 x double>)
+declare <1 x i32> @llvm.aarch64.neon.vcge.v1i32.v1f32.v1f32(<1 x float>, <1 x float>)
+declare <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f32(<1 x double>, <1 x float>)
+declare <1 x i64> @llvm.aarch64.neon.vcge.v1i64.v1f64.v1f64(<1 x double>, <1 x double>)
+declare <1 x i32> @llvm.aarch64.neon.vclez.v1i32.v1f32.v1f32(<1 x float>, <1 x float>)
+declare <1 x i64> @llvm.aarch64.neon.vclez.v1i64.v1f64.v1f32(<1 x double>, <1 x float>)
+declare <1 x i32> @llvm.aarch64.neon.vcgt.v1i32.v1f32.v1f32(<1 x float>, <1 x float>)
+declare <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f32(<1 x double>, <1 x float>)
+declare <1 x i64> @llvm.aarch64.neon.vcgt.v1i64.v1f64.v1f64(<1 x double>, <1 x double>)
+declare <1 x i32> @llvm.aarch64.neon.vcltz.v1i32.v1f32.v1f32(<1 x float>, <1 x float>)
+declare <1 x i64> @llvm.aarch64.neon.vcltz.v1i64.v1f64.v1f32(<1 x double>, <1 x float>)
+declare <1 x i32> @llvm.aarch64.neon.vcage.v1i32.v1f32.v1f32(<1 x float>, <1 x float>)
+declare <1 x i64> @llvm.aarch64.neon.vcage.v1i64.v1f64.v1f64(<1 x double>, <1 x double>)
+declare <1 x i32> @llvm.aarch64.neon.vcagt.v1i32.v1f32.v1f32(<1 x float>, <1 x float>)
+declare <1 x i64> @llvm.aarch64.neon.vcagt.v1i64.v1f64.v1f64(<1 x double>, <1 x double>)
diff --git a/test/CodeGen/AArch64/neon-scalar-mul.ll b/test/CodeGen/AArch64/neon-scalar-mul.ll
new file mode 100644
index 0000000..991037f
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-mul.ll
@@ -0,0 +1,143 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+define i16 @test_vqdmulhh_s16(i16 %a, i16 %b) {
+; CHECK: test_vqdmulhh_s16
+; CHECK: sqdmulh {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
+ %1 = insertelement <1 x i16> undef, i16 %a, i32 0
+ %2 = insertelement <1 x i16> undef, i16 %b, i32 0
+ %3 = call <1 x i16> @llvm.arm.neon.vqdmulh.v1i16(<1 x i16> %1, <1 x i16> %2)
+ %4 = extractelement <1 x i16> %3, i32 0
+ ret i16 %4
+}
+
+define i32 @test_vqdmulhs_s32(i32 %a, i32 %b) {
+; CHECK: test_vqdmulhs_s32
+; CHECK: sqdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+ %1 = insertelement <1 x i32> undef, i32 %a, i32 0
+ %2 = insertelement <1 x i32> undef, i32 %b, i32 0
+ %3 = call <1 x i32> @llvm.arm.neon.vqdmulh.v1i32(<1 x i32> %1, <1 x i32> %2)
+ %4 = extractelement <1 x i32> %3, i32 0
+ ret i32 %4
+}
+
+declare <1 x i16> @llvm.arm.neon.vqdmulh.v1i16(<1 x i16>, <1 x i16>)
+declare <1 x i32> @llvm.arm.neon.vqdmulh.v1i32(<1 x i32>, <1 x i32>)
+
+define i16 @test_vqrdmulhh_s16(i16 %a, i16 %b) {
+; CHECK: test_vqrdmulhh_s16
+; CHECK: sqrdmulh {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
+ %1 = insertelement <1 x i16> undef, i16 %a, i32 0
+ %2 = insertelement <1 x i16> undef, i16 %b, i32 0
+ %3 = call <1 x i16> @llvm.arm.neon.vqrdmulh.v1i16(<1 x i16> %1, <1 x i16> %2)
+ %4 = extractelement <1 x i16> %3, i32 0
+ ret i16 %4
+}
+
+define i32 @test_vqrdmulhs_s32(i32 %a, i32 %b) {
+; CHECK: test_vqrdmulhs_s32
+; CHECK: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+ %1 = insertelement <1 x i32> undef, i32 %a, i32 0
+ %2 = insertelement <1 x i32> undef, i32 %b, i32 0
+ %3 = call <1 x i32> @llvm.arm.neon.vqrdmulh.v1i32(<1 x i32> %1, <1 x i32> %2)
+ %4 = extractelement <1 x i32> %3, i32 0
+ ret i32 %4
+}
+
+declare <1 x i16> @llvm.arm.neon.vqrdmulh.v1i16(<1 x i16>, <1 x i16>)
+declare <1 x i32> @llvm.arm.neon.vqrdmulh.v1i32(<1 x i32>, <1 x i32>)
+
+define float @test_vmulxs_f32(float %a, float %b) {
+; CHECK: test_vmulxs_f32
+; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+ %1 = call float @llvm.aarch64.neon.vmulx.f32(float %a, float %b)
+ ret float %1
+}
+
+define double @test_vmulxd_f64(double %a, double %b) {
+; CHECK: test_vmulxd_f64
+; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+ %1 = call double @llvm.aarch64.neon.vmulx.f64(double %a, double %b)
+ ret double %1
+}
+
+declare float @llvm.aarch64.neon.vmulx.f32(float, float)
+declare double @llvm.aarch64.neon.vmulx.f64(double, double)
+
+define i32 @test_vqdmlalh_s16(i32 %a, i16 %b, i16 %c) {
+; CHECK: test_vqdmlalh_s16
+; CHECK: sqdmlal {{s[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
+entry:
+ %vqdmlal.i = insertelement <1 x i32> undef, i32 %a, i32 0
+ %vqdmlal1.i = insertelement <1 x i16> undef, i16 %b, i32 0
+ %vqdmlal2.i = insertelement <1 x i16> undef, i16 %c, i32 0
+ %vqdmlal3.i = call <1 x i32> @llvm.aarch64.neon.vqdmlal.v1i32(<1 x i32> %vqdmlal.i, <1 x i16> %vqdmlal1.i, <1 x i16> %vqdmlal2.i)
+ %0 = extractelement <1 x i32> %vqdmlal3.i, i32 0
+ ret i32 %0
+}
+
+define i64 @test_vqdmlals_s32(i64 %a, i32 %b, i32 %c) {
+; CHECK: test_vqdmlals_s32
+; CHECK: sqdmlal {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+ %vqdmlal.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vqdmlal1.i = insertelement <1 x i32> undef, i32 %b, i32 0
+ %vqdmlal2.i = insertelement <1 x i32> undef, i32 %c, i32 0
+ %vqdmlal3.i = call <1 x i64> @llvm.aarch64.neon.vqdmlal.v1i64(<1 x i64> %vqdmlal.i, <1 x i32> %vqdmlal1.i, <1 x i32> %vqdmlal2.i)
+ %0 = extractelement <1 x i64> %vqdmlal3.i, i32 0
+ ret i64 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vqdmlal.v1i32(<1 x i32>, <1 x i16>, <1 x i16>)
+declare <1 x i64> @llvm.aarch64.neon.vqdmlal.v1i64(<1 x i64>, <1 x i32>, <1 x i32>)
+
+define i32 @test_vqdmlslh_s16(i32 %a, i16 %b, i16 %c) {
+; CHECK: test_vqdmlslh_s16
+; CHECK: sqdmlsl {{s[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
+entry:
+ %vqdmlsl.i = insertelement <1 x i32> undef, i32 %a, i32 0
+ %vqdmlsl1.i = insertelement <1 x i16> undef, i16 %b, i32 0
+ %vqdmlsl2.i = insertelement <1 x i16> undef, i16 %c, i32 0
+ %vqdmlsl3.i = call <1 x i32> @llvm.aarch64.neon.vqdmlsl.v1i32(<1 x i32> %vqdmlsl.i, <1 x i16> %vqdmlsl1.i, <1 x i16> %vqdmlsl2.i)
+ %0 = extractelement <1 x i32> %vqdmlsl3.i, i32 0
+ ret i32 %0
+}
+
+define i64 @test_vqdmlsls_s32(i64 %a, i32 %b, i32 %c) {
+; CHECK: test_vqdmlsls_s32
+; CHECK: sqdmlsl {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+ %vqdmlsl.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vqdmlsl1.i = insertelement <1 x i32> undef, i32 %b, i32 0
+ %vqdmlsl2.i = insertelement <1 x i32> undef, i32 %c, i32 0
+ %vqdmlsl3.i = call <1 x i64> @llvm.aarch64.neon.vqdmlsl.v1i64(<1 x i64> %vqdmlsl.i, <1 x i32> %vqdmlsl1.i, <1 x i32> %vqdmlsl2.i)
+ %0 = extractelement <1 x i64> %vqdmlsl3.i, i32 0
+ ret i64 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vqdmlsl.v1i32(<1 x i32>, <1 x i16>, <1 x i16>)
+declare <1 x i64> @llvm.aarch64.neon.vqdmlsl.v1i64(<1 x i64>, <1 x i32>, <1 x i32>)
+
+define i32 @test_vqdmullh_s16(i16 %a, i16 %b) {
+; CHECK: test_vqdmullh_s16
+; CHECK: sqdmull {{s[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
+entry:
+ %vqdmull.i = insertelement <1 x i16> undef, i16 %a, i32 0
+ %vqdmull1.i = insertelement <1 x i16> undef, i16 %b, i32 0
+ %vqdmull2.i = call <1 x i32> @llvm.arm.neon.vqdmull.v1i32(<1 x i16> %vqdmull.i, <1 x i16> %vqdmull1.i)
+ %0 = extractelement <1 x i32> %vqdmull2.i, i32 0
+ ret i32 %0
+}
+
+define i64 @test_vqdmulls_s32(i32 %a, i32 %b) {
+; CHECK: test_vqdmulls_s32
+; CHECK: sqdmull {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+ %vqdmull.i = insertelement <1 x i32> undef, i32 %a, i32 0
+ %vqdmull1.i = insertelement <1 x i32> undef, i32 %b, i32 0
+ %vqdmull2.i = call <1 x i64> @llvm.arm.neon.vqdmull.v1i64(<1 x i32> %vqdmull.i, <1 x i32> %vqdmull1.i)
+ %0 = extractelement <1 x i64> %vqdmull2.i, i32 0
+ ret i64 %0
+}
+
+declare <1 x i32> @llvm.arm.neon.vqdmull.v1i32(<1 x i16>, <1 x i16>)
+declare <1 x i64> @llvm.arm.neon.vqdmull.v1i64(<1 x i32>, <1 x i32>)
diff --git a/test/CodeGen/AArch64/neon-scalar-neg.ll b/test/CodeGen/AArch64/neon-scalar-neg.ll
new file mode 100644
index 0000000..4dc9d51
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-neg.ll
@@ -0,0 +1,61 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define i64 @test_vnegd_s64(i64 %a) {
+; CHECK: test_vnegd_s64
+; CHECK: neg {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+ %vneg.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vneg1.i = tail call <1 x i64> @llvm.aarch64.neon.vneg(<1 x i64> %vneg.i)
+ %0 = extractelement <1 x i64> %vneg1.i, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vneg(<1 x i64>)
+
+define i8 @test_vqnegb_s8(i8 %a) {
+; CHECK: test_vqnegb_s8
+; CHECK: sqneg {{b[0-9]+}}, {{b[0-9]+}}
+entry:
+ %vqneg.i = insertelement <1 x i8> undef, i8 %a, i32 0
+ %vqneg1.i = call <1 x i8> @llvm.arm.neon.vqneg.v1i8(<1 x i8> %vqneg.i)
+ %0 = extractelement <1 x i8> %vqneg1.i, i32 0
+ ret i8 %0
+}
+
+declare <1 x i8> @llvm.arm.neon.vqneg.v1i8(<1 x i8>)
+
+define i16 @test_vqnegh_s16(i16 %a) {
+; CHECK: test_vqnegh_s16
+; CHECK: sqneg {{h[0-9]+}}, {{h[0-9]+}}
+entry:
+ %vqneg.i = insertelement <1 x i16> undef, i16 %a, i32 0
+ %vqneg1.i = call <1 x i16> @llvm.arm.neon.vqneg.v1i16(<1 x i16> %vqneg.i)
+ %0 = extractelement <1 x i16> %vqneg1.i, i32 0
+ ret i16 %0
+}
+
+declare <1 x i16> @llvm.arm.neon.vqneg.v1i16(<1 x i16>)
+
+define i32 @test_vqnegs_s32(i32 %a) {
+; CHECK: test_vqnegs_s32
+; CHECK: sqneg {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+ %vqneg.i = insertelement <1 x i32> undef, i32 %a, i32 0
+ %vqneg1.i = call <1 x i32> @llvm.arm.neon.vqneg.v1i32(<1 x i32> %vqneg.i)
+ %0 = extractelement <1 x i32> %vqneg1.i, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.arm.neon.vqneg.v1i32(<1 x i32>)
+
+define i64 @test_vqnegd_s64(i64 %a) {
+; CHECK: test_vqnegd_s64
+; CHECK: sqneg {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+ %vqneg.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vqneg1.i = call <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64> %vqneg.i)
+ %0 = extractelement <1 x i64> %vqneg1.i, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.arm.neon.vqneg.v1i64(<1 x i64>)
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-scalar-recip.ll b/test/CodeGen/AArch64/neon-scalar-recip.ll
new file mode 100644
index 0000000..f21c27b
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-recip.ll
@@ -0,0 +1,116 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+define float @test_vrecpss_f32(float %a, float %b) {
+; CHECK: test_vrecpss_f32
+; CHECK: frecps {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+ %1 = insertelement <1 x float> undef, float %a, i32 0
+ %2 = insertelement <1 x float> undef, float %b, i32 0
+ %3 = call <1 x float> @llvm.arm.neon.vrecps.v1f32(<1 x float> %1, <1 x float> %2)
+ %4 = extractelement <1 x float> %3, i32 0
+ ret float %4
+}
+
+define double @test_vrecpsd_f64(double %a, double %b) {
+; CHECK: test_vrecpsd_f64
+; CHECK: frecps {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+ %1 = insertelement <1 x double> undef, double %a, i32 0
+ %2 = insertelement <1 x double> undef, double %b, i32 0
+ %3 = call <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double> %1, <1 x double> %2)
+ %4 = extractelement <1 x double> %3, i32 0
+ ret double %4
+}
+
+declare <1 x float> @llvm.arm.neon.vrecps.v1f32(<1 x float>, <1 x float>)
+declare <1 x double> @llvm.arm.neon.vrecps.v1f64(<1 x double>, <1 x double>)
+
+define float @test_vrsqrtss_f32(float %a, float %b) {
+; CHECK: test_vrsqrtss_f32
+; CHECK: frsqrts {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+ %1 = insertelement <1 x float> undef, float %a, i32 0
+ %2 = insertelement <1 x float> undef, float %b, i32 0
+ %3 = call <1 x float> @llvm.arm.neon.vrsqrts.v1f32(<1 x float> %1, <1 x float> %2)
+ %4 = extractelement <1 x float> %3, i32 0
+ ret float %4
+}
+
+define double @test_vrsqrtsd_f64(double %a, double %b) {
+; CHECK: test_vrsqrtsd_f64
+; CHECK: frsqrts {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+ %1 = insertelement <1 x double> undef, double %a, i32 0
+ %2 = insertelement <1 x double> undef, double %b, i32 0
+ %3 = call <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double> %1, <1 x double> %2)
+ %4 = extractelement <1 x double> %3, i32 0
+ ret double %4
+}
+
+declare <1 x float> @llvm.arm.neon.vrsqrts.v1f32(<1 x float>, <1 x float>)
+declare <1 x double> @llvm.arm.neon.vrsqrts.v1f64(<1 x double>, <1 x double>)
+
+define float @test_vrecpes_f32(float %a) {
+; CHECK: test_vrecpes_f32
+; CHECK: frecpe {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+ %vrecpe.i = insertelement <1 x float> undef, float %a, i32 0
+ %vrecpe1.i = tail call <1 x float> @llvm.arm.neon.vrecpe.v1f32(<1 x float> %vrecpe.i)
+ %0 = extractelement <1 x float> %vrecpe1.i, i32 0
+ ret float %0
+}
+
+define double @test_vrecped_f64(double %a) {
+; CHECK: test_vrecped_f64
+; CHECK: frecpe {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+ %vrecpe.i = insertelement <1 x double> undef, double %a, i32 0
+ %vrecpe1.i = tail call <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double> %vrecpe.i)
+ %0 = extractelement <1 x double> %vrecpe1.i, i32 0
+ ret double %0
+}
+
+declare <1 x float> @llvm.arm.neon.vrecpe.v1f32(<1 x float>)
+declare <1 x double> @llvm.arm.neon.vrecpe.v1f64(<1 x double>)
+
+define float @test_vrecpxs_f32(float %a) {
+; CHECK: test_vrecpxs_f32
+; CHECK: frecpx {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+ %vrecpx.i = insertelement <1 x float> undef, float %a, i32 0
+ %vrecpx1.i = tail call <1 x float> @llvm.aarch64.neon.vrecpx.v1f32(<1 x float> %vrecpx.i)
+ %0 = extractelement <1 x float> %vrecpx1.i, i32 0
+ ret float %0
+}
+
+define double @test_vrecpxd_f64(double %a) {
+; CHECK: test_vrecpxd_f64
+; CHECK: frecpx {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+ %vrecpx.i = insertelement <1 x double> undef, double %a, i32 0
+ %vrecpx1.i = tail call <1 x double> @llvm.aarch64.neon.vrecpx.v1f64(<1 x double> %vrecpx.i)
+ %0 = extractelement <1 x double> %vrecpx1.i, i32 0
+ ret double %0
+}
+
+declare <1 x float> @llvm.aarch64.neon.vrecpx.v1f32(<1 x float>)
+declare <1 x double> @llvm.aarch64.neon.vrecpx.v1f64(<1 x double>)
+
+define float @test_vrsqrtes_f32(float %a) {
+; CHECK: test_vrsqrtes_f32
+; CHECK: frsqrte {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+ %vrsqrte.i = insertelement <1 x float> undef, float %a, i32 0
+ %vrsqrte1.i = tail call <1 x float> @llvm.arm.neon.vrsqrte.v1f32(<1 x float> %vrsqrte.i)
+ %0 = extractelement <1 x float> %vrsqrte1.i, i32 0
+ ret float %0
+}
+
+define double @test_vrsqrted_f64(double %a) {
+; CHECK: test_vrsqrted_f64
+; CHECK: frsqrte {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+ %vrsqrte.i = insertelement <1 x double> undef, double %a, i32 0
+ %vrsqrte1.i = tail call <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double> %vrsqrte.i)
+ %0 = extractelement <1 x double> %vrsqrte1.i, i32 0
+ ret double %0
+}
+
+declare <1 x float> @llvm.arm.neon.vrsqrte.v1f32(<1 x float>)
+declare <1 x double> @llvm.arm.neon.vrsqrte.v1f64(<1 x double>)
diff --git a/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll b/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll
new file mode 100644
index 0000000..80e8dc3
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll
@@ -0,0 +1,247 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+declare <1 x i64> @llvm.aarch64.neon.vpadd(<2 x i64>)
+
+define <1 x i64> @test_addp_v1i64(<2 x i64> %a) {
+; CHECK: test_addp_v1i64:
+ %val = call <1 x i64> @llvm.aarch64.neon.vpadd(<2 x i64> %a)
+; CHECK: addp d0, v0.2d
+ ret <1 x i64> %val
+}
+
+declare <1 x float> @llvm.aarch64.neon.vpfadd(<2 x float>)
+
+define <1 x float> @test_faddp_v1f32(<2 x float> %a) {
+; CHECK: test_faddp_v1f32:
+ %val = call <1 x float> @llvm.aarch64.neon.vpfadd(<2 x float> %a)
+; CHECK: faddp s0, v0.2s
+ ret <1 x float> %val
+}
+
+declare <1 x double> @llvm.aarch64.neon.vpfaddq(<2 x double>)
+
+define <1 x double> @test_faddp_v1f64(<2 x double> %a) {
+; CHECK: test_faddp_v1f64:
+ %val = call <1 x double> @llvm.aarch64.neon.vpfaddq(<2 x double> %a)
+; CHECK: faddp d0, v0.2d
+ ret <1 x double> %val
+}
+
+
+declare <1 x float> @llvm.aarch64.neon.vpmax(<2 x float>)
+
+define <1 x float> @test_fmaxp_v1f32(<2 x float> %a) {
+; CHECK: test_fmaxp_v1f32:
+ %val = call <1 x float> @llvm.aarch64.neon.vpmax(<2 x float> %a)
+; CHECK: fmaxp s0, v0.2s
+ ret <1 x float> %val
+}
+
+declare <1 x double> @llvm.aarch64.neon.vpmaxq(<2 x double>)
+
+define <1 x double> @test_fmaxp_v1f64(<2 x double> %a) {
+; CHECK: test_fmaxp_v1f64:
+ %val = call <1 x double> @llvm.aarch64.neon.vpmaxq(<2 x double> %a)
+; CHECK: fmaxp d0, v0.2d
+ ret <1 x double> %val
+}
+
+
+declare <1 x float> @llvm.aarch64.neon.vpmin(<2 x float>)
+
+define <1 x float> @test_fminp_v1f32(<2 x float> %a) {
+; CHECK: test_fminp_v1f32:
+ %val = call <1 x float> @llvm.aarch64.neon.vpmin(<2 x float> %a)
+; CHECK: fminp s0, v0.2s
+ ret <1 x float> %val
+}
+
+declare <1 x double> @llvm.aarch64.neon.vpminq(<2 x double>)
+
+define <1 x double> @test_fminp_v1f64(<2 x double> %a) {
+; CHECK: test_fminp_v1f64:
+ %val = call <1 x double> @llvm.aarch64.neon.vpminq(<2 x double> %a)
+; CHECK: fminp d0, v0.2d
+ ret <1 x double> %val
+}
+
+declare <1 x float> @llvm.aarch64.neon.vpfmaxnm(<2 x float>)
+
+define <1 x float> @test_fmaxnmp_v1f32(<2 x float> %a) {
+; CHECK: test_fmaxnmp_v1f32:
+ %val = call <1 x float> @llvm.aarch64.neon.vpfmaxnm(<2 x float> %a)
+; CHECK: fmaxnmp s0, v0.2s
+ ret <1 x float> %val
+}
+
+declare <1 x double> @llvm.aarch64.neon.vpfmaxnmq(<2 x double>)
+
+define <1 x double> @test_fmaxnmp_v1f64(<2 x double> %a) {
+; CHECK: test_fmaxnmp_v1f64:
+ %val = call <1 x double> @llvm.aarch64.neon.vpfmaxnmq(<2 x double> %a)
+; CHECK: fmaxnmp d0, v0.2d
+ ret <1 x double> %val
+}
+
+declare <1 x float> @llvm.aarch64.neon.vpfminnm(<2 x float>)
+
+define <1 x float> @test_fminnmp_v1f32(<2 x float> %a) {
+; CHECK: test_fminnmp_v1f32:
+ %val = call <1 x float> @llvm.aarch64.neon.vpfminnm(<2 x float> %a)
+; CHECK: fminnmp s0, v0.2s
+ ret <1 x float> %val
+}
+
+declare <1 x double> @llvm.aarch64.neon.vpfminnmq(<2 x double>)
+
+define <1 x double> @test_fminnmp_v1f64(<2 x double> %a) {
+; CHECK: test_fminnmp_v1f64:
+ %val = call <1 x double> @llvm.aarch64.neon.vpfminnmq(<2 x double> %a)
+; CHECK: fminnmp d0, v0.2d
+ ret <1 x double> %val
+}
+
+define float @test_vaddv_f32(<2 x float> %a) {
+; CHECK-LABEL: test_vaddv_f32
+; CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s
+ %1 = tail call <1 x float> @llvm.aarch64.neon.vaddv.v1f32.v2f32(<2 x float> %a)
+ %2 = extractelement <1 x float> %1, i32 0
+ ret float %2
+}
+
+define float @test_vaddvq_f32(<4 x float> %a) {
+; CHECK-LABEL: test_vaddvq_f32
+; CHECK: faddp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s
+ %1 = tail call <1 x float> @llvm.aarch64.neon.vaddv.v1f32.v4f32(<4 x float> %a)
+ %2 = extractelement <1 x float> %1, i32 0
+ ret float %2
+}
+
+define double @test_vaddvq_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vaddvq_f64
+; CHECK: faddp {{d[0-9]+}}, {{v[0-9]+}}.2d
+ %1 = tail call <1 x double> @llvm.aarch64.neon.vaddv.v1f64.v2f64(<2 x double> %a)
+ %2 = extractelement <1 x double> %1, i32 0
+ ret double %2
+}
+
+define float @test_vmaxv_f32(<2 x float> %a) {
+; CHECK-LABEL: test_vmaxv_f32
+; CHECK: fmaxp {{s[0-9]+}}, {{v[0-9]+}}.2s
+ %1 = tail call <1 x float> @llvm.aarch64.neon.vmaxv.v1f32.v2f32(<2 x float> %a)
+ %2 = extractelement <1 x float> %1, i32 0
+ ret float %2
+}
+
+define double @test_vmaxvq_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vmaxvq_f64
+; CHECK: fmaxp {{d[0-9]+}}, {{v[0-9]+}}.2d
+ %1 = tail call <1 x double> @llvm.aarch64.neon.vmaxv.v1f64.v2f64(<2 x double> %a)
+ %2 = extractelement <1 x double> %1, i32 0
+ ret double %2
+}
+
+define float @test_vminv_f32(<2 x float> %a) {
+; CHECK-LABEL: test_vminv_f32
+; CHECK: fminp {{s[0-9]+}}, {{v[0-9]+}}.2s
+ %1 = tail call <1 x float> @llvm.aarch64.neon.vminv.v1f32.v2f32(<2 x float> %a)
+ %2 = extractelement <1 x float> %1, i32 0
+ ret float %2
+}
+
+define double @test_vminvq_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vminvq_f64
+; CHECK: fminp {{d[0-9]+}}, {{v[0-9]+}}.2d
+ %1 = tail call <1 x double> @llvm.aarch64.neon.vminv.v1f64.v2f64(<2 x double> %a)
+ %2 = extractelement <1 x double> %1, i32 0
+ ret double %2
+}
+
+define double @test_vmaxnmvq_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vmaxnmvq_f64
+; CHECK: fmaxnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
+ %1 = tail call <1 x double> @llvm.aarch64.neon.vmaxnmv.v1f64.v2f64(<2 x double> %a)
+ %2 = extractelement <1 x double> %1, i32 0
+ ret double %2
+}
+
+define float @test_vmaxnmv_f32(<2 x float> %a) {
+; CHECK-LABEL: test_vmaxnmv_f32
+; CHECK: fmaxnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
+ %1 = tail call <1 x float> @llvm.aarch64.neon.vmaxnmv.v1f32.v2f32(<2 x float> %a)
+ %2 = extractelement <1 x float> %1, i32 0
+ ret float %2
+}
+
+define double @test_vminnmvq_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vminnmvq_f64
+; CHECK: fminnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
+ %1 = tail call <1 x double> @llvm.aarch64.neon.vminnmv.v1f64.v2f64(<2 x double> %a)
+ %2 = extractelement <1 x double> %1, i32 0
+ ret double %2
+}
+
+define float @test_vminnmv_f32(<2 x float> %a) {
+; CHECK-LABEL: test_vminnmv_f32
+; CHECK: fminnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
+ %1 = tail call <1 x float> @llvm.aarch64.neon.vminnmv.v1f32.v2f32(<2 x float> %a)
+ %2 = extractelement <1 x float> %1, i32 0
+ ret float %2
+}
+
+define <2 x i64> @test_vpaddq_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vpaddq_s64
+; CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %1 = tail call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %a, <2 x i64> %b)
+ ret <2 x i64> %1
+}
+
+define <2 x i64> @test_vpaddq_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vpaddq_u64
+; CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %1 = tail call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %a, <2 x i64> %b)
+ ret <2 x i64> %1
+}
+
+define i64 @test_vaddvq_s64(<2 x i64> %a) {
+; CHECK-LABEL: test_vaddvq_s64
+; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d
+ %1 = tail call <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64> %a)
+ %2 = extractelement <1 x i64> %1, i32 0
+ ret i64 %2
+}
+
+define i64 @test_vaddvq_u64(<2 x i64> %a) {
+; CHECK-LABEL: test_vaddvq_u64
+; CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d
+ %1 = tail call <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64> %a)
+ %2 = extractelement <1 x i64> %1, i32 0
+ ret i64 %2
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vaddv.v1i64.v2i64(<2 x i64>)
+
+declare <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64>, <2 x i64>)
+
+declare <1 x float> @llvm.aarch64.neon.vminnmv.v1f32.v2f32(<2 x float>)
+
+declare <1 x double> @llvm.aarch64.neon.vminnmv.v1f64.v2f64(<2 x double>)
+
+declare <1 x float> @llvm.aarch64.neon.vmaxnmv.v1f32.v2f32(<2 x float>)
+
+declare <1 x double> @llvm.aarch64.neon.vmaxnmv.v1f64.v2f64(<2 x double>)
+
+declare <1 x double> @llvm.aarch64.neon.vminv.v1f64.v2f64(<2 x double>)
+
+declare <1 x float> @llvm.aarch64.neon.vminv.v1f32.v2f32(<2 x float>)
+
+declare <1 x double> @llvm.aarch64.neon.vmaxv.v1f64.v2f64(<2 x double>)
+
+declare <1 x float> @llvm.aarch64.neon.vmaxv.v1f32.v2f32(<2 x float>)
+
+declare <1 x double> @llvm.aarch64.neon.vaddv.v1f64.v2f64(<2 x double>)
+
+declare <1 x float> @llvm.aarch64.neon.vaddv.v1f32.v4f32(<4 x float>)
+
+declare <1 x float> @llvm.aarch64.neon.vaddv.v1f32.v2f32(<2 x float>)
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll b/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll
new file mode 100644
index 0000000..83ceb4e
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll
@@ -0,0 +1,39 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+
+declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_urshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_urshl_v1i64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+;CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_srshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_srshl_v1i64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+ ret <1 x i64> %tmp1
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vrshldu(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vrshlds(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_urshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_urshl_v1i64_aarch64:
+ %tmp1 = call <1 x i64> @llvm.aarch64.neon.vrshldu(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_srshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_srshl_v1i64_aarch64:
+ %tmp1 = call <1 x i64> @llvm.aarch64.neon.vrshlds(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+ ret <1 x i64> %tmp1
+}
+
+
+
diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll b/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll
new file mode 100644
index 0000000..bd66f80
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll
@@ -0,0 +1,242 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
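+; Scalar saturating add/sub: <1 x i8/i16/i32/i64> operands should select
+; uqadd/sqadd/uqsub/sqsub on the b/h/s/d registers. The suqadd/usqadd tests
+; below build the <1 x iN> value from a scalar with insertelement, call the
+; intrinsic, and read the result back out of lane 0.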
+
+declare <1 x i8> @llvm.arm.neon.vqaddu.v1i8(<1 x i8>, <1 x i8>)
+declare <1 x i8> @llvm.arm.neon.vqadds.v1i8(<1 x i8>, <1 x i8>)
+
+define <1 x i8> @test_uqadd_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
+; CHECK: test_uqadd_v1i8_aarch64:
+ %tmp1 = call <1 x i8> @llvm.arm.neon.vqaddu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
+; CHECK: uqadd {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
+ ret <1 x i8> %tmp1
+}
+
+define <1 x i8> @test_sqadd_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
+; CHECK: test_sqadd_v1i8_aarch64:
+ %tmp1 = call <1 x i8> @llvm.arm.neon.vqadds.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
+; CHECK: sqadd {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
+ ret <1 x i8> %tmp1
+}
+
+declare <1 x i8> @llvm.arm.neon.vqsubu.v1i8(<1 x i8>, <1 x i8>)
+declare <1 x i8> @llvm.arm.neon.vqsubs.v1i8(<1 x i8>, <1 x i8>)
+
+define <1 x i8> @test_uqsub_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
+; CHECK: test_uqsub_v1i8_aarch64:
+ %tmp1 = call <1 x i8> @llvm.arm.neon.vqsubu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
+; CHECK: uqsub {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
+ ret <1 x i8> %tmp1
+}
+
+define <1 x i8> @test_sqsub_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
+; CHECK: test_sqsub_v1i8_aarch64:
+ %tmp1 = call <1 x i8> @llvm.arm.neon.vqsubs.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
+; CHECK: sqsub {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
+ ret <1 x i8> %tmp1
+}
+
+declare <1 x i16> @llvm.arm.neon.vqaddu.v1i16(<1 x i16>, <1 x i16>)
+declare <1 x i16> @llvm.arm.neon.vqadds.v1i16(<1 x i16>, <1 x i16>)
+
+define <1 x i16> @test_uqadd_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
+; CHECK: test_uqadd_v1i16_aarch64:
+ %tmp1 = call <1 x i16> @llvm.arm.neon.vqaddu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
+; CHECK: uqadd {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
+ ret <1 x i16> %tmp1
+}
+
+define <1 x i16> @test_sqadd_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
+; CHECK: test_sqadd_v1i16_aarch64:
+ %tmp1 = call <1 x i16> @llvm.arm.neon.vqadds.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
+; CHECK: sqadd {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
+ ret <1 x i16> %tmp1
+}
+
+declare <1 x i16> @llvm.arm.neon.vqsubu.v1i16(<1 x i16>, <1 x i16>)
+declare <1 x i16> @llvm.arm.neon.vqsubs.v1i16(<1 x i16>, <1 x i16>)
+
+define <1 x i16> @test_uqsub_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
+; CHECK: test_uqsub_v1i16_aarch64:
+ %tmp1 = call <1 x i16> @llvm.arm.neon.vqsubu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
+; CHECK: uqsub {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
+ ret <1 x i16> %tmp1
+}
+
+define <1 x i16> @test_sqsub_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
+; CHECK: test_sqsub_v1i16_aarch64:
+ %tmp1 = call <1 x i16> @llvm.arm.neon.vqsubs.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
+; CHECK: sqsub {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
+ ret <1 x i16> %tmp1
+}
+
+declare <1 x i32> @llvm.arm.neon.vqaddu.v1i32(<1 x i32>, <1 x i32>)
+declare <1 x i32> @llvm.arm.neon.vqadds.v1i32(<1 x i32>, <1 x i32>)
+
+define <1 x i32> @test_uqadd_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
+; CHECK: test_uqadd_v1i32_aarch64:
+ %tmp1 = call <1 x i32> @llvm.arm.neon.vqaddu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
+; CHECK: uqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+ ret <1 x i32> %tmp1
+}
+
+define <1 x i32> @test_sqadd_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
+; CHECK: test_sqadd_v1i32_aarch64:
+ %tmp1 = call <1 x i32> @llvm.arm.neon.vqadds.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
+; CHECK: sqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+ ret <1 x i32> %tmp1
+}
+
+declare <1 x i32> @llvm.arm.neon.vqsubu.v1i32(<1 x i32>, <1 x i32>)
+declare <1 x i32> @llvm.arm.neon.vqsubs.v1i32(<1 x i32>, <1 x i32>)
+
+define <1 x i32> @test_uqsub_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
+; CHECK: test_uqsub_v1i32_aarch64:
+ %tmp1 = call <1 x i32> @llvm.arm.neon.vqsubu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
+; CHECK: uqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+ ret <1 x i32> %tmp1
+}
+
+
+define <1 x i32> @test_sqsub_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
+; CHECK: test_sqsub_v1i32_aarch64:
+ %tmp1 = call <1 x i32> @llvm.arm.neon.vqsubs.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
+; CHECK: sqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+ ret <1 x i32> %tmp1
+}
+
+declare <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_uqadd_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_uqadd_v1i64_aarch64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: uqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sqadd_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sqadd_v1i64_aarch64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: sqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+ ret <1 x i64> %tmp1
+}
+
+declare <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_uqsub_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_uqsub_v1i64_aarch64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: uqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sqsub_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sqsub_v1i64_aarch64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: sqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+ ret <1 x i64> %tmp1
+}
+
+define i8 @test_vuqaddb_s8(i8 %a, i8 %b) {
+; CHECK: test_vuqaddb_s8
+; CHECK: suqadd {{b[0-9]+}}, {{b[0-9]+}}
+entry:
+ %vuqadd.i = insertelement <1 x i8> undef, i8 %a, i32 0
+ %vuqadd1.i = insertelement <1 x i8> undef, i8 %b, i32 0
+ %vuqadd2.i = call <1 x i8> @llvm.aarch64.neon.vuqadd.v1i8(<1 x i8> %vuqadd.i, <1 x i8> %vuqadd1.i)
+ %0 = extractelement <1 x i8> %vuqadd2.i, i32 0
+ ret i8 %0
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vsqadd.v1i8(<1 x i8>, <1 x i8>)
+
+define i16 @test_vuqaddh_s16(i16 %a, i16 %b) {
+; CHECK: test_vuqaddh_s16
+; CHECK: suqadd {{h[0-9]+}}, {{h[0-9]+}}
+entry:
+ %vuqadd.i = insertelement <1 x i16> undef, i16 %a, i32 0
+ %vuqadd1.i = insertelement <1 x i16> undef, i16 %b, i32 0
+ %vuqadd2.i = call <1 x i16> @llvm.aarch64.neon.vuqadd.v1i16(<1 x i16> %vuqadd.i, <1 x i16> %vuqadd1.i)
+ %0 = extractelement <1 x i16> %vuqadd2.i, i32 0
+ ret i16 %0
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vsqadd.v1i16(<1 x i16>, <1 x i16>)
+
+define i32 @test_vuqadds_s32(i32 %a, i32 %b) {
+; CHECK: test_vuqadds_s32
+; CHECK: suqadd {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+ %vuqadd.i = insertelement <1 x i32> undef, i32 %a, i32 0
+ %vuqadd1.i = insertelement <1 x i32> undef, i32 %b, i32 0
+ %vuqadd2.i = call <1 x i32> @llvm.aarch64.neon.vuqadd.v1i32(<1 x i32> %vuqadd.i, <1 x i32> %vuqadd1.i)
+ %0 = extractelement <1 x i32> %vuqadd2.i, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vsqadd.v1i32(<1 x i32>, <1 x i32>)
+
+define i64 @test_vuqaddd_s64(i64 %a, i64 %b) {
+; CHECK: test_vuqaddd_s64
+; CHECK: suqadd {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+ %vuqadd.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vuqadd1.i = insertelement <1 x i64> undef, i64 %b, i32 0
+ %vuqadd2.i = call <1 x i64> @llvm.aarch64.neon.vuqadd.v1i64(<1 x i64> %vuqadd.i, <1 x i64> %vuqadd1.i)
+ %0 = extractelement <1 x i64> %vuqadd2.i, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vsqadd.v1i64(<1 x i64>, <1 x i64>)
+
+define i8 @test_vsqaddb_u8(i8 %a, i8 %b) {
+; CHECK: test_vsqaddb_u8
+; CHECK: usqadd {{b[0-9]+}}, {{b[0-9]+}}
+entry:
+ %vsqadd.i = insertelement <1 x i8> undef, i8 %a, i32 0
+ %vsqadd1.i = insertelement <1 x i8> undef, i8 %b, i32 0
+ %vsqadd2.i = call <1 x i8> @llvm.aarch64.neon.vsqadd.v1i8(<1 x i8> %vsqadd.i, <1 x i8> %vsqadd1.i)
+ %0 = extractelement <1 x i8> %vsqadd2.i, i32 0
+ ret i8 %0
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vuqadd.v1i8(<1 x i8>, <1 x i8>)
+
+define i16 @test_vsqaddh_u16(i16 %a, i16 %b) {
+; CHECK: test_vsqaddh_u16
+; CHECK: usqadd {{h[0-9]+}}, {{h[0-9]+}}
+entry:
+ %vsqadd.i = insertelement <1 x i16> undef, i16 %a, i32 0
+ %vsqadd1.i = insertelement <1 x i16> undef, i16 %b, i32 0
+ %vsqadd2.i = call <1 x i16> @llvm.aarch64.neon.vsqadd.v1i16(<1 x i16> %vsqadd.i, <1 x i16> %vsqadd1.i)
+ %0 = extractelement <1 x i16> %vsqadd2.i, i32 0
+ ret i16 %0
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vuqadd.v1i16(<1 x i16>, <1 x i16>)
+
+define i32 @test_vsqadds_u32(i32 %a, i32 %b) {
+; CHECK: test_vsqadds_u32
+; CHECK: usqadd {{s[0-9]+}}, {{s[0-9]+}}
+entry:
+ %vsqadd.i = insertelement <1 x i32> undef, i32 %a, i32 0
+ %vsqadd1.i = insertelement <1 x i32> undef, i32 %b, i32 0
+ %vsqadd2.i = call <1 x i32> @llvm.aarch64.neon.vsqadd.v1i32(<1 x i32> %vsqadd.i, <1 x i32> %vsqadd1.i)
+ %0 = extractelement <1 x i32> %vsqadd2.i, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vuqadd.v1i32(<1 x i32>, <1 x i32>)
+
+define i64 @test_vsqaddd_u64(i64 %a, i64 %b) {
+; CHECK: test_vsqaddd_u64
+; CHECK: usqadd {{d[0-9]+}}, {{d[0-9]+}}
+entry:
+ %vsqadd.i = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vsqadd1.i = insertelement <1 x i64> undef, i64 %b, i32 0
+ %vsqadd2.i = call <1 x i64> @llvm.aarch64.neon.vsqadd.v1i64(<1 x i64> %vsqadd.i, <1 x i64> %vsqadd1.i)
+ %0 = extractelement <1 x i64> %vsqadd2.i, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vuqadd.v1i64(<1 x i64>, <1 x i64>)
diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll b/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll
new file mode 100644
index 0000000..0fd67df
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll
@@ -0,0 +1,94 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
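+; Scalar saturating rounding shifts: uqrshl/sqrshl should be selected on the
+; b/h/s/d registers, with both the llvm.arm.neon and llvm.aarch64.neon
+; spellings of the intrinsic covered for the i64 case.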
+
+declare <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_uqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_uqrshl_v1i64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sqrshl_v1i64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+ ret <1 x i64> %tmp1
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vqrshlu.v1i8(<1 x i8>, <1 x i8>)
+declare <1 x i8> @llvm.aarch64.neon.vqrshls.v1i8(<1 x i8>, <1 x i8>)
+
+define <1 x i8> @test_uqrshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
+; CHECK: test_uqrshl_v1i8_aarch64:
+ %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqrshlu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
+; CHECK: uqrshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
+
+ ret <1 x i8> %tmp1
+}
+
+define <1 x i8> @test_sqrshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
+; CHECK: test_sqrshl_v1i8_aarch64:
+ %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqrshls.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
+; CHECK: sqrshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
+ ret <1 x i8> %tmp1
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vqrshlu.v1i16(<1 x i16>, <1 x i16>)
+declare <1 x i16> @llvm.aarch64.neon.vqrshls.v1i16(<1 x i16>, <1 x i16>)
+
+define <1 x i16> @test_uqrshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
+; CHECK: test_uqrshl_v1i16_aarch64:
+ %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqrshlu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
+; CHECK: uqrshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
+
+ ret <1 x i16> %tmp1
+}
+
+define <1 x i16> @test_sqrshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
+; CHECK: test_sqrshl_v1i16_aarch64:
+ %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqrshls.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
+; CHECK: sqrshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
+ ret <1 x i16> %tmp1
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vqrshlu.v1i32(<1 x i32>, <1 x i32>)
+declare <1 x i32> @llvm.aarch64.neon.vqrshls.v1i32(<1 x i32>, <1 x i32>)
+
+define <1 x i32> @test_uqrshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
+; CHECK: test_uqrshl_v1i32_aarch64:
+ %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqrshlu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
+; CHECK: uqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+
+ ret <1 x i32> %tmp1
+}
+
+define <1 x i32> @test_sqrshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
+; CHECK: test_sqrshl_v1i32_aarch64:
+ %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqrshls.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
+; CHECK: sqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+ ret <1 x i32> %tmp1
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vqrshlu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vqrshls.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_uqrshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_uqrshl_v1i64_aarch64:
+ %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqrshlu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sqrshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sqrshl_v1i64_aarch64:
+ %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqrshls.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+ ret <1 x i64> %tmp1
+}
+
+
+
diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll b/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll
new file mode 100644
index 0000000..8fdea24
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll
@@ -0,0 +1,88 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
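+; Scalar saturating shifts: uqshl/sqshl should be selected on the b/h/s/d
+; registers, again covering both intrinsic spellings for the i64 case.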
+
+declare <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_uqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_uqshl_v1i64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sqshl_v1i64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+ ret <1 x i64> %tmp1
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vqshlu.v1i8(<1 x i8>, <1 x i8>)
+declare <1 x i8> @llvm.aarch64.neon.vqshls.v1i8(<1 x i8>, <1 x i8>)
+
+define <1 x i8> @test_uqshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
+; CHECK: test_uqshl_v1i8_aarch64:
+ %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqshlu.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
+; CHECK: uqshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
+ ret <1 x i8> %tmp1
+}
+
+define <1 x i8> @test_sqshl_v1i8_aarch64(<1 x i8> %lhs, <1 x i8> %rhs) {
+; CHECK: test_sqshl_v1i8_aarch64:
+ %tmp1 = call <1 x i8> @llvm.aarch64.neon.vqshls.v1i8(<1 x i8> %lhs, <1 x i8> %rhs)
+; CHECK: sqshl {{b[0-9]+}}, {{b[0-9]+}}, {{b[0-9]+}}
+ ret <1 x i8> %tmp1
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vqshlu.v1i16(<1 x i16>, <1 x i16>)
+declare <1 x i16> @llvm.aarch64.neon.vqshls.v1i16(<1 x i16>, <1 x i16>)
+
+define <1 x i16> @test_uqshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
+; CHECK: test_uqshl_v1i16_aarch64:
+ %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqshlu.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
+; CHECK: uqshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
+ ret <1 x i16> %tmp1
+}
+
+define <1 x i16> @test_sqshl_v1i16_aarch64(<1 x i16> %lhs, <1 x i16> %rhs) {
+; CHECK: test_sqshl_v1i16_aarch64:
+ %tmp1 = call <1 x i16> @llvm.aarch64.neon.vqshls.v1i16(<1 x i16> %lhs, <1 x i16> %rhs)
+; CHECK: sqshl {{h[0-9]+}}, {{h[0-9]+}}, {{h[0-9]+}}
+ ret <1 x i16> %tmp1
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vqshlu.v1i32(<1 x i32>, <1 x i32>)
+declare <1 x i32> @llvm.aarch64.neon.vqshls.v1i32(<1 x i32>, <1 x i32>)
+
+define <1 x i32> @test_uqshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
+; CHECK: test_uqshl_v1i32_aarch64:
+ %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqshlu.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
+; CHECK: uqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+ ret <1 x i32> %tmp1
+}
+
+define <1 x i32> @test_sqshl_v1i32_aarch64(<1 x i32> %lhs, <1 x i32> %rhs) {
+; CHECK: test_sqshl_v1i32_aarch64:
+ %tmp1 = call <1 x i32> @llvm.aarch64.neon.vqshls.v1i32(<1 x i32> %lhs, <1 x i32> %rhs)
+; CHECK: sqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+ ret <1 x i32> %tmp1
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vqshlu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vqshls.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_uqshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_uqshl_v1i64_aarch64:
+ %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqshlu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sqshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sqshl_v1i64_aarch64:
+ %tmp1 = call <1 x i64> @llvm.aarch64.neon.vqshls.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+ ret <1 x i64> %tmp1
+}
+
+
diff --git a/test/CodeGen/AArch64/neon-scalar-shift-imm.ll b/test/CodeGen/AArch64/neon-scalar-shift-imm.ll
new file mode 100644
index 0000000..6224361
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-shift-imm.ll
@@ -0,0 +1,531 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
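+; Scalar shift by immediate: each test wraps the scalar operand in a <1 x iN>
+; vector with insertelement, calls the corresponding AArch64 intrinsic with a
+; constant shift amount, and extracts lane 0. Covers sshr/ushr, srshr/urshr,
+; ssra/usra, srsra/ursra, shl, sqshl/uqshl, sqshlu, sri/sli and the narrowing
+; sqshrn/uqshrn, sqrshrn/uqrshrn, sqshrun and sqrshrun forms.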
+
+define i64 @test_vshrd_n_s64(i64 %a) {
+; CHECK: test_vshrd_n_s64
+; CHECK: sshr {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+ %vsshr = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vsshr1 = call <1 x i64> @llvm.aarch64.neon.vshrds.n(<1 x i64> %vsshr, i32 63)
+ %0 = extractelement <1 x i64> %vsshr1, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vshrds.n(<1 x i64>, i32)
+
+define i64 @test_vshrd_n_u64(i64 %a) {
+; CHECK: test_vshrd_n_u64
+; CHECK: ushr {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+ %vushr = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vushr1 = call <1 x i64> @llvm.aarch64.neon.vshrdu.n(<1 x i64> %vushr, i32 63)
+ %0 = extractelement <1 x i64> %vushr1, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vshrdu.n(<1 x i64>, i32)
+
+define i64 @test_vrshrd_n_s64(i64 %a) {
+; CHECK: test_vrshrd_n_s64
+; CHECK: srshr {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+ %vsrshr = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vsrshr1 = call <1 x i64> @llvm.aarch64.neon.vsrshr.v1i64(<1 x i64> %vsrshr, i32 63)
+ %0 = extractelement <1 x i64> %vsrshr1, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vsrshr.v1i64(<1 x i64>, i32)
+
+define i64 @test_vrshrd_n_u64(i64 %a) {
+; CHECK: test_vrshrd_n_u64
+; CHECK: urshr {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+ %vurshr = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vurshr1 = call <1 x i64> @llvm.aarch64.neon.vurshr.v1i64(<1 x i64> %vurshr, i32 63)
+ %0 = extractelement <1 x i64> %vurshr1, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vurshr.v1i64(<1 x i64>, i32)
+
+define i64 @test_vsrad_n_s64(i64 %a, i64 %b) {
+; CHECK: test_vsrad_n_s64
+; CHECK: ssra {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+ %vssra = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vssra1 = insertelement <1 x i64> undef, i64 %b, i32 0
+ %vssra2 = call <1 x i64> @llvm.aarch64.neon.vsrads.n(<1 x i64> %vssra, <1 x i64> %vssra1, i32 63)
+ %0 = extractelement <1 x i64> %vssra2, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vsrads.n(<1 x i64>, <1 x i64>, i32)
+
+define i64 @test_vsrad_n_u64(i64 %a, i64 %b) {
+; CHECK: test_vsrad_n_u64
+; CHECK: usra {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+ %vusra = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vusra1 = insertelement <1 x i64> undef, i64 %b, i32 0
+ %vusra2 = call <1 x i64> @llvm.aarch64.neon.vsradu.n(<1 x i64> %vusra, <1 x i64> %vusra1, i32 63)
+ %0 = extractelement <1 x i64> %vusra2, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vsradu.n(<1 x i64>, <1 x i64>, i32)
+
+define i64 @test_vrsrad_n_s64(i64 %a, i64 %b) {
+; CHECK: test_vrsrad_n_s64
+; CHECK: srsra {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+ %vsrsra = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vsrsra1 = insertelement <1 x i64> undef, i64 %b, i32 0
+ %vsrsra2 = call <1 x i64> @llvm.aarch64.neon.vrsrads.n(<1 x i64> %vsrsra, <1 x i64> %vsrsra1, i32 63)
+ %0 = extractelement <1 x i64> %vsrsra2, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vrsrads.n(<1 x i64>, <1 x i64>, i32)
+
+define i64 @test_vrsrad_n_u64(i64 %a, i64 %b) {
+; CHECK: test_vrsrad_n_u64
+; CHECK: ursra {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+ %vursra = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vursra1 = insertelement <1 x i64> undef, i64 %b, i32 0
+ %vursra2 = call <1 x i64> @llvm.aarch64.neon.vrsradu.n(<1 x i64> %vursra, <1 x i64> %vursra1, i32 63)
+ %0 = extractelement <1 x i64> %vursra2, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vrsradu.n(<1 x i64>, <1 x i64>, i32)
+
+define i64 @test_vshld_n_s64(i64 %a) {
+; CHECK: test_vshld_n_s64
+; CHECK: shl {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+ %vshl = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vshl1 = call <1 x i64> @llvm.aarch64.neon.vshld.n(<1 x i64> %vshl, i32 63)
+ %0 = extractelement <1 x i64> %vshl1, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vshld.n(<1 x i64>, i32)
+
+define i64 @test_vshld_n_u64(i64 %a) {
+; CHECK: test_vshld_n_u64
+; CHECK: shl {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+ %vshl = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vshl1 = call <1 x i64> @llvm.aarch64.neon.vshld.n(<1 x i64> %vshl, i32 63)
+ %0 = extractelement <1 x i64> %vshl1, i32 0
+ ret i64 %0
+}
+
+define i8 @test_vqshlb_n_s8(i8 %a) {
+; CHECK: test_vqshlb_n_s8
+; CHECK: sqshl {{b[0-9]+}}, {{b[0-9]+}}, #7
+entry:
+ %vsqshl = insertelement <1 x i8> undef, i8 %a, i32 0
+ %vsqshl1 = call <1 x i8> @llvm.aarch64.neon.vqshls.n.v1i8(<1 x i8> %vsqshl, i32 7)
+ %0 = extractelement <1 x i8> %vsqshl1, i32 0
+ ret i8 %0
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vqshls.n.v1i8(<1 x i8>, i32)
+
+define i16 @test_vqshlh_n_s16(i16 %a) {
+; CHECK: test_vqshlh_n_s16
+; CHECK: sqshl {{h[0-9]+}}, {{h[0-9]+}}, #15
+entry:
+ %vsqshl = insertelement <1 x i16> undef, i16 %a, i32 0
+ %vsqshl1 = call <1 x i16> @llvm.aarch64.neon.vqshls.n.v1i16(<1 x i16> %vsqshl, i32 15)
+ %0 = extractelement <1 x i16> %vsqshl1, i32 0
+ ret i16 %0
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vqshls.n.v1i16(<1 x i16>, i32)
+
+define i32 @test_vqshls_n_s32(i32 %a) {
+; CHECK: test_vqshls_n_s32
+; CHECK: sqshl {{s[0-9]+}}, {{s[0-9]+}}, #31
+entry:
+ %vsqshl = insertelement <1 x i32> undef, i32 %a, i32 0
+ %vsqshl1 = call <1 x i32> @llvm.aarch64.neon.vqshls.n.v1i32(<1 x i32> %vsqshl, i32 31)
+ %0 = extractelement <1 x i32> %vsqshl1, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vqshls.n.v1i32(<1 x i32>, i32)
+
+define i64 @test_vqshld_n_s64(i64 %a) {
+; CHECK: test_vqshld_n_s64
+; CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+ %vsqshl = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vsqshl1 = call <1 x i64> @llvm.aarch64.neon.vqshls.n.v1i64(<1 x i64> %vsqshl, i32 63)
+ %0 = extractelement <1 x i64> %vsqshl1, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vqshls.n.v1i64(<1 x i64>, i32)
+
+define i8 @test_vqshlb_n_u8(i8 %a) {
+; CHECK: test_vqshlb_n_u8
+; CHECK: uqshl {{b[0-9]+}}, {{b[0-9]+}}, #7
+entry:
+ %vuqshl = insertelement <1 x i8> undef, i8 %a, i32 0
+ %vuqshl1 = call <1 x i8> @llvm.aarch64.neon.vqshlu.n.v1i8(<1 x i8> %vuqshl, i32 7)
+ %0 = extractelement <1 x i8> %vuqshl1, i32 0
+ ret i8 %0
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vqshlu.n.v1i8(<1 x i8>, i32)
+
+define i16 @test_vqshlh_n_u16(i16 %a) {
+; CHECK: test_vqshlh_n_u16
+; CHECK: uqshl {{h[0-9]+}}, {{h[0-9]+}}, #15
+entry:
+ %vuqshl = insertelement <1 x i16> undef, i16 %a, i32 0
+ %vuqshl1 = call <1 x i16> @llvm.aarch64.neon.vqshlu.n.v1i16(<1 x i16> %vuqshl, i32 15)
+ %0 = extractelement <1 x i16> %vuqshl1, i32 0
+ ret i16 %0
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vqshlu.n.v1i16(<1 x i16>, i32)
+
+define i32 @test_vqshls_n_u32(i32 %a) {
+; CHECK: test_vqshls_n_u32
+; CHECK: uqshl {{s[0-9]+}}, {{s[0-9]+}}, #31
+entry:
+ %vuqshl = insertelement <1 x i32> undef, i32 %a, i32 0
+ %vuqshl1 = call <1 x i32> @llvm.aarch64.neon.vqshlu.n.v1i32(<1 x i32> %vuqshl, i32 31)
+ %0 = extractelement <1 x i32> %vuqshl1, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vqshlu.n.v1i32(<1 x i32>, i32)
+
+define i64 @test_vqshld_n_u64(i64 %a) {
+; CHECK: test_vqshld_n_u64
+; CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+ %vuqshl = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vuqshl1 = call <1 x i64> @llvm.aarch64.neon.vqshlu.n.v1i64(<1 x i64> %vuqshl, i32 63)
+ %0 = extractelement <1 x i64> %vuqshl1, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vqshlu.n.v1i64(<1 x i64>, i32)
+
+define i8 @test_vqshlub_n_s8(i8 %a) {
+; CHECK: test_vqshlub_n_s8
+; CHECK: sqshlu {{b[0-9]+}}, {{b[0-9]+}}, #7
+entry:
+ %vsqshlu = insertelement <1 x i8> undef, i8 %a, i32 0
+ %vsqshlu1 = call <1 x i8> @llvm.aarch64.neon.vsqshlu.v1i8(<1 x i8> %vsqshlu, i32 7)
+ %0 = extractelement <1 x i8> %vsqshlu1, i32 0
+ ret i8 %0
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vsqshlu.v1i8(<1 x i8>, i32)
+
+define i16 @test_vqshluh_n_s16(i16 %a) {
+; CHECK: test_vqshluh_n_s16
+; CHECK: sqshlu {{h[0-9]+}}, {{h[0-9]+}}, #15
+entry:
+ %vsqshlu = insertelement <1 x i16> undef, i16 %a, i32 0
+ %vsqshlu1 = call <1 x i16> @llvm.aarch64.neon.vsqshlu.v1i16(<1 x i16> %vsqshlu, i32 15)
+ %0 = extractelement <1 x i16> %vsqshlu1, i32 0
+ ret i16 %0
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vsqshlu.v1i16(<1 x i16>, i32)
+
+define i32 @test_vqshlus_n_s32(i32 %a) {
+; CHECK: test_vqshlus_n_s32
+; CHECK: sqshlu {{s[0-9]+}}, {{s[0-9]+}}, #31
+entry:
+ %vsqshlu = insertelement <1 x i32> undef, i32 %a, i32 0
+ %vsqshlu1 = call <1 x i32> @llvm.aarch64.neon.vsqshlu.v1i32(<1 x i32> %vsqshlu, i32 31)
+ %0 = extractelement <1 x i32> %vsqshlu1, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vsqshlu.v1i32(<1 x i32>, i32)
+
+define i64 @test_vqshlud_n_s64(i64 %a) {
+; CHECK: test_vqshlud_n_s64
+; CHECK: sqshlu {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+ %vsqshlu = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vsqshlu1 = call <1 x i64> @llvm.aarch64.neon.vsqshlu.v1i64(<1 x i64> %vsqshlu, i32 63)
+ %0 = extractelement <1 x i64> %vsqshlu1, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vsqshlu.v1i64(<1 x i64>, i32)
+
+define i64 @test_vsrid_n_s64(i64 %a, i64 %b) {
+; CHECK: test_vsrid_n_s64
+; CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+ %vsri = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vsri1 = insertelement <1 x i64> undef, i64 %b, i32 0
+ %vsri2 = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> %vsri, <1 x i64> %vsri1, i32 63)
+ %0 = extractelement <1 x i64> %vsri2, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64>, <1 x i64>, i32)
+
+define i64 @test_vsrid_n_u64(i64 %a, i64 %b) {
+; CHECK: test_vsrid_n_u64
+; CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+ %vsri = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vsri1 = insertelement <1 x i64> undef, i64 %b, i32 0
+ %vsri2 = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> %vsri, <1 x i64> %vsri1, i32 63)
+ %0 = extractelement <1 x i64> %vsri2, i32 0
+ ret i64 %0
+}
+
+define i64 @test_vslid_n_s64(i64 %a, i64 %b) {
+; CHECK: test_vslid_n_s64
+; CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+ %vsli = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vsli1 = insertelement <1 x i64> undef, i64 %b, i32 0
+ %vsli2 = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> %vsli, <1 x i64> %vsli1, i32 63)
+ %0 = extractelement <1 x i64> %vsli2, i32 0
+ ret i64 %0
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64>, <1 x i64>, i32)
+
+define i64 @test_vslid_n_u64(i64 %a, i64 %b) {
+; CHECK: test_vslid_n_u64
+; CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #63
+entry:
+ %vsli = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vsli1 = insertelement <1 x i64> undef, i64 %b, i32 0
+ %vsli2 = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> %vsli, <1 x i64> %vsli1, i32 63)
+ %0 = extractelement <1 x i64> %vsli2, i32 0
+ ret i64 %0
+}
+
+define i8 @test_vqshrnh_n_s16(i16 %a) {
+; CHECK: test_vqshrnh_n_s16
+; CHECK: sqshrn {{b[0-9]+}}, {{h[0-9]+}}, #8
+entry:
+ %vsqshrn = insertelement <1 x i16> undef, i16 %a, i32 0
+ %vsqshrn1 = call <1 x i8> @llvm.aarch64.neon.vsqshrn.v1i8(<1 x i16> %vsqshrn, i32 8)
+ %0 = extractelement <1 x i8> %vsqshrn1, i32 0
+ ret i8 %0
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vsqshrn.v1i8(<1 x i16>, i32)
+
+define i16 @test_vqshrns_n_s32(i32 %a) {
+; CHECK: test_vqshrns_n_s32
+; CHECK: sqshrn {{h[0-9]+}}, {{s[0-9]+}}, #16
+entry:
+ %vsqshrn = insertelement <1 x i32> undef, i32 %a, i32 0
+ %vsqshrn1 = call <1 x i16> @llvm.aarch64.neon.vsqshrn.v1i16(<1 x i32> %vsqshrn, i32 16)
+ %0 = extractelement <1 x i16> %vsqshrn1, i32 0
+ ret i16 %0
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vsqshrn.v1i16(<1 x i32>, i32)
+
+define i32 @test_vqshrnd_n_s64(i64 %a) {
+; CHECK: test_vqshrnd_n_s64
+; CHECK: sqshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
+entry:
+ %vsqshrn = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vsqshrn1 = call <1 x i32> @llvm.aarch64.neon.vsqshrn.v1i32(<1 x i64> %vsqshrn, i32 32)
+ %0 = extractelement <1 x i32> %vsqshrn1, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vsqshrn.v1i32(<1 x i64>, i32)
+
+define i8 @test_vqshrnh_n_u16(i16 %a) {
+; CHECK: test_vqshrnh_n_u16
+; CHECK: uqshrn {{b[0-9]+}}, {{h[0-9]+}}, #8
+entry:
+ %vuqshrn = insertelement <1 x i16> undef, i16 %a, i32 0
+ %vuqshrn1 = call <1 x i8> @llvm.aarch64.neon.vuqshrn.v1i8(<1 x i16> %vuqshrn, i32 8)
+ %0 = extractelement <1 x i8> %vuqshrn1, i32 0
+ ret i8 %0
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vuqshrn.v1i8(<1 x i16>, i32)
+
+define i16 @test_vqshrns_n_u32(i32 %a) {
+; CHECK: test_vqshrns_n_u32
+; CHECK: uqshrn {{h[0-9]+}}, {{s[0-9]+}}, #16
+entry:
+ %vuqshrn = insertelement <1 x i32> undef, i32 %a, i32 0
+ %vuqshrn1 = call <1 x i16> @llvm.aarch64.neon.vuqshrn.v1i16(<1 x i32> %vuqshrn, i32 16)
+ %0 = extractelement <1 x i16> %vuqshrn1, i32 0
+ ret i16 %0
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vuqshrn.v1i16(<1 x i32>, i32)
+
+define i32 @test_vqshrnd_n_u64(i64 %a) {
+; CHECK: test_vqshrnd_n_u64
+; CHECK: uqshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
+entry:
+ %vuqshrn = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vuqshrn1 = call <1 x i32> @llvm.aarch64.neon.vuqshrn.v1i32(<1 x i64> %vuqshrn, i32 32)
+ %0 = extractelement <1 x i32> %vuqshrn1, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vuqshrn.v1i32(<1 x i64>, i32)
+
+define i8 @test_vqrshrnh_n_s16(i16 %a) {
+; CHECK: test_vqrshrnh_n_s16
+; CHECK: sqrshrn {{b[0-9]+}}, {{h[0-9]+}}, #8
+entry:
+ %vsqrshrn = insertelement <1 x i16> undef, i16 %a, i32 0
+ %vsqrshrn1 = call <1 x i8> @llvm.aarch64.neon.vsqrshrn.v1i8(<1 x i16> %vsqrshrn, i32 8)
+ %0 = extractelement <1 x i8> %vsqrshrn1, i32 0
+ ret i8 %0
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vsqrshrn.v1i8(<1 x i16>, i32)
+
+define i16 @test_vqrshrns_n_s32(i32 %a) {
+; CHECK: test_vqrshrns_n_s32
+; CHECK: sqrshrn {{h[0-9]+}}, {{s[0-9]+}}, #16
+entry:
+ %vsqrshrn = insertelement <1 x i32> undef, i32 %a, i32 0
+ %vsqrshrn1 = call <1 x i16> @llvm.aarch64.neon.vsqrshrn.v1i16(<1 x i32> %vsqrshrn, i32 16)
+ %0 = extractelement <1 x i16> %vsqrshrn1, i32 0
+ ret i16 %0
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vsqrshrn.v1i16(<1 x i32>, i32)
+
+define i32 @test_vqrshrnd_n_s64(i64 %a) {
+; CHECK: test_vqrshrnd_n_s64
+; CHECK: sqrshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
+entry:
+ %vsqrshrn = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vsqrshrn1 = call <1 x i32> @llvm.aarch64.neon.vsqrshrn.v1i32(<1 x i64> %vsqrshrn, i32 32)
+ %0 = extractelement <1 x i32> %vsqrshrn1, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vsqrshrn.v1i32(<1 x i64>, i32)
+
+define i8 @test_vqrshrnh_n_u16(i16 %a) {
+; CHECK: test_vqrshrnh_n_u16
+; CHECK: uqrshrn {{b[0-9]+}}, {{h[0-9]+}}, #8
+entry:
+ %vuqrshrn = insertelement <1 x i16> undef, i16 %a, i32 0
+ %vuqrshrn1 = call <1 x i8> @llvm.aarch64.neon.vuqrshrn.v1i8(<1 x i16> %vuqrshrn, i32 8)
+ %0 = extractelement <1 x i8> %vuqrshrn1, i32 0
+ ret i8 %0
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vuqrshrn.v1i8(<1 x i16>, i32)
+
+define i16 @test_vqrshrns_n_u32(i32 %a) {
+; CHECK: test_vqrshrns_n_u32
+; CHECK: uqrshrn {{h[0-9]+}}, {{s[0-9]+}}, #16
+entry:
+ %vuqrshrn = insertelement <1 x i32> undef, i32 %a, i32 0
+ %vuqrshrn1 = call <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32> %vuqrshrn, i32 16)
+ %0 = extractelement <1 x i16> %vuqrshrn1, i32 0
+ ret i16 %0
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vuqrshrn.v1i16(<1 x i32>, i32)
+
+define i32 @test_vqrshrnd_n_u64(i64 %a) {
+; CHECK: test_vqrshrnd_n_u64
+; CHECK: uqrshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
+entry:
+ %vuqrshrn = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vuqrshrn1 = call <1 x i32> @llvm.aarch64.neon.vuqrshrn.v1i32(<1 x i64> %vuqrshrn, i32 32)
+ %0 = extractelement <1 x i32> %vuqrshrn1, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vuqrshrn.v1i32(<1 x i64>, i32)
+
+define i8 @test_vqshrunh_n_s16(i16 %a) {
+; CHECK: test_vqshrunh_n_s16
+; CHECK: sqshrun {{b[0-9]+}}, {{h[0-9]+}}, #8
+entry:
+ %vsqshrun = insertelement <1 x i16> undef, i16 %a, i32 0
+ %vsqshrun1 = call <1 x i8> @llvm.aarch64.neon.vsqshrun.v1i8(<1 x i16> %vsqshrun, i32 8)
+ %0 = extractelement <1 x i8> %vsqshrun1, i32 0
+ ret i8 %0
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vsqshrun.v1i8(<1 x i16>, i32)
+
+define i16 @test_vqshruns_n_s32(i32 %a) {
+; CHECK: test_vqshruns_n_s32
+; CHECK: sqshrun {{h[0-9]+}}, {{s[0-9]+}}, #16
+entry:
+ %vsqshrun = insertelement <1 x i32> undef, i32 %a, i32 0
+ %vsqshrun1 = call <1 x i16> @llvm.aarch64.neon.vsqshrun.v1i16(<1 x i32> %vsqshrun, i32 16)
+ %0 = extractelement <1 x i16> %vsqshrun1, i32 0
+ ret i16 %0
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vsqshrun.v1i16(<1 x i32>, i32)
+
+define i32 @test_vqshrund_n_s64(i64 %a) {
+; CHECK: test_vqshrund_n_s64
+; CHECK: sqshrun {{s[0-9]+}}, {{d[0-9]+}}, #32
+entry:
+ %vsqshrun = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vsqshrun1 = call <1 x i32> @llvm.aarch64.neon.vsqshrun.v1i32(<1 x i64> %vsqshrun, i32 32)
+ %0 = extractelement <1 x i32> %vsqshrun1, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vsqshrun.v1i32(<1 x i64>, i32)
+
+define i8 @test_vqrshrunh_n_s16(i16 %a) {
+; CHECK: test_vqrshrunh_n_s16
+; CHECK: sqrshrun {{b[0-9]+}}, {{h[0-9]+}}, #8
+entry:
+ %vsqrshrun = insertelement <1 x i16> undef, i16 %a, i32 0
+ %vsqrshrun1 = call <1 x i8> @llvm.aarch64.neon.vsqrshrun.v1i8(<1 x i16> %vsqrshrun, i32 8)
+ %0 = extractelement <1 x i8> %vsqrshrun1, i32 0
+ ret i8 %0
+}
+
+declare <1 x i8> @llvm.aarch64.neon.vsqrshrun.v1i8(<1 x i16>, i32)
+
+define i16 @test_vqrshruns_n_s32(i32 %a) {
+; CHECK: test_vqrshruns_n_s32
+; CHECK: sqrshrun {{h[0-9]+}}, {{s[0-9]+}}, #16
+entry:
+ %vsqrshrun = insertelement <1 x i32> undef, i32 %a, i32 0
+ %vsqrshrun1 = call <1 x i16> @llvm.aarch64.neon.vsqrshrun.v1i16(<1 x i32> %vsqrshrun, i32 16)
+ %0 = extractelement <1 x i16> %vsqrshrun1, i32 0
+ ret i16 %0
+}
+
+declare <1 x i16> @llvm.aarch64.neon.vsqrshrun.v1i16(<1 x i32>, i32)
+
+define i32 @test_vqrshrund_n_s64(i64 %a) {
+; CHECK: test_vqrshrund_n_s64
+; CHECK: sqrshrun {{s[0-9]+}}, {{d[0-9]+}}, #32
+entry:
+ %vsqrshrun = insertelement <1 x i64> undef, i64 %a, i32 0
+ %vsqrshrun1 = call <1 x i32> @llvm.aarch64.neon.vsqrshrun.v1i32(<1 x i64> %vsqrshrun, i32 32)
+ %0 = extractelement <1 x i32> %vsqrshrun1, i32 0
+ ret i32 %0
+}
+
+declare <1 x i32> @llvm.aarch64.neon.vsqrshrun.v1i32(<1 x i64>, i32)
diff --git a/test/CodeGen/AArch64/neon-scalar-shift.ll b/test/CodeGen/AArch64/neon-scalar-shift.ll
new file mode 100644
index 0000000..1222be5
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-scalar-shift.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
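+; Scalar register shifts: ushl/sshl on <1 x i64> operands, via both the
+; llvm.arm.neon and llvm.aarch64.neon spellings of the intrinsic.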
+
+declare <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_ushl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_ushl_v1i64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sshl_v1i64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+ ret <1 x i64> %tmp1
+}
+
+declare <1 x i64> @llvm.aarch64.neon.vshldu(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.aarch64.neon.vshlds(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_ushl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_ushl_v1i64_aarch64:
+ %tmp1 = call <1 x i64> @llvm.aarch64.neon.vshldu(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sshl_v1i64_aarch64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sshl_v1i64_aarch64:
+ %tmp1 = call <1 x i64> @llvm.aarch64.neon.vshlds(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+ ret <1 x i64> %tmp1
+}
+
+
diff --git a/test/CodeGen/AArch64/neon-shift-left-long.ll b/test/CodeGen/AArch64/neon-shift-left-long.ll
new file mode 100644
index 0000000..d45c476
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-shift-left-long.ll
@@ -0,0 +1,193 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
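+; Shift-left-long: a sext/zext followed by a constant vector shl should
+; select sshll/ushll, the high-half shuffle variants should select
+; sshll2/ushll2, and a bare extension should still use the #0 form.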
+
+define <8 x i16> @test_sshll_v8i8(<8 x i8> %a) {
+; CHECK: test_sshll_v8i8:
+; CHECK: sshll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #3
+ %1 = sext <8 x i8> %a to <8 x i16>
+ %tmp = shl <8 x i16> %1, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ ret <8 x i16> %tmp
+}
+
+define <4 x i32> @test_sshll_v4i16(<4 x i16> %a) {
+; CHECK: test_sshll_v4i16:
+; CHECK: sshll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #9
+ %1 = sext <4 x i16> %a to <4 x i32>
+ %tmp = shl <4 x i32> %1, <i32 9, i32 9, i32 9, i32 9>
+ ret <4 x i32> %tmp
+}
+
+define <2 x i64> @test_sshll_v2i32(<2 x i32> %a) {
+; CHECK: test_sshll_v2i32:
+; CHECK: sshll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #19
+ %1 = sext <2 x i32> %a to <2 x i64>
+ %tmp = shl <2 x i64> %1, <i64 19, i64 19>
+ ret <2 x i64> %tmp
+}
+
+define <8 x i16> @test_ushll_v8i8(<8 x i8> %a) {
+; CHECK: test_ushll_v8i8:
+; CHECK: ushll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #3
+ %1 = zext <8 x i8> %a to <8 x i16>
+ %tmp = shl <8 x i16> %1, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ ret <8 x i16> %tmp
+}
+
+define <4 x i32> @test_ushll_v4i16(<4 x i16> %a) {
+; CHECK: test_ushll_v4i16:
+; CHECK: ushll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #9
+ %1 = zext <4 x i16> %a to <4 x i32>
+ %tmp = shl <4 x i32> %1, <i32 9, i32 9, i32 9, i32 9>
+ ret <4 x i32> %tmp
+}
+
+define <2 x i64> @test_ushll_v2i32(<2 x i32> %a) {
+; CHECK: test_ushll_v2i32:
+; CHECK: ushll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #19
+ %1 = zext <2 x i32> %a to <2 x i64>
+ %tmp = shl <2 x i64> %1, <i64 19, i64 19>
+ ret <2 x i64> %tmp
+}
+
+define <8 x i16> @test_sshll2_v16i8(<16 x i8> %a) {
+; CHECK: test_sshll2_v16i8:
+; CHECK: sshll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #3
+ %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %2 = sext <8 x i8> %1 to <8 x i16>
+ %tmp = shl <8 x i16> %2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ ret <8 x i16> %tmp
+}
+
+define <4 x i32> @test_sshll2_v8i16(<8 x i16> %a) {
+; CHECK: test_sshll2_v8i16:
+; CHECK: sshll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #9
+ %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %tmp = shl <4 x i32> %2, <i32 9, i32 9, i32 9, i32 9>
+ ret <4 x i32> %tmp
+}
+
+define <2 x i64> @test_sshll2_v4i32(<4 x i32> %a) {
+; CHECK: test_sshll2_v4i32:
+; CHECK: sshll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #19
+ %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %2 = sext <2 x i32> %1 to <2 x i64>
+ %tmp = shl <2 x i64> %2, <i64 19, i64 19>
+ ret <2 x i64> %tmp
+}
+
+define <8 x i16> @test_ushll2_v16i8(<16 x i8> %a) {
+; CHECK: test_ushll2_v16i8:
+; CHECK: ushll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #3
+ %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %2 = zext <8 x i8> %1 to <8 x i16>
+ %tmp = shl <8 x i16> %2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ ret <8 x i16> %tmp
+}
+
+define <4 x i32> @test_ushll2_v8i16(<8 x i16> %a) {
+; CHECK: test_ushll2_v8i16:
+; CHECK: ushll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #9
+ %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %tmp = shl <4 x i32> %2, <i32 9, i32 9, i32 9, i32 9>
+ ret <4 x i32> %tmp
+}
+
+define <2 x i64> @test_ushll2_v4i32(<4 x i32> %a) {
+; CHECK: test_ushll2_v4i32:
+; CHECK: ushll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #19
+ %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %2 = zext <2 x i32> %1 to <2 x i64>
+ %tmp = shl <2 x i64> %2, <i64 19, i64 19>
+ ret <2 x i64> %tmp
+}
+
+define <8 x i16> @test_sshll_shl0_v8i8(<8 x i8> %a) {
+; CHECK: test_sshll_shl0_v8i8:
+; CHECK: sshll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #0
+ %tmp = sext <8 x i8> %a to <8 x i16>
+ ret <8 x i16> %tmp
+}
+
+define <4 x i32> @test_sshll_shl0_v4i16(<4 x i16> %a) {
+; CHECK: test_sshll_shl0_v4i16:
+; CHECK: sshll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #0
+ %tmp = sext <4 x i16> %a to <4 x i32>
+ ret <4 x i32> %tmp
+}
+
+define <2 x i64> @test_sshll_shl0_v2i32(<2 x i32> %a) {
+; CHECK: test_sshll_shl0_v2i32:
+; CHECK: sshll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #0
+ %tmp = sext <2 x i32> %a to <2 x i64>
+ ret <2 x i64> %tmp
+}
+
+define <8 x i16> @test_ushll_shl0_v8i8(<8 x i8> %a) {
+; CHECK: test_ushll_shl0_v8i8:
+; CHECK: ushll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #0
+ %tmp = zext <8 x i8> %a to <8 x i16>
+ ret <8 x i16> %tmp
+}
+
+define <4 x i32> @test_ushll_shl0_v4i16(<4 x i16> %a) {
+; CHECK: test_ushll_shl0_v4i16:
+; CHECK: ushll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #0
+ %tmp = zext <4 x i16> %a to <4 x i32>
+ ret <4 x i32> %tmp
+}
+
+define <2 x i64> @test_ushll_shl0_v2i32(<2 x i32> %a) {
+; CHECK: test_ushll_shl0_v2i32:
+; CHECK: ushll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #0
+ %tmp = zext <2 x i32> %a to <2 x i64>
+ ret <2 x i64> %tmp
+}
+
+define <8 x i16> @test_sshll2_shl0_v16i8(<16 x i8> %a) {
+; CHECK: test_sshll2_shl0_v16i8:
+; CHECK: sshll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #0
+ %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %tmp = sext <8 x i8> %1 to <8 x i16>
+ ret <8 x i16> %tmp
+}
+
+define <4 x i32> @test_sshll2_shl0_v8i16(<8 x i16> %a) {
+; CHECK: test_sshll2_shl0_v8i16:
+; CHECK: sshll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #0
+ %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp = sext <4 x i16> %1 to <4 x i32>
+ ret <4 x i32> %tmp
+}
+
+define <2 x i64> @test_sshll2_shl0_v4i32(<4 x i32> %a) {
+; CHECK: test_sshll2_shl0_v4i32:
+; CHECK: sshll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #0
+ %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp = sext <2 x i32> %1 to <2 x i64>
+ ret <2 x i64> %tmp
+}
+
+define <8 x i16> @test_ushll2_shl0_v16i8(<16 x i8> %a) {
+; CHECK: test_ushll2_shl0_v16i8:
+; CHECK: ushll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #0
+ %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %tmp = zext <8 x i8> %1 to <8 x i16>
+ ret <8 x i16> %tmp
+}
+
+define <4 x i32> @test_ushll2_shl0_v8i16(<8 x i16> %a) {
+; CHECK: test_ushll2_shl0_v8i16:
+; CHECK: ushll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #0
+ %1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %tmp = zext <4 x i16> %1 to <4 x i32>
+ ret <4 x i32> %tmp
+}
+
+define <2 x i64> @test_ushll2_shl0_v4i32(<4 x i32> %a) {
+; CHECK: test_ushll2_shl0_v4i32:
+; CHECK: ushll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #0
+ %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ %tmp = zext <2 x i32> %1 to <2 x i64>
+ ret <2 x i64> %tmp
+}
diff --git a/test/CodeGen/AArch64/neon-shift.ll b/test/CodeGen/AArch64/neon-shift.ll
index 45a2605..33b04ce 100644
--- a/test/CodeGen/AArch64/neon-shift.ll
+++ b/test/CodeGen/AArch64/neon-shift.ll
@@ -102,23 +102,6 @@ define <4 x i32> @test_sshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
ret <4 x i32> %tmp1
}
-declare <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64>, <1 x i64>)
-declare <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64>, <1 x i64>)
-
-define <1 x i64> @test_ushl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_ushl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: ushl d0, d0, d1
- ret <1 x i64> %tmp1
-}
-
-define <1 x i64> @test_sshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
-; CHECK: test_sshl_v1i64:
- %tmp1 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
-; CHECK: sshl d0, d0, d1
- ret <1 x i64> %tmp1
-}
-
declare <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64>, <2 x i64>)
declare <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64>, <2 x i64>)
@@ -137,4 +120,52 @@ define <2 x i64> @test_sshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
}
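+
+; A vector shift left by a constant should select the shl instruction with
+; the immediate encoded directly.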
+define <8 x i8> @test_shl_v8i8(<8 x i8> %a) {
+; CHECK: test_shl_v8i8:
+; CHECK: shl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %tmp = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ ret <8 x i8> %tmp
+}
+
+define <4 x i16> @test_shl_v4i16(<4 x i16> %a) {
+; CHECK: test_shl_v4i16:
+; CHECK: shl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+ %tmp = shl <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
+ ret <4 x i16> %tmp
+}
+
+define <2 x i32> @test_shl_v2i32(<2 x i32> %a) {
+; CHECK: test_shl_v2i32:
+; CHECK: shl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+ %tmp = shl <2 x i32> %a, <i32 3, i32 3>
+ ret <2 x i32> %tmp
+}
+
+define <16 x i8> @test_shl_v16i8(<16 x i8> %a) {
+; CHECK: test_shl_v16i8:
+; CHECK: shl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %tmp = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ ret <16 x i8> %tmp
+}
+
+define <8 x i16> @test_shl_v8i16(<8 x i16> %a) {
+; CHECK: test_shl_v8i16:
+; CHECK: shl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+ %tmp = shl <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ ret <8 x i16> %tmp
+}
+
+define <4 x i32> @test_shl_v4i32(<4 x i32> %a) {
+; CHECK: test_shl_v4i32:
+; CHECK: shl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+ %tmp = shl <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
+ ret <4 x i32> %tmp
+}
+
+define <2 x i64> @test_shl_v2i64(<2 x i64> %a) {
+; CHECK: test_shl_v2i64:
+; CHECK: shl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #63
+ %tmp = shl <2 x i64> %a, <i64 63, i64 63>
+ ret <2 x i64> %tmp
+}
diff --git a/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll b/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll
new file mode 100644
index 0000000..d5557c0
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll
@@ -0,0 +1,2314 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
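+; Multi-element loads and stores: plain vector load/store should select
+; ld1/st1 of the whole register, and the llvm.arm.neon.vld2-style structure
+; intrinsics should select the corresponding ldN forms, with the results
+; repacked into the NEON struct types declared below.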
+
+define void @test_ldst1_v16i8(<16 x i8>* %ptr, <16 x i8>* %ptr2) {
+; CHECK-LABEL: test_ldst1_v16i8:
+; CHECK: ld1 {v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
+ %tmp = load <16 x i8>* %ptr
+ store <16 x i8> %tmp, <16 x i8>* %ptr2
+ ret void
+}
+
+define void @test_ldst1_v8i16(<8 x i16>* %ptr, <8 x i16>* %ptr2) {
+; CHECK-LABEL: test_ldst1_v8i16:
+; CHECK: ld1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
+ %tmp = load <8 x i16>* %ptr
+ store <8 x i16> %tmp, <8 x i16>* %ptr2
+ ret void
+}
+
+define void @test_ldst1_v4i32(<4 x i32>* %ptr, <4 x i32>* %ptr2) {
+; CHECK-LABEL: test_ldst1_v4i32:
+; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
+ %tmp = load <4 x i32>* %ptr
+ store <4 x i32> %tmp, <4 x i32>* %ptr2
+ ret void
+}
+
+define void @test_ldst1_v2i64(<2 x i64>* %ptr, <2 x i64>* %ptr2) {
+; CHECK-LABEL: test_ldst1_v2i64:
+; CHECK: ld1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
+ %tmp = load <2 x i64>* %ptr
+ store <2 x i64> %tmp, <2 x i64>* %ptr2
+ ret void
+}
+
+define void @test_ldst1_v8i8(<8 x i8>* %ptr, <8 x i8>* %ptr2) {
+; CHECK-LABEL: test_ldst1_v8i8:
+; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
+ %tmp = load <8 x i8>* %ptr
+ store <8 x i8> %tmp, <8 x i8>* %ptr2
+ ret void
+}
+
+define void @test_ldst1_v4i16(<4 x i16>* %ptr, <4 x i16>* %ptr2) {
+; CHECK-LABEL: test_ldst1_v4i16:
+; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
+ %tmp = load <4 x i16>* %ptr
+ store <4 x i16> %tmp, <4 x i16>* %ptr2
+ ret void
+}
+
+define void @test_ldst1_v2i32(<2 x i32>* %ptr, <2 x i32>* %ptr2) {
+; CHECK-LABEL: test_ldst1_v2i32:
+; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
+ %tmp = load <2 x i32>* %ptr
+ store <2 x i32> %tmp, <2 x i32>* %ptr2
+ ret void
+}
+
+define void @test_ldst1_v1i64(<1 x i64>* %ptr, <1 x i64>* %ptr2) {
+; CHECK-LABEL: test_ldst1_v1i64:
+; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
+; CHECK: st1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
+ %tmp = load <1 x i64>* %ptr
+ store <1 x i64> %tmp, <1 x i64>* %ptr2
+ ret void
+}
+
+%struct.int8x16x2_t = type { [2 x <16 x i8>] }
+%struct.int16x8x2_t = type { [2 x <8 x i16>] }
+%struct.int32x4x2_t = type { [2 x <4 x i32>] }
+%struct.int64x2x2_t = type { [2 x <2 x i64>] }
+%struct.float32x4x2_t = type { [2 x <4 x float>] }
+%struct.float64x2x2_t = type { [2 x <2 x double>] }
+%struct.int8x8x2_t = type { [2 x <8 x i8>] }
+%struct.int16x4x2_t = type { [2 x <4 x i16>] }
+%struct.int32x2x2_t = type { [2 x <2 x i32>] }
+%struct.int64x1x2_t = type { [2 x <1 x i64>] }
+%struct.float32x2x2_t = type { [2 x <2 x float>] }
+%struct.float64x1x2_t = type { [2 x <1 x double>] }
+%struct.int8x16x3_t = type { [3 x <16 x i8>] }
+%struct.int16x8x3_t = type { [3 x <8 x i16>] }
+%struct.int32x4x3_t = type { [3 x <4 x i32>] }
+%struct.int64x2x3_t = type { [3 x <2 x i64>] }
+%struct.float32x4x3_t = type { [3 x <4 x float>] }
+%struct.float64x2x3_t = type { [3 x <2 x double>] }
+%struct.int8x8x3_t = type { [3 x <8 x i8>] }
+%struct.int16x4x3_t = type { [3 x <4 x i16>] }
+%struct.int32x2x3_t = type { [3 x <2 x i32>] }
+%struct.int64x1x3_t = type { [3 x <1 x i64>] }
+%struct.float32x2x3_t = type { [3 x <2 x float>] }
+%struct.float64x1x3_t = type { [3 x <1 x double>] }
+%struct.int8x16x4_t = type { [4 x <16 x i8>] }
+%struct.int16x8x4_t = type { [4 x <8 x i16>] }
+%struct.int32x4x4_t = type { [4 x <4 x i32>] }
+%struct.int64x2x4_t = type { [4 x <2 x i64>] }
+%struct.float32x4x4_t = type { [4 x <4 x float>] }
+%struct.float64x2x4_t = type { [4 x <2 x double>] }
+%struct.int8x8x4_t = type { [4 x <8 x i8>] }
+%struct.int16x4x4_t = type { [4 x <4 x i16>] }
+%struct.int32x2x4_t = type { [4 x <2 x i32>] }
+%struct.int64x1x4_t = type { [4 x <1 x i64>] }
+%struct.float32x2x4_t = type { [4 x <2 x float>] }
+%struct.float64x1x4_t = type { [4 x <1 x double>] }
+
+
+define <16 x i8> @test_vld1q_s8(i8* readonly %a) {
+; CHECK-LABEL: test_vld1q_s8
+; CHECK: ld1 {v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
+ %vld1 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %a, i32 1)
+ ret <16 x i8> %vld1
+}
+
+define <8 x i16> @test_vld1q_s16(i16* readonly %a) {
+; CHECK-LABEL: test_vld1q_s16
+; CHECK: ld1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
+ %1 = bitcast i16* %a to i8*
+ %vld1 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %1, i32 2)
+ ret <8 x i16> %vld1
+}
+
+define <4 x i32> @test_vld1q_s32(i32* readonly %a) {
+; CHECK-LABEL: test_vld1q_s32
+; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
+ %1 = bitcast i32* %a to i8*
+ %vld1 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %1, i32 4)
+ ret <4 x i32> %vld1
+}
+
+define <2 x i64> @test_vld1q_s64(i64* readonly %a) {
+; CHECK-LABEL: test_vld1q_s64
+; CHECK: ld1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
+ %1 = bitcast i64* %a to i8*
+ %vld1 = tail call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %1, i32 8)
+ ret <2 x i64> %vld1
+}
+
+define <4 x float> @test_vld1q_f32(float* readonly %a) {
+; CHECK-LABEL: test_vld1q_f32
+; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
+ %1 = bitcast float* %a to i8*
+ %vld1 = tail call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %1, i32 4)
+ ret <4 x float> %vld1
+}
+
+define <2 x double> @test_vld1q_f64(double* readonly %a) {
+; CHECK-LABEL: test_vld1q_f64
+; CHECK: ld1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast double* %a to i8*
+ %vld1 = tail call <2 x double> @llvm.arm.neon.vld1.v2f64(i8* %1, i32 8)
+ ret <2 x double> %vld1
+}
+
+define <8 x i8> @test_vld1_s8(i8* readonly %a) {
+; CHECK-LABEL: test_vld1_s8
+; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
+ %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %a, i32 1)
+ ret <8 x i8> %vld1
+}
+
+define <4 x i16> @test_vld1_s16(i16* readonly %a) {
+; CHECK-LABEL: test_vld1_s16
+; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
+ %1 = bitcast i16* %a to i8*
+ %vld1 = tail call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %1, i32 2)
+ ret <4 x i16> %vld1
+}
+
+define <2 x i32> @test_vld1_s32(i32* readonly %a) {
+; CHECK-LABEL: test_vld1_s32
+; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
+ %1 = bitcast i32* %a to i8*
+ %vld1 = tail call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %1, i32 4)
+ ret <2 x i32> %vld1
+}
+
+define <1 x i64> @test_vld1_s64(i64* readonly %a) {
+; CHECK-LABEL: test_vld1_s64
+; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
+ %1 = bitcast i64* %a to i8*
+ %vld1 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %1, i32 8)
+ ret <1 x i64> %vld1
+}
+
+define <2 x float> @test_vld1_f32(float* readonly %a) {
+; CHECK-LABEL: test_vld1_f32
+; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
+ %1 = bitcast float* %a to i8*
+ %vld1 = tail call <2 x float> @llvm.arm.neon.vld1.v2f32(i8* %1, i32 4)
+ ret <2 x float> %vld1
+}
+
+define <1 x double> @test_vld1_f64(double* readonly %a) {
+; CHECK-LABEL: test_vld1_f64
+; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
+ %1 = bitcast double* %a to i8*
+ %vld1 = tail call <1 x double> @llvm.arm.neon.vld1.v1f64(i8* %1, i32 8)
+ ret <1 x double> %vld1
+}
+
+define <8 x i8> @test_vld1_p8(i8* readonly %a) {
+; CHECK-LABEL: test_vld1_p8
+; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
+ %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %a, i32 1)
+ ret <8 x i8> %vld1
+}
+
+define <4 x i16> @test_vld1_p16(i16* readonly %a) {
+; CHECK-LABEL: test_vld1_p16
+; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
+ %1 = bitcast i16* %a to i8*
+ %vld1 = tail call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %1, i32 2)
+ ret <4 x i16> %vld1
+}
+
+define %struct.int8x16x2_t @test_vld2q_s8(i8* readonly %a) {
+; CHECK-LABEL: test_vld2q_s8
+; CHECK: ld2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
+ %vld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8* %a, i32 1)
+ %vld2.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 0
+ %vld2.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 1
+ %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vld2.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2.fca.1.extract, 0, 1
+ ret %struct.int8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x8x2_t @test_vld2q_s16(i16* readonly %a) {
+; CHECK-LABEL: test_vld2q_s16
+; CHECK: ld2 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i16* %a to i8*
+ %vld2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16(i8* %1, i32 2)
+ %vld2.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2, 0
+ %vld2.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2, 1
+ %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2.fca.1.extract, 0, 1
+ ret %struct.int16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x4x2_t @test_vld2q_s32(i32* readonly %a) {
+; CHECK-LABEL: test_vld2q_s32
+; CHECK: ld2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i32* %a to i8*
+ %vld2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8* %1, i32 4)
+ %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
+ %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
+ %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2.fca.1.extract, 0, 1
+ ret %struct.int32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int64x2x2_t @test_vld2q_s64(i64* readonly %a) {
+; CHECK-LABEL: test_vld2q_s64
+; CHECK: ld2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i64* %a to i8*
+ %vld2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2.v2i64(i8* %1, i32 8)
+ %vld2.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2, 0
+ %vld2.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2, 1
+ %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2.fca.1.extract, 0, 1
+ ret %struct.int64x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x4x2_t @test_vld2q_f32(float* readonly %a) {
+; CHECK-LABEL: test_vld2q_f32
+; CHECK: ld2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast float* %a to i8*
+ %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %1, i32 4)
+ %vld2.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2, 0
+ %vld2.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2, 1
+ %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2.fca.1.extract, 0, 1
+ ret %struct.float32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.float64x2x2_t @test_vld2q_f64(double* readonly %a) {
+; CHECK-LABEL: test_vld2q_f64
+; CHECK: ld2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast double* %a to i8*
+ %vld2 = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2.v2f64(i8* %1, i32 8)
+ %vld2.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2, 0
+ %vld2.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2, 1
+ %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2.fca.1.extract, 0, 1
+ ret %struct.float64x2x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x8x2_t @test_vld2_s8(i8* readonly %a) {
+; CHECK-LABEL: test_vld2_s8
+; CHECK: ld2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
+ %vld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8* %a, i32 1)
+ %vld2.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 0
+ %vld2.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 1
+ %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2.fca.1.extract, 0, 1
+ ret %struct.int8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x4x2_t @test_vld2_s16(i16* readonly %a) {
+; CHECK-LABEL: test_vld2_s16
+; CHECK: ld2 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i16* %a to i8*
+ %vld2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16(i8* %1, i32 2)
+ %vld2.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2, 0
+ %vld2.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2, 1
+ %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2.fca.1.extract, 0, 1
+ ret %struct.int16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x2x2_t @test_vld2_s32(i32* readonly %a) {
+; CHECK-LABEL: test_vld2_s32
+; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i32* %a to i8*
+ %vld2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8* %1, i32 4)
+ %vld2.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2, 0
+ %vld2.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2, 1
+ %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2.fca.1.extract, 0, 1
+ ret %struct.int32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.int64x1x2_t @test_vld2_s64(i64* readonly %a) {
+; CHECK-LABEL: test_vld2_s64
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i64* %a to i8*
+ %vld2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %1, i32 8)
+ %vld2.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2, 0
+ %vld2.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2, 1
+ %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2.fca.1.extract, 0, 1
+ ret %struct.int64x1x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x2x2_t @test_vld2_f32(float* readonly %a) {
+; CHECK-LABEL: test_vld2_f32
+; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast float* %a to i8*
+ %vld2 = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8* %1, i32 4)
+ %vld2.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2, 0
+ %vld2.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2, 1
+ %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2.fca.1.extract, 0, 1
+ ret %struct.float32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float64x1x2_t @test_vld2_f64(double* readonly %a) {
+; CHECK-LABEL: test_vld2_f64
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast double* %a to i8*
+ %vld2 = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %1, i32 8)
+ %vld2.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2, 0
+ %vld2.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2, 1
+ %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2.fca.1.extract, 0, 1
+ ret %struct.float64x1x2_t %.fca.0.1.insert
+}
+
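+; vld3 of every element type; as above, the 64-bit single-element variants are
+; checked against a three-register ld1 rather than ld3.
+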
+define %struct.int8x16x3_t @test_vld3q_s8(i8* readonly %a) {
+; CHECK-LABEL: test_vld3q_s8
+; CHECK: ld3 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+ %vld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %a, i32 1)
+ %vld3.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 0
+ %vld3.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 1
+ %vld3.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 2
+ %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %vld3.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3.fca.2.extract, 0, 2
+ ret %struct.int8x16x3_t %.fca.0.2.insert
+}
+
+define %struct.int16x8x3_t @test_vld3q_s16(i16* readonly %a) {
+; CHECK-LABEL: test_vld3q_s16
+; CHECK: ld3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i16* %a to i8*
+ %vld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16(i8* %1, i32 2)
+ %vld3.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 0
+ %vld3.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 1
+ %vld3.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 2
+ %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %vld3.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %vld3.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %vld3.fca.2.extract, 0, 2
+ ret %struct.int16x8x3_t %.fca.0.2.insert
+}
+
+define %struct.int32x4x3_t @test_vld3q_s32(i32* readonly %a) {
+; CHECK-LABEL: test_vld3q_s32
+; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i32* %a to i8*
+ %vld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8* %1, i32 4)
+ %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
+ %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
+ %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
+ %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %vld3.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %vld3.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %vld3.fca.2.extract, 0, 2
+ ret %struct.int32x4x3_t %.fca.0.2.insert
+}
+
+define %struct.int64x2x3_t @test_vld3q_s64(i64* readonly %a) {
+; CHECK-LABEL: test_vld3q_s64
+; CHECK: ld3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i64* %a to i8*
+ %vld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3.v2i64(i8* %1, i32 8)
+ %vld3.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 0
+ %vld3.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 1
+ %vld3.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 2
+ %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %vld3.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %vld3.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %vld3.fca.2.extract, 0, 2
+ ret %struct.int64x2x3_t %.fca.0.2.insert
+}
+
+define %struct.float32x4x3_t @test_vld3q_f32(float* readonly %a) {
+; CHECK-LABEL: test_vld3q_f32
+; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast float* %a to i8*
+ %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8* %1, i32 4)
+ %vld3.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 0
+ %vld3.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 1
+ %vld3.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 2
+ %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %vld3.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %vld3.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %vld3.fca.2.extract, 0, 2
+ ret %struct.float32x4x3_t %.fca.0.2.insert
+}
+
+define %struct.float64x2x3_t @test_vld3q_f64(double* readonly %a) {
+; CHECK-LABEL: test_vld3q_f64
+; CHECK: ld3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast double* %a to i8*
+ %vld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3.v2f64(i8* %1, i32 8)
+ %vld3.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 0
+ %vld3.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 1
+ %vld3.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 2
+ %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %vld3.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %vld3.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %vld3.fca.2.extract, 0, 2
+ ret %struct.float64x2x3_t %.fca.0.2.insert
+}
+
+define %struct.int8x8x3_t @test_vld3_s8(i8* readonly %a) {
+; CHECK-LABEL: test_vld3_s8
+; CHECK: ld3 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
+ %vld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8* %a, i32 1)
+ %vld3.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 0
+ %vld3.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 1
+ %vld3.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 2
+ %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %vld3.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %vld3.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %vld3.fca.2.extract, 0, 2
+ ret %struct.int8x8x3_t %.fca.0.2.insert
+}
+
+define %struct.int16x4x3_t @test_vld3_s16(i16* readonly %a) {
+; CHECK-LABEL: test_vld3_s16
+; CHECK: ld3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i16* %a to i8*
+ %vld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8* %1, i32 2)
+ %vld3.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 0
+ %vld3.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 1
+ %vld3.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 2
+ %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %vld3.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %vld3.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %vld3.fca.2.extract, 0, 2
+ ret %struct.int16x4x3_t %.fca.0.2.insert
+}
+
+define %struct.int32x2x3_t @test_vld3_s32(i32* readonly %a) {
+; CHECK-LABEL: test_vld3_s32
+; CHECK: ld3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i32* %a to i8*
+ %vld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32(i8* %1, i32 4)
+ %vld3.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 0
+ %vld3.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 1
+ %vld3.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 2
+ %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %vld3.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %vld3.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %vld3.fca.2.extract, 0, 2
+ ret %struct.int32x2x3_t %.fca.0.2.insert
+}
+
+define %struct.int64x1x3_t @test_vld3_s64(i64* readonly %a) {
+; CHECK-LABEL: test_vld3_s64
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i64* %a to i8*
+ %vld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %1, i32 8)
+ %vld3.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 0
+ %vld3.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 1
+ %vld3.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 2
+ %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld3.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld3.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld3.fca.2.extract, 0, 2
+ ret %struct.int64x1x3_t %.fca.0.2.insert
+}
+
+define %struct.float32x2x3_t @test_vld3_f32(float* readonly %a) {
+; CHECK-LABEL: test_vld3_f32
+; CHECK: ld3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast float* %a to i8*
+ %vld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32(i8* %1, i32 4)
+ %vld3.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 0
+ %vld3.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 1
+ %vld3.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 2
+ %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %vld3.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %vld3.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %vld3.fca.2.extract, 0, 2
+ ret %struct.float32x2x3_t %.fca.0.2.insert
+}
+
+define %struct.float64x1x3_t @test_vld3_f64(double* readonly %a) {
+; CHECK-LABEL: test_vld3_f64
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast double* %a to i8*
+ %vld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %1, i32 8)
+ %vld3.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 0
+ %vld3.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 1
+ %vld3.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 2
+ %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld3.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld3.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld3.fca.2.extract, 0, 2
+ ret %struct.float64x1x3_t %.fca.0.2.insert
+}
+
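+; vld4 of every element type; the 64-bit single-element variants are checked
+; against a four-register ld1 rather than ld4.
+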
+define %struct.int8x16x4_t @test_vld4q_s8(i8* readonly %a) {
+; CHECK-LABEL: test_vld4q_s8
+; CHECK: ld4 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+ %vld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8* %a, i32 1)
+ %vld4.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 0
+ %vld4.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 1
+ %vld4.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 2
+ %vld4.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 3
+ %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld4.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld4.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld4.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld4.fca.3.extract, 0, 3
+ ret %struct.int8x16x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x8x4_t @test_vld4q_s16(i16* readonly %a) {
+; CHECK-LABEL: test_vld4q_s16
+; CHECK: ld4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i16* %a to i8*
+ %vld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8* %1, i32 2)
+ %vld4.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 0
+ %vld4.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 1
+ %vld4.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 2
+ %vld4.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 3
+ %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld4.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld4.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld4.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld4.fca.3.extract, 0, 3
+ ret %struct.int16x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x4x4_t @test_vld4q_s32(i32* readonly %a) {
+; CHECK-LABEL: test_vld4q_s32
+; CHECK: ld4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i32* %a to i8*
+ %vld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32(i8* %1, i32 4)
+ %vld4.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 0
+ %vld4.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 1
+ %vld4.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 2
+ %vld4.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 3
+ %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld4.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld4.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld4.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld4.fca.3.extract, 0, 3
+ ret %struct.int32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x2x4_t @test_vld4q_s64(i64* readonly %a) {
+; CHECK-LABEL: test_vld4q_s64
+; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i64* %a to i8*
+ %vld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4.v2i64(i8* %1, i32 8)
+ %vld4.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 0
+ %vld4.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 1
+ %vld4.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 2
+ %vld4.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 3
+ %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld4.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld4.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld4.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld4.fca.3.extract, 0, 3
+ ret %struct.int64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x4x4_t @test_vld4q_f32(float* readonly %a) {
+; CHECK-LABEL: test_vld4q_f32
+; CHECK: ld4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast float* %a to i8*
+ %vld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8* %1, i32 4)
+ %vld4.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 0
+ %vld4.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 1
+ %vld4.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 2
+ %vld4.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 3
+ %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld4.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld4.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld4.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld4.fca.3.extract, 0, 3
+ ret %struct.float32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x2x4_t @test_vld4q_f64(double* readonly %a) {
+; CHECK-LABEL: test_vld4q_f64
+; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast double* %a to i8*
+ %vld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4.v2f64(i8* %1, i32 8)
+ %vld4.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 0
+ %vld4.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 1
+ %vld4.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 2
+ %vld4.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 3
+ %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld4.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld4.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld4.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld4.fca.3.extract, 0, 3
+ ret %struct.float64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int8x8x4_t @test_vld4_s8(i8* readonly %a) {
+; CHECK-LABEL: test_vld4_s8
+; CHECK: ld4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
+ %vld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8* %a, i32 1)
+ %vld4.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 0
+ %vld4.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 1
+ %vld4.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 2
+ %vld4.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 3
+ %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld4.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld4.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld4.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld4.fca.3.extract, 0, 3
+ ret %struct.int8x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x4x4_t @test_vld4_s16(i16* readonly %a) {
+; CHECK-LABEL: test_vld4_s16
+; CHECK: ld4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i16* %a to i8*
+ %vld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8* %1, i32 2)
+ %vld4.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 0
+ %vld4.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 1
+ %vld4.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 2
+ %vld4.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 3
+ %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld4.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld4.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld4.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld4.fca.3.extract, 0, 3
+ ret %struct.int16x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x2x4_t @test_vld4_s32(i32* readonly %a) {
+; CHECK-LABEL: test_vld4_s32
+; CHECK: ld4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i32* %a to i8*
+ %vld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32(i8* %1, i32 4)
+ %vld4.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 0
+ %vld4.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 1
+ %vld4.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 2
+ %vld4.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 3
+ %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld4.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld4.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld4.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld4.fca.3.extract, 0, 3
+ ret %struct.int32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x1x4_t @test_vld4_s64(i64* readonly %a) {
+; CHECK-LABEL: test_vld4_s64
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i64* %a to i8*
+ %vld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %1, i32 8)
+ %vld4.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 0
+ %vld4.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 1
+ %vld4.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 2
+ %vld4.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 3
+ %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld4.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld4.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld4.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld4.fca.3.extract, 0, 3
+ ret %struct.int64x1x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x2x4_t @test_vld4_f32(float* readonly %a) {
+; CHECK-LABEL: test_vld4_f32
+; CHECK: ld4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast float* %a to i8*
+ %vld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32(i8* %1, i32 4)
+ %vld4.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 0
+ %vld4.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 1
+ %vld4.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 2
+ %vld4.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 3
+ %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld4.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld4.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld4.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld4.fca.3.extract, 0, 3
+ ret %struct.float32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x1x4_t @test_vld4_f64(double* readonly %a) {
+; CHECK-LABEL: test_vld4_f64
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast double* %a to i8*
+ %vld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %1, i32 8)
+ %vld4.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 0
+ %vld4.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 1
+ %vld4.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 2
+ %vld4.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 3
+ %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld4.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld4.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld4.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld4.fca.3.extract, 0, 3
+ ret %struct.float64x1x4_t %.fca.0.3.insert
+}
+
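+; Intrinsic declarations for the vld1/vld2/vld3/vld4 calls used above.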
+declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32)
+declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32)
+declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*, i32)
+declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32)
+declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32)
+declare <2 x double> @llvm.arm.neon.vld1.v2f64(i8*, i32)
+declare <8 x i8> @llvm.arm.neon.vld1.v8i8(i8*, i32)
+declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*, i32)
+declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32)
+declare <1 x i64> @llvm.arm.neon.vld1.v1i64(i8*, i32)
+declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*, i32)
+declare <1 x double> @llvm.arm.neon.vld1.v1f64(i8*, i32)
+declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8*, i32)
+declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16(i8*, i32)
+declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8*, i32)
+declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2.v2i64(i8*, i32)
+declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8*, i32)
+declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2.v2f64(i8*, i32)
+declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8*, i32)
+declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16(i8*, i32)
+declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8*, i32)
+declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32)
+declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8*, i32)
+declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32)
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32)
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16(i8*, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8*, i32)
+declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3.v2i64(i8*, i32)
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8*, i32)
+declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3.v2f64(i8*, i32)
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8*, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8*, i32)
+declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32(i8*, i32)
+declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32)
+declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32(i8*, i32)
+declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32)
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32)
+declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8*, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32(i8*, i32)
+declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4.v2i64(i8*, i32)
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8*, i32)
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4.v2f64(i8*, i32)
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8*, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8*, i32)
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32(i8*, i32)
+declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32)
+declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32(i8*, i32)
+declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32)
+
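+; Store counterparts: vst1 of each element type, checked against a
+; single-register st1.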
+define void @test_vst1q_s8(i8* %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vst1q_s8
+; CHECK: st1 {v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+ tail call void @llvm.arm.neon.vst1.v16i8(i8* %a, <16 x i8> %b, i32 1)
+ ret void
+}
+
+define void @test_vst1q_s16(i16* %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vst1q_s16
+; CHECK: st1 {v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst1.v8i16(i8* %1, <8 x i16> %b, i32 2)
+ ret void
+}
+
+define void @test_vst1q_s32(i32* %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vst1q_s32
+; CHECK: st1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst1.v4i32(i8* %1, <4 x i32> %b, i32 4)
+ ret void
+}
+
+define void @test_vst1q_s64(i64* %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vst1q_s64
+; CHECK: st1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst1.v2i64(i8* %1, <2 x i64> %b, i32 8)
+ ret void
+}
+
+define void @test_vst1q_f32(float* %a, <4 x float> %b) {
+; CHECK-LABEL: test_vst1q_f32
+; CHECK: st1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst1.v4f32(i8* %1, <4 x float> %b, i32 4)
+ ret void
+}
+
+define void @test_vst1q_f64(double* %a, <2 x double> %b) {
+; CHECK-LABEL: test_vst1q_f64
+; CHECK: st1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst1.v2f64(i8* %1, <2 x double> %b, i32 8)
+ ret void
+}
+
+define void @test_vst1_s8(i8* %a, <8 x i8> %b) {
+; CHECK-LABEL: test_vst1_s8
+; CHECK: st1 {v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
+ tail call void @llvm.arm.neon.vst1.v8i8(i8* %a, <8 x i8> %b, i32 1)
+ ret void
+}
+
+define void @test_vst1_s16(i16* %a, <4 x i16> %b) {
+; CHECK-LABEL: test_vst1_s16
+; CHECK: st1 {v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst1.v4i16(i8* %1, <4 x i16> %b, i32 2)
+ ret void
+}
+
+define void @test_vst1_s32(i32* %a, <2 x i32> %b) {
+; CHECK-LABEL: test_vst1_s32
+; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst1.v2i32(i8* %1, <2 x i32> %b, i32 4)
+ ret void
+}
+
+define void @test_vst1_s64(i64* %a, <1 x i64> %b) {
+; CHECK-LABEL: test_vst1_s64
+; CHECK: st1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst1.v1i64(i8* %1, <1 x i64> %b, i32 8)
+ ret void
+}
+
+define void @test_vst1_f32(float* %a, <2 x float> %b) {
+; CHECK-LABEL: test_vst1_f32
+; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst1.v2f32(i8* %1, <2 x float> %b, i32 4)
+ ret void
+}
+
+define void @test_vst1_f64(double* %a, <1 x double> %b) {
+; CHECK-LABEL: test_vst1_f64
+; CHECK: st1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst1.v1f64(i8* %1, <1 x double> %b, i32 8)
+ ret void
+}
+
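+; vst2 of every element type; the 64-bit single-element variants are checked
+; against a two-register st1 rather than st2.
+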
+define void @test_vst2q_s8(i8* %a, [2 x <16 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_s8
+; CHECK: st2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1
+ tail call void @llvm.arm.neon.vst2.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 1)
+ ret void
+}
+
+define void @test_vst2q_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_s16
+; CHECK: st2 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
+ %1 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst2.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 2)
+ ret void
+}
+
+define void @test_vst2q_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_s32
+; CHECK: st2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
+ %1 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst2.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 4)
+ ret void
+}
+
+define void @test_vst2q_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_s64
+; CHECK: st2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
+ %1 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst2.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 8)
+ ret void
+}
+
+define void @test_vst2q_f32(float* %a, [2 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_f32
+; CHECK: st2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
+ %1 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 4)
+ ret void
+}
+
+define void @test_vst2q_f64(double* %a, [2 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_f64
+; CHECK: st2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
+ %1 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst2.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 8)
+ ret void
+}
+
+define void @test_vst2_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst2_s8
+; CHECK: st2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
+ tail call void @llvm.arm.neon.vst2.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 1)
+ ret void
+}
+
+define void @test_vst2_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst2_s16
+; CHECK: st2 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
+ %1 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst2.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 2)
+ ret void
+}
+
+define void @test_vst2_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst2_s32
+; CHECK: st2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
+ %1 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst2.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 4)
+ ret void
+}
+
+define void @test_vst2_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst2_s64
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
+ %1 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst2.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 8)
+ ret void
+}
+
+define void @test_vst2_f32(float* %a, [2 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst2_f32
+; CHECK: st2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
+ %1 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst2.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 4)
+ ret void
+}
+
+define void @test_vst2_f64(double* %a, [2 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst2_f64
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
+ %1 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst2.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 8)
+ ret void
+}
+
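+; vst3 of every element type; the 64-bit single-element variants are checked
+; against a three-register st1 rather than st3.
+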
+define void @test_vst3q_s8(i8* %a, [3 x <16 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_s8
+; CHECK: st3 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2
+ tail call void @llvm.arm.neon.vst3.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 1)
+ ret void
+}
+
+define void @test_vst3q_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_s16
+; CHECK: st3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
+ %1 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst3.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 2)
+ ret void
+}
+
+define void @test_vst3q_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_s32
+; CHECK: st3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
+ %1 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst3.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 4)
+ ret void
+}
+
+define void @test_vst3q_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_s64
+; CHECK: st3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
+ %1 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst3.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 8)
+ ret void
+}
+
+define void @test_vst3q_f32(float* %a, [3 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_f32
+; CHECK: st3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
+ %1 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst3.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 4)
+ ret void
+}
+
+define void @test_vst3q_f64(double* %a, [3 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_f64
+; CHECK: st3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
+ %1 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst3.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 8)
+ ret void
+}
+
+define void @test_vst3_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst3_s8
+; CHECK: st3 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
+ tail call void @llvm.arm.neon.vst3.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 1)
+ ret void
+}
+
+define void @test_vst3_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst3_s16
+; CHECK: st3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
+ %1 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst3.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 2)
+ ret void
+}
+
+define void @test_vst3_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst3_s32
+; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
+ %1 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst3.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 4)
+ ret void
+}
+
+define void @test_vst3_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst3_s64
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
+ %1 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst3.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 8)
+ ret void
+}
+
+define void @test_vst3_f32(float* %a, [3 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst3_f32
+; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
+ %1 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst3.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 4)
+ ret void
+}
+
+define void @test_vst3_f64(double* %a, [3 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst3_f64
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
+ %1 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst3.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 8)
+ ret void
+}
+
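+; vst4 of every element type; the 64-bit single-element variants are checked
+; against a four-register st1 rather than st4.
+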
+define void @test_vst4q_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_s8
+; CHECK: st4 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
+ tail call void @llvm.arm.neon.vst4.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 1)
+ ret void
+}
+
+define void @test_vst4q_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_s16
+; CHECK: st4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
+ %1 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst4.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 2)
+ ret void
+}
+
+define void @test_vst4q_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_s32
+; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
+ %1 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst4.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 4)
+ ret void
+}
+
+define void @test_vst4q_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_s64
+; CHECK: st4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
+ %1 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst4.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 8)
+ ret void
+}
+
+define void @test_vst4q_f32(float* %a, [4 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_f32
+; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
+ %1 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst4.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 4)
+ ret void
+}
+
+define void @test_vst4q_f64(double* %a, [4 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_f64
+; CHECK: st4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
+ %1 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst4.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 8)
+ ret void
+}
+
+define void @test_vst4_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst4_s8
+; CHECK: st4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
+ tail call void @llvm.arm.neon.vst4.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 1)
+ ret void
+}
+
+define void @test_vst4_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst4_s16
+; CHECK: st4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
+ %1 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst4.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 2)
+ ret void
+}
+
+define void @test_vst4_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst4_s32
+; CHECK: st4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
+ %1 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst4.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 4)
+ ret void
+}
+
+define void @test_vst4_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst4_s64
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
+ %1 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst4.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 8)
+ ret void
+}
+
+define void @test_vst4_f32(float* %a, [4 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst4_f32
+; CHECK: st4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
+ %1 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst4.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 4)
+ ret void
+}
+
+define void @test_vst4_f64(double* %a, [4 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst4_f64
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
+ %1 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst4.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 8)
+ ret void
+}
+
+declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32)
+declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32)
+declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>, i32)
+declare void @llvm.arm.neon.vst1.v2i64(i8*, <2 x i64>, i32)
+declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32)
+declare void @llvm.arm.neon.vst1.v2f64(i8*, <2 x double>, i32)
+declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>, i32)
+declare void @llvm.arm.neon.vst1.v4i16(i8*, <4 x i16>, i32)
+declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32)
+declare void @llvm.arm.neon.vst1.v1i64(i8*, <1 x i64>, i32)
+declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32)
+declare void @llvm.arm.neon.vst1.v1f64(i8*, <1 x double>, i32)
+declare void @llvm.arm.neon.vst2.v16i8(i8*, <16 x i8>, <16 x i8>, i32)
+declare void @llvm.arm.neon.vst2.v8i16(i8*, <8 x i16>, <8 x i16>, i32)
+declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>, i32)
+declare void @llvm.arm.neon.vst2.v2i64(i8*, <2 x i64>, <2 x i64>, i32)
+declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32)
+declare void @llvm.arm.neon.vst2.v2f64(i8*, <2 x double>, <2 x double>, i32)
+declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32)
+declare void @llvm.arm.neon.vst2.v4i16(i8*, <4 x i16>, <4 x i16>, i32)
+declare void @llvm.arm.neon.vst2.v2i32(i8*, <2 x i32>, <2 x i32>, i32)
+declare void @llvm.arm.neon.vst2.v1i64(i8*, <1 x i64>, <1 x i64>, i32)
+declare void @llvm.arm.neon.vst2.v2f32(i8*, <2 x float>, <2 x float>, i32)
+declare void @llvm.arm.neon.vst2.v1f64(i8*, <1 x double>, <1 x double>, i32)
+declare void @llvm.arm.neon.vst3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32)
+declare void @llvm.arm.neon.vst3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32)
+declare void @llvm.arm.neon.vst3.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32)
+declare void @llvm.arm.neon.vst3.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32)
+declare void @llvm.arm.neon.vst3.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32)
+declare void @llvm.arm.neon.vst3.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32)
+declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
+declare void @llvm.arm.neon.vst3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32)
+declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32)
+declare void @llvm.arm.neon.vst3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32)
+declare void @llvm.arm.neon.vst3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32)
+declare void @llvm.arm.neon.vst3.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32)
+declare void @llvm.arm.neon.vst4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32)
+declare void @llvm.arm.neon.vst4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32)
+declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32)
+declare void @llvm.arm.neon.vst4.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32)
+declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32)
+declare void @llvm.arm.neon.vst4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32)
+declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32)
+declare void @llvm.arm.neon.vst4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32)
+declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32)
+declare void @llvm.arm.neon.vst4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32)
+declare void @llvm.arm.neon.vst4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32)
+declare void @llvm.arm.neon.vst4.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32)
+
+define %struct.int8x16x2_t @test_vld1q_s8_x2(i8* %a) {
+; CHECK-LABEL: test_vld1q_s8_x2
+; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+ %1 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8* %a, i32 1)
+ %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0
+ %3 = extractvalue { <16 x i8>, <16 x i8> } %1, 1
+ %4 = insertvalue %struct.int8x16x2_t undef, <16 x i8> %2, 0, 0
+ %5 = insertvalue %struct.int8x16x2_t %4, <16 x i8> %3, 0, 1
+ ret %struct.int8x16x2_t %5
+}
+
+define %struct.int16x8x2_t @test_vld1q_s16_x2(i16* %a) {
+; CHECK-LABEL: test_vld1q_s16_x2
+; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i16* %a to i8*
+ %2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8* %1, i32 2)
+ %3 = extractvalue { <8 x i16>, <8 x i16> } %2, 0
+ %4 = extractvalue { <8 x i16>, <8 x i16> } %2, 1
+ %5 = insertvalue %struct.int16x8x2_t undef, <8 x i16> %3, 0, 0
+ %6 = insertvalue %struct.int16x8x2_t %5, <8 x i16> %4, 0, 1
+ ret %struct.int16x8x2_t %6
+}
+
+define %struct.int32x4x2_t @test_vld1q_s32_x2(i32* %a) {
+; CHECK-LABEL: test_vld1q_s32_x2
+; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i32* %a to i8*
+ %2 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x2.v4i32(i8* %1, i32 4)
+ %3 = extractvalue { <4 x i32>, <4 x i32> } %2, 0
+ %4 = extractvalue { <4 x i32>, <4 x i32> } %2, 1
+ %5 = insertvalue %struct.int32x4x2_t undef, <4 x i32> %3, 0, 0
+ %6 = insertvalue %struct.int32x4x2_t %5, <4 x i32> %4, 0, 1
+ ret %struct.int32x4x2_t %6
+}
+
+define %struct.int64x2x2_t @test_vld1q_s64_x2(i64* %a) {
+; CHECK-LABEL: test_vld1q_s64_x2
+; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i64* %a to i8*
+ %2 = tail call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x2.v2i64(i8* %1, i32 8)
+ %3 = extractvalue { <2 x i64>, <2 x i64> } %2, 0
+ %4 = extractvalue { <2 x i64>, <2 x i64> } %2, 1
+ %5 = insertvalue %struct.int64x2x2_t undef, <2 x i64> %3, 0, 0
+ %6 = insertvalue %struct.int64x2x2_t %5, <2 x i64> %4, 0, 1
+ ret %struct.int64x2x2_t %6
+}
+
+define %struct.float32x4x2_t @test_vld1q_f32_x2(float* %a) {
+; CHECK-LABEL: test_vld1q_f32_x2
+; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast float* %a to i8*
+ %2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x2.v4f32(i8* %1, i32 4)
+ %3 = extractvalue { <4 x float>, <4 x float> } %2, 0
+ %4 = extractvalue { <4 x float>, <4 x float> } %2, 1
+ %5 = insertvalue %struct.float32x4x2_t undef, <4 x float> %3, 0, 0
+ %6 = insertvalue %struct.float32x4x2_t %5, <4 x float> %4, 0, 1
+ ret %struct.float32x4x2_t %6
+}
+
+define %struct.float64x2x2_t @test_vld1q_f64_x2(double* %a) {
+; CHECK-LABEL: test_vld1q_f64_x2
+; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast double* %a to i8*
+ %2 = tail call { <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x2.v2f64(i8* %1, i32 8)
+ %3 = extractvalue { <2 x double>, <2 x double> } %2, 0
+ %4 = extractvalue { <2 x double>, <2 x double> } %2, 1
+ %5 = insertvalue %struct.float64x2x2_t undef, <2 x double> %3, 0, 0
+ %6 = insertvalue %struct.float64x2x2_t %5, <2 x double> %4, 0, 1
+ ret %struct.float64x2x2_t %6
+}
+
+define %struct.int8x8x2_t @test_vld1_s8_x2(i8* %a) {
+; CHECK-LABEL: test_vld1_s8_x2
+; CHECK: ld1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
+ %1 = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x2.v8i8(i8* %a, i32 1)
+ %2 = extractvalue { <8 x i8>, <8 x i8> } %1, 0
+ %3 = extractvalue { <8 x i8>, <8 x i8> } %1, 1
+ %4 = insertvalue %struct.int8x8x2_t undef, <8 x i8> %2, 0, 0
+ %5 = insertvalue %struct.int8x8x2_t %4, <8 x i8> %3, 0, 1
+ ret %struct.int8x8x2_t %5
+}
+
+define %struct.int16x4x2_t @test_vld1_s16_x2(i16* %a) {
+; CHECK-LABEL: test_vld1_s16_x2
+; CHECK: ld1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i16* %a to i8*
+ %2 = tail call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x2.v4i16(i8* %1, i32 2)
+ %3 = extractvalue { <4 x i16>, <4 x i16> } %2, 0
+ %4 = extractvalue { <4 x i16>, <4 x i16> } %2, 1
+ %5 = insertvalue %struct.int16x4x2_t undef, <4 x i16> %3, 0, 0
+ %6 = insertvalue %struct.int16x4x2_t %5, <4 x i16> %4, 0, 1
+ ret %struct.int16x4x2_t %6
+}
+
+define %struct.int32x2x2_t @test_vld1_s32_x2(i32* %a) {
+; CHECK-LABEL: test_vld1_s32_x2
+; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i32* %a to i8*
+ %2 = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x2.v2i32(i8* %1, i32 4)
+ %3 = extractvalue { <2 x i32>, <2 x i32> } %2, 0
+ %4 = extractvalue { <2 x i32>, <2 x i32> } %2, 1
+ %5 = insertvalue %struct.int32x2x2_t undef, <2 x i32> %3, 0, 0
+ %6 = insertvalue %struct.int32x2x2_t %5, <2 x i32> %4, 0, 1
+ ret %struct.int32x2x2_t %6
+}
+
+define %struct.int64x1x2_t @test_vld1_s64_x2(i64* %a) {
+; CHECK-LABEL: test_vld1_s64_x2
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i64* %a to i8*
+ %2 = tail call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x2.v1i64(i8* %1, i32 8)
+ %3 = extractvalue { <1 x i64>, <1 x i64> } %2, 0
+ %4 = extractvalue { <1 x i64>, <1 x i64> } %2, 1
+ %5 = insertvalue %struct.int64x1x2_t undef, <1 x i64> %3, 0, 0
+ %6 = insertvalue %struct.int64x1x2_t %5, <1 x i64> %4, 0, 1
+ ret %struct.int64x1x2_t %6
+}
+
+define %struct.float32x2x2_t @test_vld1_f32_x2(float* %a) {
+; CHECK-LABEL: test_vld1_f32_x2
+; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast float* %a to i8*
+ %2 = tail call { <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x2.v2f32(i8* %1, i32 4)
+ %3 = extractvalue { <2 x float>, <2 x float> } %2, 0
+ %4 = extractvalue { <2 x float>, <2 x float> } %2, 1
+ %5 = insertvalue %struct.float32x2x2_t undef, <2 x float> %3, 0, 0
+ %6 = insertvalue %struct.float32x2x2_t %5, <2 x float> %4, 0, 1
+ ret %struct.float32x2x2_t %6
+}
+
+define %struct.float64x1x2_t @test_vld1_f64_x2(double* %a) {
+; CHECK-LABEL: test_vld1_f64_x2
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast double* %a to i8*
+ %2 = tail call { <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x2.v1f64(i8* %1, i32 8)
+ %3 = extractvalue { <1 x double>, <1 x double> } %2, 0
+ %4 = extractvalue { <1 x double>, <1 x double> } %2, 1
+ %5 = insertvalue %struct.float64x1x2_t undef, <1 x double> %3, 0, 0
+ %6 = insertvalue %struct.float64x1x2_t %5, <1 x double> %4, 0, 1
+ ret %struct.float64x1x2_t %6
+}
+
+define %struct.int8x16x3_t @test_vld1q_s8_x3(i8* %a) {
+; CHECK-LABEL: test_vld1q_s8_x3
+; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+ %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x3.v16i8(i8* %a, i32 1)
+ %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %1, 0
+ %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %1, 1
+ %4 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %1, 2
+ %5 = insertvalue %struct.int8x16x3_t undef, <16 x i8> %2, 0, 0
+ %6 = insertvalue %struct.int8x16x3_t %5, <16 x i8> %3, 0, 1
+ %7 = insertvalue %struct.int8x16x3_t %6, <16 x i8> %4, 0, 2
+ ret %struct.int8x16x3_t %7
+}
+
+define %struct.int16x8x3_t @test_vld1q_s16_x3(i16* %a) {
+; CHECK-LABEL: test_vld1q_s16_x3
+; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i16* %a to i8*
+ %2 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8* %1, i32 2)
+ %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 0
+ %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 1
+ %5 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 2
+ %6 = insertvalue %struct.int16x8x3_t undef, <8 x i16> %3, 0, 0
+ %7 = insertvalue %struct.int16x8x3_t %6, <8 x i16> %4, 0, 1
+ %8 = insertvalue %struct.int16x8x3_t %7, <8 x i16> %5, 0, 2
+ ret %struct.int16x8x3_t %8
+}
+
+define %struct.int32x4x3_t @test_vld1q_s32_x3(i32* %a) {
+; CHECK-LABEL: test_vld1q_s32_x3
+; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i32* %a to i8*
+ %2 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x3.v4i32(i8* %1, i32 4)
+ %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %2, 0
+ %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %2, 1
+ %5 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %2, 2
+ %6 = insertvalue %struct.int32x4x3_t undef, <4 x i32> %3, 0, 0
+ %7 = insertvalue %struct.int32x4x3_t %6, <4 x i32> %4, 0, 1
+ %8 = insertvalue %struct.int32x4x3_t %7, <4 x i32> %5, 0, 2
+ ret %struct.int32x4x3_t %8
+}
+
+define %struct.int64x2x3_t @test_vld1q_s64_x3(i64* %a) {
+; CHECK-LABEL: test_vld1q_s64_x3
+; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i64* %a to i8*
+ %2 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8* %1, i32 8)
+ %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 0
+ %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 1
+ %5 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 2
+ %6 = insertvalue %struct.int64x2x3_t undef, <2 x i64> %3, 0, 0
+ %7 = insertvalue %struct.int64x2x3_t %6, <2 x i64> %4, 0, 1
+ %8 = insertvalue %struct.int64x2x3_t %7, <2 x i64> %5, 0, 2
+ ret %struct.int64x2x3_t %8
+}
+
+define %struct.float32x4x3_t @test_vld1q_f32_x3(float* %a) {
+; CHECK-LABEL: test_vld1q_f32_x3
+; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast float* %a to i8*
+ %2 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x3.v4f32(i8* %1, i32 4)
+ %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %2, 0
+ %4 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %2, 1
+ %5 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %2, 2
+ %6 = insertvalue %struct.float32x4x3_t undef, <4 x float> %3, 0, 0
+ %7 = insertvalue %struct.float32x4x3_t %6, <4 x float> %4, 0, 1
+ %8 = insertvalue %struct.float32x4x3_t %7, <4 x float> %5, 0, 2
+ ret %struct.float32x4x3_t %8
+}
+
+define %struct.float64x2x3_t @test_vld1q_f64_x3(double* %a) {
+; CHECK-LABEL: test_vld1q_f64_x3
+; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast double* %a to i8*
+ %2 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x3.v2f64(i8* %1, i32 8)
+ %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %2, 0
+ %4 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %2, 1
+ %5 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %2, 2
+ %6 = insertvalue %struct.float64x2x3_t undef, <2 x double> %3, 0, 0
+ %7 = insertvalue %struct.float64x2x3_t %6, <2 x double> %4, 0, 1
+ %8 = insertvalue %struct.float64x2x3_t %7, <2 x double> %5, 0, 2
+ ret %struct.float64x2x3_t %8
+}
+
+define %struct.int8x8x3_t @test_vld1_s8_x3(i8* %a) {
+; CHECK-LABEL: test_vld1_s8_x3
+; CHECK: ld1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
+ %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x3.v8i8(i8* %a, i32 1)
+ %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
+ %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 1
+ %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 2
+ %5 = insertvalue %struct.int8x8x3_t undef, <8 x i8> %2, 0, 0
+ %6 = insertvalue %struct.int8x8x3_t %5, <8 x i8> %3, 0, 1
+ %7 = insertvalue %struct.int8x8x3_t %6, <8 x i8> %4, 0, 2
+ ret %struct.int8x8x3_t %7
+}
+
+define %struct.int16x4x3_t @test_vld1_s16_x3(i16* %a) {
+; CHECK-LABEL: test_vld1_s16_x3
+; CHECK: ld1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i16* %a to i8*
+ %2 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x3.v4i16(i8* %1, i32 2)
+ %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 0
+ %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 1
+ %5 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 2
+ %6 = insertvalue %struct.int16x4x3_t undef, <4 x i16> %3, 0, 0
+ %7 = insertvalue %struct.int16x4x3_t %6, <4 x i16> %4, 0, 1
+ %8 = insertvalue %struct.int16x4x3_t %7, <4 x i16> %5, 0, 2
+ ret %struct.int16x4x3_t %8
+}
+
+define %struct.int32x2x3_t @test_vld1_s32_x3(i32* %a) {
+; CHECK-LABEL: test_vld1_s32_x3
+; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i32* %a to i8*
+ %2 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x3.v2i32(i8* %1, i32 4)
+ %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %2, 0
+ %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %2, 1
+ %5 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %2, 2
+ %6 = insertvalue %struct.int32x2x3_t undef, <2 x i32> %3, 0, 0
+ %7 = insertvalue %struct.int32x2x3_t %6, <2 x i32> %4, 0, 1
+ %8 = insertvalue %struct.int32x2x3_t %7, <2 x i32> %5, 0, 2
+ ret %struct.int32x2x3_t %8
+}
+
+define %struct.int64x1x3_t @test_vld1_s64_x3(i64* %a) {
+; CHECK-LABEL: test_vld1_s64_x3
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i64* %a to i8*
+ %2 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x3.v1i64(i8* %1, i32 8)
+ %3 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %2, 0
+ %4 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %2, 1
+ %5 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %2, 2
+ %6 = insertvalue %struct.int64x1x3_t undef, <1 x i64> %3, 0, 0
+ %7 = insertvalue %struct.int64x1x3_t %6, <1 x i64> %4, 0, 1
+ %8 = insertvalue %struct.int64x1x3_t %7, <1 x i64> %5, 0, 2
+ ret %struct.int64x1x3_t %8
+}
+
+define %struct.float32x2x3_t @test_vld1_f32_x3(float* %a) {
+; CHECK-LABEL: test_vld1_f32_x3
+; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast float* %a to i8*
+ %2 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x3.v2f32(i8* %1, i32 4)
+ %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %2, 0
+ %4 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %2, 1
+ %5 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %2, 2
+ %6 = insertvalue %struct.float32x2x3_t undef, <2 x float> %3, 0, 0
+ %7 = insertvalue %struct.float32x2x3_t %6, <2 x float> %4, 0, 1
+ %8 = insertvalue %struct.float32x2x3_t %7, <2 x float> %5, 0, 2
+ ret %struct.float32x2x3_t %8
+}
+
+define %struct.float64x1x3_t @test_vld1_f64_x3(double* %a) {
+; CHECK-LABEL: test_vld1_f64_x3
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast double* %a to i8*
+ %2 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x3.v1f64(i8* %1, i32 8)
+ %3 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %2, 0
+ %4 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %2, 1
+ %5 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %2, 2
+ %6 = insertvalue %struct.float64x1x3_t undef, <1 x double> %3, 0, 0
+ %7 = insertvalue %struct.float64x1x3_t %6, <1 x double> %4, 0, 1
+ %8 = insertvalue %struct.float64x1x3_t %7, <1 x double> %5, 0, 2
+ ret %struct.float64x1x3_t %8
+}
+
+define %struct.int8x16x4_t @test_vld1q_s8_x4(i8* %a) {
+; CHECK-LABEL: test_vld1q_s8_x4
+; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+ %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x4.v16i8(i8* %a, i32 1)
+ %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 0
+ %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 1
+ %4 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 2
+ %5 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %1, 3
+ %6 = insertvalue %struct.int8x16x4_t undef, <16 x i8> %2, 0, 0
+ %7 = insertvalue %struct.int8x16x4_t %6, <16 x i8> %3, 0, 1
+ %8 = insertvalue %struct.int8x16x4_t %7, <16 x i8> %4, 0, 2
+ %9 = insertvalue %struct.int8x16x4_t %8, <16 x i8> %5, 0, 3
+ ret %struct.int8x16x4_t %9
+}
+
+define %struct.int16x8x4_t @test_vld1q_s16_x4(i16* %a) {
+; CHECK-LABEL: test_vld1q_s16_x4
+; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i16* %a to i8*
+ %2 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x4.v8i16(i8* %1, i32 2)
+ %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 0
+ %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 1
+ %5 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 2
+ %6 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %2, 3
+ %7 = insertvalue %struct.int16x8x4_t undef, <8 x i16> %3, 0, 0
+ %8 = insertvalue %struct.int16x8x4_t %7, <8 x i16> %4, 0, 1
+ %9 = insertvalue %struct.int16x8x4_t %8, <8 x i16> %5, 0, 2
+ %10 = insertvalue %struct.int16x8x4_t %9, <8 x i16> %6, 0, 3
+ ret %struct.int16x8x4_t %10
+}
+
+define %struct.int32x4x4_t @test_vld1q_s32_x4(i32* %a) {
+; CHECK-LABEL: test_vld1q_s32_x4
+; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i32* %a to i8*
+ %2 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x4.v4i32(i8* %1, i32 4)
+ %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 0
+ %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 1
+ %5 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 2
+ %6 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %2, 3
+ %7 = insertvalue %struct.int32x4x4_t undef, <4 x i32> %3, 0, 0
+ %8 = insertvalue %struct.int32x4x4_t %7, <4 x i32> %4, 0, 1
+ %9 = insertvalue %struct.int32x4x4_t %8, <4 x i32> %5, 0, 2
+ %10 = insertvalue %struct.int32x4x4_t %9, <4 x i32> %6, 0, 3
+ ret %struct.int32x4x4_t %10
+}
+
+define %struct.int64x2x4_t @test_vld1q_s64_x4(i64* %a) {
+; CHECK-LABEL: test_vld1q_s64_x4
+; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i64* %a to i8*
+ %2 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x4.v2i64(i8* %1, i32 8)
+ %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 0
+ %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 1
+ %5 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 2
+ %6 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %2, 3
+ %7 = insertvalue %struct.int64x2x4_t undef, <2 x i64> %3, 0, 0
+ %8 = insertvalue %struct.int64x2x4_t %7, <2 x i64> %4, 0, 1
+ %9 = insertvalue %struct.int64x2x4_t %8, <2 x i64> %5, 0, 2
+ %10 = insertvalue %struct.int64x2x4_t %9, <2 x i64> %6, 0, 3
+ ret %struct.int64x2x4_t %10
+}
+
+define %struct.float32x4x4_t @test_vld1q_f32_x4(float* %a) {
+; CHECK-LABEL: test_vld1q_f32_x4
+; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast float* %a to i8*
+ %2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8* %1, i32 4)
+ %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 0
+ %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 1
+ %5 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 2
+ %6 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 3
+ %7 = insertvalue %struct.float32x4x4_t undef, <4 x float> %3, 0, 0
+ %8 = insertvalue %struct.float32x4x4_t %7, <4 x float> %4, 0, 1
+ %9 = insertvalue %struct.float32x4x4_t %8, <4 x float> %5, 0, 2
+ %10 = insertvalue %struct.float32x4x4_t %9, <4 x float> %6, 0, 3
+ ret %struct.float32x4x4_t %10
+}
+
+define %struct.float64x2x4_t @test_vld1q_f64_x4(double* %a) {
+; CHECK-LABEL: test_vld1q_f64_x4
+; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast double* %a to i8*
+ %2 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x4.v2f64(i8* %1, i32 8)
+ %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 0
+ %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 1
+ %5 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 2
+ %6 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 3
+ %7 = insertvalue %struct.float64x2x4_t undef, <2 x double> %3, 0, 0
+ %8 = insertvalue %struct.float64x2x4_t %7, <2 x double> %4, 0, 1
+ %9 = insertvalue %struct.float64x2x4_t %8, <2 x double> %5, 0, 2
+ %10 = insertvalue %struct.float64x2x4_t %9, <2 x double> %6, 0, 3
+ ret %struct.float64x2x4_t %10
+}
+
+define %struct.int8x8x4_t @test_vld1_s8_x4(i8* %a) {
+; CHECK-LABEL: test_vld1_s8_x4
+; CHECK: ld1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
+ %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8* %a, i32 1)
+ %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
+ %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 1
+ %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 2
+ %5 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 3
+ %6 = insertvalue %struct.int8x8x4_t undef, <8 x i8> %2, 0, 0
+ %7 = insertvalue %struct.int8x8x4_t %6, <8 x i8> %3, 0, 1
+ %8 = insertvalue %struct.int8x8x4_t %7, <8 x i8> %4, 0, 2
+ %9 = insertvalue %struct.int8x8x4_t %8, <8 x i8> %5, 0, 3
+ ret %struct.int8x8x4_t %9
+}
+
+define %struct.int16x4x4_t @test_vld1_s16_x4(i16* %a) {
+; CHECK-LABEL: test_vld1_s16_x4
+; CHECK: ld1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i16* %a to i8*
+ %2 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x4.v4i16(i8* %1, i32 2)
+ %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 0
+ %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 1
+ %5 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 2
+ %6 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %2, 3
+ %7 = insertvalue %struct.int16x4x4_t undef, <4 x i16> %3, 0, 0
+ %8 = insertvalue %struct.int16x4x4_t %7, <4 x i16> %4, 0, 1
+ %9 = insertvalue %struct.int16x4x4_t %8, <4 x i16> %5, 0, 2
+ %10 = insertvalue %struct.int16x4x4_t %9, <4 x i16> %6, 0, 3
+ ret %struct.int16x4x4_t %10
+}
+
+define %struct.int32x2x4_t @test_vld1_s32_x4(i32* %a) {
+; CHECK-LABEL: test_vld1_s32_x4
+; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i32* %a to i8*
+ %2 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x4.v2i32(i8* %1, i32 4)
+ %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 0
+ %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 1
+ %5 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 2
+ %6 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 3
+ %7 = insertvalue %struct.int32x2x4_t undef, <2 x i32> %3, 0, 0
+ %8 = insertvalue %struct.int32x2x4_t %7, <2 x i32> %4, 0, 1
+ %9 = insertvalue %struct.int32x2x4_t %8, <2 x i32> %5, 0, 2
+ %10 = insertvalue %struct.int32x2x4_t %9, <2 x i32> %6, 0, 3
+ ret %struct.int32x2x4_t %10
+}
+
+define %struct.int64x1x4_t @test_vld1_s64_x4(i64* %a) {
+; CHECK-LABEL: test_vld1_s64_x4
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast i64* %a to i8*
+ %2 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x4.v1i64(i8* %1, i32 8)
+ %3 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 0
+ %4 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 1
+ %5 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 2
+ %6 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %2, 3
+ %7 = insertvalue %struct.int64x1x4_t undef, <1 x i64> %3, 0, 0
+ %8 = insertvalue %struct.int64x1x4_t %7, <1 x i64> %4, 0, 1
+ %9 = insertvalue %struct.int64x1x4_t %8, <1 x i64> %5, 0, 2
+ %10 = insertvalue %struct.int64x1x4_t %9, <1 x i64> %6, 0, 3
+ ret %struct.int64x1x4_t %10
+}
+
+define %struct.float32x2x4_t @test_vld1_f32_x4(float* %a) {
+; CHECK-LABEL: test_vld1_f32_x4
+; CHECK: ld1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %1 = bitcast float* %a to i8*
+ %2 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x4.v2f32(i8* %1, i32 4)
+ %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 0
+ %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 1
+ %5 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 2
+ %6 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %2, 3
+ %7 = insertvalue %struct.float32x2x4_t undef, <2 x float> %3, 0, 0
+ %8 = insertvalue %struct.float32x2x4_t %7, <2 x float> %4, 0, 1
+ %9 = insertvalue %struct.float32x2x4_t %8, <2 x float> %5, 0, 2
+ %10 = insertvalue %struct.float32x2x4_t %9, <2 x float> %6, 0, 3
+ ret %struct.float32x2x4_t %10
+}
+
+define %struct.float64x1x4_t @test_vld1_f64_x4(double* %a) {
+; CHECK-LABEL: test_vld1_f64_x4
+; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %1 = bitcast double* %a to i8*
+ %2 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x4.v1f64(i8* %1, i32 8)
+ %3 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 0
+ %4 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 1
+ %5 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 2
+ %6 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %2, 3
+ %7 = insertvalue %struct.float64x1x4_t undef, <1 x double> %3, 0, 0
+ %8 = insertvalue %struct.float64x1x4_t %7, <1 x double> %4, 0, 1
+ %9 = insertvalue %struct.float64x1x4_t %8, <1 x double> %5, 0, 2
+ %10 = insertvalue %struct.float64x1x4_t %9, <1 x double> %6, 0, 3
+ ret %struct.float64x1x4_t %10
+}
+
+define void @test_vst1q_s8_x2(i8* %a, [2 x <16 x i8>] %b) {
+; CHECK-LABEL: test_vst1q_s8_x2
+; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [2 x <16 x i8>] %b, 0
+ %2 = extractvalue [2 x <16 x i8>] %b, 1
+ tail call void @llvm.aarch64.neon.vst1x2.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, i32 1)
+ ret void
+}
+
+define void @test_vst1q_s16_x2(i16* %a, [2 x <8 x i16>] %b) {
+; CHECK-LABEL: test_vst1q_s16_x2
+; CHECK: st1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [2 x <8 x i16>] %b, 0
+ %2 = extractvalue [2 x <8 x i16>] %b, 1
+ %3 = bitcast i16* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x2.v8i16(i8* %3, <8 x i16> %1, <8 x i16> %2, i32 2)
+ ret void
+}
+
+define void @test_vst1q_s32_x2(i32* %a, [2 x <4 x i32>] %b) {
+; CHECK-LABEL: test_vst1q_s32_x2
+; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [2 x <4 x i32>] %b, 0
+ %2 = extractvalue [2 x <4 x i32>] %b, 1
+ %3 = bitcast i32* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x2.v4i32(i8* %3, <4 x i32> %1, <4 x i32> %2, i32 4)
+ ret void
+}
+
+define void @test_vst1q_s64_x2(i64* %a, [2 x <2 x i64>] %b) {
+; CHECK-LABEL: test_vst1q_s64_x2
+; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [2 x <2 x i64>] %b, 0
+ %2 = extractvalue [2 x <2 x i64>] %b, 1
+ %3 = bitcast i64* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x2.v2i64(i8* %3, <2 x i64> %1, <2 x i64> %2, i32 8)
+ ret void
+}
+
+define void @test_vst1q_f32_x2(float* %a, [2 x <4 x float>] %b) {
+; CHECK-LABEL: test_vst1q_f32_x2
+; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [2 x <4 x float>] %b, 0
+ %2 = extractvalue [2 x <4 x float>] %b, 1
+ %3 = bitcast float* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x2.v4f32(i8* %3, <4 x float> %1, <4 x float> %2, i32 4)
+ ret void
+}
+
+define void @test_vst1q_f64_x2(double* %a, [2 x <2 x double>] %b) {
+; CHECK-LABEL: test_vst1q_f64_x2
+; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [2 x <2 x double>] %b, 0
+ %2 = extractvalue [2 x <2 x double>] %b, 1
+ %3 = bitcast double* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x2.v2f64(i8* %3, <2 x double> %1, <2 x double> %2, i32 8)
+ ret void
+}
+
+define void @test_vst1_s8_x2(i8* %a, [2 x <8 x i8>] %b) {
+; CHECK-LABEL: test_vst1_s8_x2
+; CHECK: st1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [2 x <8 x i8>] %b, 0
+ %2 = extractvalue [2 x <8 x i8>] %b, 1
+ tail call void @llvm.aarch64.neon.vst1x2.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 1)
+ ret void
+}
+
+define void @test_vst1_s16_x2(i16* %a, [2 x <4 x i16>] %b) {
+; CHECK-LABEL: test_vst1_s16_x2
+; CHECK: st1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [2 x <4 x i16>] %b, 0
+ %2 = extractvalue [2 x <4 x i16>] %b, 1
+ %3 = bitcast i16* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x2.v4i16(i8* %3, <4 x i16> %1, <4 x i16> %2, i32 2)
+ ret void
+}
+
+define void @test_vst1_s32_x2(i32* %a, [2 x <2 x i32>] %b) {
+; CHECK-LABEL: test_vst1_s32_x2
+; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [2 x <2 x i32>] %b, 0
+ %2 = extractvalue [2 x <2 x i32>] %b, 1
+ %3 = bitcast i32* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x2.v2i32(i8* %3, <2 x i32> %1, <2 x i32> %2, i32 4)
+ ret void
+}
+
+define void @test_vst1_s64_x2(i64* %a, [2 x <1 x i64>] %b) {
+; CHECK-LABEL: test_vst1_s64_x2
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [2 x <1 x i64>] %b, 0
+ %2 = extractvalue [2 x <1 x i64>] %b, 1
+ %3 = bitcast i64* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x2.v1i64(i8* %3, <1 x i64> %1, <1 x i64> %2, i32 8)
+ ret void
+}
+
+define void @test_vst1_f32_x2(float* %a, [2 x <2 x float>] %b) {
+; CHECK-LABEL: test_vst1_f32_x2
+; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [2 x <2 x float>] %b, 0
+ %2 = extractvalue [2 x <2 x float>] %b, 1
+ %3 = bitcast float* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x2.v2f32(i8* %3, <2 x float> %1, <2 x float> %2, i32 4)
+ ret void
+}
+
+define void @test_vst1_f64_x2(double* %a, [2 x <1 x double>] %b) {
+; CHECK-LABEL: test_vst1_f64_x2
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [2 x <1 x double>] %b, 0
+ %2 = extractvalue [2 x <1 x double>] %b, 1
+ %3 = bitcast double* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x2.v1f64(i8* %3, <1 x double> %1, <1 x double> %2, i32 8)
+ ret void
+}
+
+define void @test_vst1q_s8_x3(i8* %a, [3 x <16 x i8>] %b) {
+; CHECK-LABEL: test_vst1q_s8_x3
+; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [3 x <16 x i8>] %b, 0
+ %2 = extractvalue [3 x <16 x i8>] %b, 1
+ %3 = extractvalue [3 x <16 x i8>] %b, 2
+ tail call void @llvm.aarch64.neon.vst1x3.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, <16 x i8> %3, i32 1)
+ ret void
+}
+
+define void @test_vst1q_s16_x3(i16* %a, [3 x <8 x i16>] %b) {
+; CHECK-LABEL: test_vst1q_s16_x3
+; CHECK: st1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [3 x <8 x i16>] %b, 0
+ %2 = extractvalue [3 x <8 x i16>] %b, 1
+ %3 = extractvalue [3 x <8 x i16>] %b, 2
+ %4 = bitcast i16* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x3.v8i16(i8* %4, <8 x i16> %1, <8 x i16> %2, <8 x i16> %3, i32 2)
+ ret void
+}
+
+define void @test_vst1q_s32_x3(i32* %a, [3 x <4 x i32>] %b) {
+; CHECK-LABEL: test_vst1q_s32_x3
+; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [3 x <4 x i32>] %b, 0
+ %2 = extractvalue [3 x <4 x i32>] %b, 1
+ %3 = extractvalue [3 x <4 x i32>] %b, 2
+ %4 = bitcast i32* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x3.v4i32(i8* %4, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, i32 4)
+ ret void
+}
+
+define void @test_vst1q_s64_x3(i64* %a, [3 x <2 x i64>] %b) {
+; CHECK-LABEL: test_vst1q_s64_x3
+; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [3 x <2 x i64>] %b, 0
+ %2 = extractvalue [3 x <2 x i64>] %b, 1
+ %3 = extractvalue [3 x <2 x i64>] %b, 2
+ %4 = bitcast i64* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x3.v2i64(i8* %4, <2 x i64> %1, <2 x i64> %2, <2 x i64> %3, i32 8)
+ ret void
+}
+
+define void @test_vst1q_f32_x3(float* %a, [3 x <4 x float>] %b) {
+; CHECK-LABEL: test_vst1q_f32_x3
+; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [3 x <4 x float>] %b, 0
+ %2 = extractvalue [3 x <4 x float>] %b, 1
+ %3 = extractvalue [3 x <4 x float>] %b, 2
+ %4 = bitcast float* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x3.v4f32(i8* %4, <4 x float> %1, <4 x float> %2, <4 x float> %3, i32 4)
+ ret void
+}
+
+define void @test_vst1q_f64_x3(double* %a, [3 x <2 x double>] %b) {
+; CHECK-LABEL: test_vst1q_f64_x3
+; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [3 x <2 x double>] %b, 0
+ %2 = extractvalue [3 x <2 x double>] %b, 1
+ %3 = extractvalue [3 x <2 x double>] %b, 2
+ %4 = bitcast double* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x3.v2f64(i8* %4, <2 x double> %1, <2 x double> %2, <2 x double> %3, i32 8)
+ ret void
+}
+
+define void @test_vst1_s8_x3(i8* %a, [3 x <8 x i8>] %b) {
+; CHECK-LABEL: test_vst1_s8_x3
+; CHECK: st1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [3 x <8 x i8>] %b, 0
+ %2 = extractvalue [3 x <8 x i8>] %b, 1
+ %3 = extractvalue [3 x <8 x i8>] %b, 2
+ tail call void @llvm.aarch64.neon.vst1x3.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, i32 1)
+ ret void
+}
+
+define void @test_vst1_s16_x3(i16* %a, [3 x <4 x i16>] %b) {
+; CHECK-LABEL: test_vst1_s16_x3
+; CHECK: st1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [3 x <4 x i16>] %b, 0
+ %2 = extractvalue [3 x <4 x i16>] %b, 1
+ %3 = extractvalue [3 x <4 x i16>] %b, 2
+ %4 = bitcast i16* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x3.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 2)
+ ret void
+}
+
+define void @test_vst1_s32_x3(i32* %a, [3 x <2 x i32>] %b) {
+; CHECK-LABEL: test_vst1_s32_x3
+; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [3 x <2 x i32>] %b, 0
+ %2 = extractvalue [3 x <2 x i32>] %b, 1
+ %3 = extractvalue [3 x <2 x i32>] %b, 2
+ %4 = bitcast i32* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x3.v2i32(i8* %4, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, i32 4)
+ ret void
+}
+
+define void @test_vst1_s64_x3(i64* %a, [3 x <1 x i64>] %b) {
+; CHECK-LABEL: test_vst1_s64_x3
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [3 x <1 x i64>] %b, 0
+ %2 = extractvalue [3 x <1 x i64>] %b, 1
+ %3 = extractvalue [3 x <1 x i64>] %b, 2
+ %4 = bitcast i64* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x3.v1i64(i8* %4, <1 x i64> %1, <1 x i64> %2, <1 x i64> %3, i32 8)
+ ret void
+}
+
+define void @test_vst1_f32_x3(float* %a, [3 x <2 x float>] %b) {
+; CHECK-LABEL: test_vst1_f32_x3
+; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [3 x <2 x float>] %b, 0
+ %2 = extractvalue [3 x <2 x float>] %b, 1
+ %3 = extractvalue [3 x <2 x float>] %b, 2
+ %4 = bitcast float* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x3.v2f32(i8* %4, <2 x float> %1, <2 x float> %2, <2 x float> %3, i32 4)
+ ret void
+}
+
+define void @test_vst1_f64_x3(double* %a, [3 x <1 x double>] %b) {
+; CHECK-LABEL: test_vst1_f64_x3
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [3 x <1 x double>] %b, 0
+ %2 = extractvalue [3 x <1 x double>] %b, 1
+ %3 = extractvalue [3 x <1 x double>] %b, 2
+ %4 = bitcast double* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x3.v1f64(i8* %4, <1 x double> %1, <1 x double> %2, <1 x double> %3, i32 8)
+ ret void
+}
+
+define void @test_vst1q_s8_x4(i8* %a, [4 x <16 x i8>] %b) {
+; CHECK-LABEL: test_vst1q_s8_x4
+; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [4 x <16 x i8>] %b, 0
+ %2 = extractvalue [4 x <16 x i8>] %b, 1
+ %3 = extractvalue [4 x <16 x i8>] %b, 2
+ %4 = extractvalue [4 x <16 x i8>] %b, 3
+ tail call void @llvm.aarch64.neon.vst1x4.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, <16 x i8> %3, <16 x i8> %4, i32 1)
+ ret void
+}
+
+define void @test_vst1q_s16_x4(i16* %a, [4 x <8 x i16>] %b) {
+; CHECK-LABEL: test_vst1q_s16_x4
+; CHECK: st1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [4 x <8 x i16>] %b, 0
+ %2 = extractvalue [4 x <8 x i16>] %b, 1
+ %3 = extractvalue [4 x <8 x i16>] %b, 2
+ %4 = extractvalue [4 x <8 x i16>] %b, 3
+ %5 = bitcast i16* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x4.v8i16(i8* %5, <8 x i16> %1, <8 x i16> %2, <8 x i16> %3, <8 x i16> %4, i32 2)
+ ret void
+}
+
+define void @test_vst1q_s32_x4(i32* %a, [4 x <4 x i32>] %b) {
+; CHECK-LABEL: test_vst1q_s32_x4
+; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [4 x <4 x i32>] %b, 0
+ %2 = extractvalue [4 x <4 x i32>] %b, 1
+ %3 = extractvalue [4 x <4 x i32>] %b, 2
+ %4 = extractvalue [4 x <4 x i32>] %b, 3
+ %5 = bitcast i32* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x4.v4i32(i8* %5, <4 x i32> %1, <4 x i32> %2, <4 x i32> %3, <4 x i32> %4, i32 4)
+ ret void
+}
+
+define void @test_vst1q_s64_x4(i64* %a, [4 x <2 x i64>] %b) {
+; CHECK-LABEL: test_vst1q_s64_x4
+; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [4 x <2 x i64>] %b, 0
+ %2 = extractvalue [4 x <2 x i64>] %b, 1
+ %3 = extractvalue [4 x <2 x i64>] %b, 2
+ %4 = extractvalue [4 x <2 x i64>] %b, 3
+ %5 = bitcast i64* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x4.v2i64(i8* %5, <2 x i64> %1, <2 x i64> %2, <2 x i64> %3, <2 x i64> %4, i32 8)
+ ret void
+}
+
+define void @test_vst1q_f32_x4(float* %a, [4 x <4 x float>] %b) {
+; CHECK-LABEL: test_vst1q_f32_x4
+; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [4 x <4 x float>] %b, 0
+ %2 = extractvalue [4 x <4 x float>] %b, 1
+ %3 = extractvalue [4 x <4 x float>] %b, 2
+ %4 = extractvalue [4 x <4 x float>] %b, 3
+ %5 = bitcast float* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x4.v4f32(i8* %5, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, i32 4)
+ ret void
+}
+
+define void @test_vst1q_f64_x4(double* %a, [4 x <2 x double>] %b) {
+; CHECK-LABEL: test_vst1q_f64_x4
+; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [4 x <2 x double>] %b, 0
+ %2 = extractvalue [4 x <2 x double>] %b, 1
+ %3 = extractvalue [4 x <2 x double>] %b, 2
+ %4 = extractvalue [4 x <2 x double>] %b, 3
+ %5 = bitcast double* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x4.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 8)
+ ret void
+}
+
+define void @test_vst1_s8_x4(i8* %a, [4 x <8 x i8>] %b) {
+; CHECK-LABEL: test_vst1_s8_x4
+; CHECK: st1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [4 x <8 x i8>] %b, 0
+ %2 = extractvalue [4 x <8 x i8>] %b, 1
+ %3 = extractvalue [4 x <8 x i8>] %b, 2
+ %4 = extractvalue [4 x <8 x i8>] %b, 3
+ tail call void @llvm.aarch64.neon.vst1x4.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, <8 x i8> %3, <8 x i8> %4, i32 1)
+ ret void
+}
+
+define void @test_vst1_s16_x4(i16* %a, [4 x <4 x i16>] %b) {
+; CHECK-LABEL: test_vst1_s16_x4
+; CHECK: st1 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [4 x <4 x i16>] %b, 0
+ %2 = extractvalue [4 x <4 x i16>] %b, 1
+ %3 = extractvalue [4 x <4 x i16>] %b, 2
+ %4 = extractvalue [4 x <4 x i16>] %b, 3
+ %5 = bitcast i16* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x4.v4i16(i8* %5, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, <4 x i16> %4, i32 2)
+ ret void
+}
+
+define void @test_vst1_s32_x4(i32* %a, [4 x <2 x i32>] %b) {
+; CHECK-LABEL: test_vst1_s32_x4
+; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [4 x <2 x i32>] %b, 0
+ %2 = extractvalue [4 x <2 x i32>] %b, 1
+ %3 = extractvalue [4 x <2 x i32>] %b, 2
+ %4 = extractvalue [4 x <2 x i32>] %b, 3
+ %5 = bitcast i32* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x4.v2i32(i8* %5, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, <2 x i32> %4, i32 4)
+ ret void
+}
+
+define void @test_vst1_s64_x4(i64* %a, [4 x <1 x i64>] %b) {
+; CHECK-LABEL: test_vst1_s64_x4
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [4 x <1 x i64>] %b, 0
+ %2 = extractvalue [4 x <1 x i64>] %b, 1
+ %3 = extractvalue [4 x <1 x i64>] %b, 2
+ %4 = extractvalue [4 x <1 x i64>] %b, 3
+ %5 = bitcast i64* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x4.v1i64(i8* %5, <1 x i64> %1, <1 x i64> %2, <1 x i64> %3, <1 x i64> %4, i32 8)
+ ret void
+}
+
+define void @test_vst1_f32_x4(float* %a, [4 x <2 x float>] %b) {
+; CHECK-LABEL: test_vst1_f32_x4
+; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [4 x <2 x float>] %b, 0
+ %2 = extractvalue [4 x <2 x float>] %b, 1
+ %3 = extractvalue [4 x <2 x float>] %b, 2
+ %4 = extractvalue [4 x <2 x float>] %b, 3
+ %5 = bitcast float* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x4.v2f32(i8* %5, <2 x float> %1, <2 x float> %2, <2 x float> %3, <2 x float> %4, i32 4)
+ ret void
+}
+
+define void @test_vst1_f64_x4(double* %a, [4 x <1 x double>] %b) {
+; CHECK-LABEL: test_vst1_f64_x4
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
+ %1 = extractvalue [4 x <1 x double>] %b, 0
+ %2 = extractvalue [4 x <1 x double>] %b, 1
+ %3 = extractvalue [4 x <1 x double>] %b, 2
+ %4 = extractvalue [4 x <1 x double>] %b, 3
+ %5 = bitcast double* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x4.v1f64(i8* %5, <1 x double> %1, <1 x double> %2, <1 x double> %3, <1 x double> %4, i32 8)
+ ret void
+}
+
+declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8*, i32)
+declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8*, i32)
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x2.v4i32(i8*, i32)
+declare { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x2.v2i64(i8*, i32)
+declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x2.v4f32(i8*, i32)
+declare { <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x2.v2f64(i8*, i32)
+declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x2.v8i8(i8*, i32)
+declare { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x2.v4i16(i8*, i32)
+declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x2.v2i32(i8*, i32)
+declare { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x2.v1i64(i8*, i32)
+declare { <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x2.v2f32(i8*, i32)
+declare { <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x2.v1f64(i8*, i32)
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x3.v16i8(i8*, i32)
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8*, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x3.v4i32(i8*, i32)
+declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8*, i32)
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x3.v4f32(i8*, i32)
+declare { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x3.v2f64(i8*, i32)
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x3.v8i8(i8*, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x3.v4i16(i8*, i32)
+declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x3.v2i32(i8*, i32)
+declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x3.v1i64(i8*, i32)
+declare { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x3.v2f32(i8*, i32)
+declare { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x3.v1f64(i8*, i32)
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x4.v16i8(i8*, i32)
+declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x4.v8i16(i8*, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.vld1x4.v4i32(i8*, i32)
+declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x4.v2i64(i8*, i32)
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8*, i32)
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.vld1x4.v2f64(i8*, i32)
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8*, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.vld1x4.v4i16(i8*, i32)
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.vld1x4.v2i32(i8*, i32)
+declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.vld1x4.v1i64(i8*, i32)
+declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.vld1x4.v2f32(i8*, i32)
+declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.vld1x4.v1f64(i8*, i32)
+declare void @llvm.aarch64.neon.vst1x2.v16i8(i8*, <16 x i8>, <16 x i8>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v8i16(i8*, <8 x i16>, <8 x i16>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v4i32(i8*, <4 x i32>, <4 x i32>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v2i64(i8*, <2 x i64>, <2 x i64>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v4f32(i8*, <4 x float>, <4 x float>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v2f64(i8*, <2 x double>, <2 x double>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v8i8(i8*, <8 x i8>, <8 x i8>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v4i16(i8*, <4 x i16>, <4 x i16>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v2i32(i8*, <2 x i32>, <2 x i32>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v1i64(i8*, <1 x i64>, <1 x i64>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v2f32(i8*, <2 x float>, <2 x float>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v1f64(i8*, <1 x double>, <1 x double>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32)
diff --git a/test/CodeGen/AArch64/neon-simd-ldst-one.ll b/test/CodeGen/AArch64/neon-simd-ldst-one.ll
new file mode 100644
index 0000000..3f28320
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-simd-ldst-one.ll
@@ -0,0 +1,2113 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
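+; This file checks selection of the NEON "load single structure to all
+; lanes" replicating forms (LD1R/LD2R/LD3R/LD4R) and the lane-indexed
+; "load one lane" forms (LD1/LD2/LD3/LD4 {...}[lane]) from the IR
+; patterns below.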
+
+%struct.int8x16x2_t = type { [2 x <16 x i8>] }
+%struct.int16x8x2_t = type { [2 x <8 x i16>] }
+%struct.int32x4x2_t = type { [2 x <4 x i32>] }
+%struct.int64x2x2_t = type { [2 x <2 x i64>] }
+%struct.float32x4x2_t = type { [2 x <4 x float>] }
+%struct.float64x2x2_t = type { [2 x <2 x double>] }
+%struct.int8x8x2_t = type { [2 x <8 x i8>] }
+%struct.int16x4x2_t = type { [2 x <4 x i16>] }
+%struct.int32x2x2_t = type { [2 x <2 x i32>] }
+%struct.int64x1x2_t = type { [2 x <1 x i64>] }
+%struct.float32x2x2_t = type { [2 x <2 x float>] }
+%struct.float64x1x2_t = type { [2 x <1 x double>] }
+%struct.int8x16x3_t = type { [3 x <16 x i8>] }
+%struct.int16x8x3_t = type { [3 x <8 x i16>] }
+%struct.int32x4x3_t = type { [3 x <4 x i32>] }
+%struct.int64x2x3_t = type { [3 x <2 x i64>] }
+%struct.float32x4x3_t = type { [3 x <4 x float>] }
+%struct.float64x2x3_t = type { [3 x <2 x double>] }
+%struct.int8x8x3_t = type { [3 x <8 x i8>] }
+%struct.int16x4x3_t = type { [3 x <4 x i16>] }
+%struct.int32x2x3_t = type { [3 x <2 x i32>] }
+%struct.int64x1x3_t = type { [3 x <1 x i64>] }
+%struct.float32x2x3_t = type { [3 x <2 x float>] }
+%struct.float64x1x3_t = type { [3 x <1 x double>] }
+%struct.int8x16x4_t = type { [4 x <16 x i8>] }
+%struct.int16x8x4_t = type { [4 x <8 x i16>] }
+%struct.int32x4x4_t = type { [4 x <4 x i32>] }
+%struct.int64x2x4_t = type { [4 x <2 x i64>] }
+%struct.float32x4x4_t = type { [4 x <4 x float>] }
+%struct.float64x2x4_t = type { [4 x <2 x double>] }
+%struct.int8x8x4_t = type { [4 x <8 x i8>] }
+%struct.int16x4x4_t = type { [4 x <4 x i16>] }
+%struct.int32x2x4_t = type { [4 x <2 x i32>] }
+%struct.int64x1x4_t = type { [4 x <1 x i64>] }
+%struct.float32x2x4_t = type { [4 x <2 x float>] }
+%struct.float64x1x4_t = type { [4 x <1 x double>] }
+
+define <16 x i8> @test_vld1q_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld1q_dup_s8
+; CHECK: ld1r {{{v[0-9]+}}.16b}, [x0]
+entry:
+ %0 = load i8* %a, align 1
+ %1 = insertelement <16 x i8> undef, i8 %0, i32 0
+ %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
+ ret <16 x i8> %lane
+}
+
+define <8 x i16> @test_vld1q_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld1q_dup_s16
+; CHECK: ld1r {{{v[0-9]+}}.8h}, [x0]
+entry:
+ %0 = load i16* %a, align 2
+ %1 = insertelement <8 x i16> undef, i16 %0, i32 0
+ %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %lane
+}
+
+define <4 x i32> @test_vld1q_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld1q_dup_s32
+; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
+entry:
+ %0 = load i32* %a, align 4
+ %1 = insertelement <4 x i32> undef, i32 %0, i32 0
+ %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %lane
+}
+
+define <2 x i64> @test_vld1q_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld1q_dup_s64
+; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
+entry:
+ %0 = load i64* %a, align 8
+ %1 = insertelement <2 x i64> undef, i64 %0, i32 0
+ %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
+ ret <2 x i64> %lane
+}
+
+define <4 x float> @test_vld1q_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld1q_dup_f32
+; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
+entry:
+ %0 = load float* %a, align 4
+ %1 = insertelement <4 x float> undef, float %0, i32 0
+ %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %lane
+}
+
+define <2 x double> @test_vld1q_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld1q_dup_f64
+; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
+entry:
+ %0 = load double* %a, align 8
+ %1 = insertelement <2 x double> undef, double %0, i32 0
+ %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+ ret <2 x double> %lane
+}
+
+define <8 x i8> @test_vld1_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld1_dup_s8
+; CHECK: ld1r {{{v[0-9]+}}.8b}, [x0]
+entry:
+ %0 = load i8* %a, align 1
+ %1 = insertelement <8 x i8> undef, i8 %0, i32 0
+ %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+ ret <8 x i8> %lane
+}
+
+define <4 x i16> @test_vld1_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld1_dup_s16
+; CHECK: ld1r {{{v[0-9]+}}.4h}, [x0]
+entry:
+ %0 = load i16* %a, align 2
+ %1 = insertelement <4 x i16> undef, i16 %0, i32 0
+ %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
+ ret <4 x i16> %lane
+}
+
+define <2 x i32> @test_vld1_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld1_dup_s32
+; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
+entry:
+ %0 = load i32* %a, align 4
+ %1 = insertelement <2 x i32> undef, i32 %0, i32 0
+ %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
+ ret <2 x i32> %lane
+}
+
+define <1 x i64> @test_vld1_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld1_dup_s64
+; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
+entry:
+ %0 = load i64* %a, align 8
+ %1 = insertelement <1 x i64> undef, i64 %0, i32 0
+ ret <1 x i64> %1
+}
+
+define <2 x float> @test_vld1_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld1_dup_f32
+; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
+entry:
+ %0 = load float* %a, align 4
+ %1 = insertelement <2 x float> undef, float %0, i32 0
+ %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
+ ret <2 x float> %lane
+}
+
+define <1 x double> @test_vld1_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld1_dup_f64
+; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
+entry:
+ %0 = load double* %a, align 8
+ %1 = insertelement <1 x double> undef, double %0, i32 0
+ ret <1 x double> %1
+}
+
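+; The vldN_dup tests express the broadcast as a vldNlane intrinsic call on
+; undef vectors at lane 0, followed by a shufflevector splat of each
+; result; the backend is expected to fold this combination into a single
+; LD2R/LD3R/LD4R.
+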
+define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld2q_dup_s8
+; CHECK: ld2r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
+entry:
+ %vld_dup = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
+ %0 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 0
+ %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
+ %1 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 1
+ %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
+ ret %struct.int8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x8x2_t @test_vld2q_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld2q_dup_s16
+; CHECK: ld2r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
+entry:
+ %0 = bitcast i16* %a to i8*
+ %vld_dup = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
+ %1 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 0
+ %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
+ %2 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 1
+ %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
+ ret %struct.int16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x4x2_t @test_vld2q_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld2q_dup_s32
+; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+ %0 = bitcast i32* %a to i8*
+ %vld_dup = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
+ %1 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 0
+ %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
+ %2 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 1
+ %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
+ ret %struct.int32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld2q_dup_s64
+; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+ %0 = bitcast i64* %a to i8*
+ %vld_dup = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
+ %1 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 0
+ %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 1
+ %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
+ ret %struct.int64x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x4x2_t @test_vld2q_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld2q_dup_f32
+; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+ %0 = bitcast float* %a to i8*
+ %vld_dup = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
+ %1 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 0
+ %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
+ %2 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 1
+ %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
+ ret %struct.float32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld2q_dup_f64
+; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+ %0 = bitcast double* %a to i8*
+ %vld_dup = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
+ %1 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 0
+ %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 1
+ %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
+ ret %struct.float64x2x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld2_dup_s8
+; CHECK: ld2r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
+entry:
+ %vld_dup = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+ %0 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 0
+ %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
+ %1 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 1
+ %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
+ ret %struct.int8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld2_dup_s16
+; CHECK: ld2r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
+entry:
+ %0 = bitcast i16* %a to i8*
+ %vld_dup = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+ %1 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 0
+ %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
+ %2 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 1
+ %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
+ ret %struct.int16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld2_dup_s32
+; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+ %0 = bitcast i32* %a to i8*
+ %vld_dup = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
+ %1 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 0
+ %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 1
+ %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
+ ret %struct.int32x2x2_t %.fca.0.1.insert
+}
+
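+; <1 x i64> and <1 x double> vectors have only one lane, so there is
+; nothing to replicate: the *_dup variants for these types load the whole
+; structure with a plain LD1 instead of a replicating LDNR.
+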
+define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld2_dup_s64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+ %0 = bitcast i64* %a to i8*
+ %vld_dup = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %0, i32 8)
+ %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 0
+ %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 1
+ %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
+ ret %struct.int64x1x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld2_dup_f32
+; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+ %0 = bitcast float* %a to i8*
+ %vld_dup = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
+ %1 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 0
+ %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 1
+ %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
+ ret %struct.float32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld2_dup_f64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+ %0 = bitcast double* %a to i8*
+ %vld_dup = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %0, i32 8)
+ %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 0
+ %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 1
+ %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
+ ret %struct.float64x1x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x16x3_t @test_vld3q_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld3q_dup_s8
+; CHECK: ld3r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
+entry:
+ %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
+ %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
+ %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
+ %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
+ %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
+ %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
+ %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
+ ret %struct.int8x16x3_t %.fca.0.2.insert
+}
+
+define %struct.int16x8x3_t @test_vld3q_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld3q_dup_s16
+; CHECK: ld3r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
+entry:
+ %0 = bitcast i16* %a to i8*
+ %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
+ %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
+ %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
+ %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
+ %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
+ %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
+ %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
+ ret %struct.int16x8x3_t %.fca.0.2.insert
+}
+
+define %struct.int32x4x3_t @test_vld3q_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld3q_dup_s32
+; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+ %0 = bitcast i32* %a to i8*
+ %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
+ %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
+ %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
+ %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
+ %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
+ %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
+ %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
+ ret %struct.int32x4x3_t %.fca.0.2.insert
+}
+
+define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld3q_dup_s64
+; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+ %0 = bitcast i64* %a to i8*
+ %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
+ %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
+ %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
+ %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
+ %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
+ %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
+ ret %struct.int64x2x3_t %.fca.0.2.insert
+}
+
+define %struct.float32x4x3_t @test_vld3q_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld3q_dup_f32
+; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+ %0 = bitcast float* %a to i8*
+ %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
+ %1 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
+ %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
+ %2 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
+ %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
+ %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
+ %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
+ ret %struct.float32x4x3_t %.fca.0.2.insert
+}
+
+define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld3q_dup_f64
+; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+ %0 = bitcast double* %a to i8*
+ %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
+ %1 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
+ %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
+ %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
+ %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
+ %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
+ ret %struct.float64x2x3_t %.fca.0.2.insert
+}
+
+define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld3_dup_s8
+; CHECK: ld3r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
+entry:
+ %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+ %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
+ %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
+ %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
+ %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+ %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
+ %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
+ ret %struct.int8x8x3_t %.fca.0.2.insert
+}
+
+define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld3_dup_s16
+; CHECK: ld3r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
+entry:
+ %0 = bitcast i16* %a to i8*
+ %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+ %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
+ %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
+ %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
+ %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
+ %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
+ %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
+ ret %struct.int16x4x3_t %.fca.0.2.insert
+}
+
+define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld3_dup_s32
+; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+ %0 = bitcast i32* %a to i8*
+ %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
+ %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
+ %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
+ %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
+ %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
+ %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
+ ret %struct.int32x2x3_t %.fca.0.2.insert
+}
+
+define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld3_dup_s64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+ %0 = bitcast i64* %a to i8*
+ %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %0, i32 8)
+ %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
+ %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
+ %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
+ %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
+ ret %struct.int64x1x3_t %.fca.0.2.insert
+}
+
+define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld3_dup_f32
+; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+ %0 = bitcast float* %a to i8*
+ %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
+ %1 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
+ %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
+ %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
+ %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
+ %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
+ ret %struct.float32x2x3_t %.fca.0.2.insert
+}
+
+define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld3_dup_f64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+ %0 = bitcast double* %a to i8*
+ %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %0, i32 8)
+ %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
+ %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
+ %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
+ %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
+ ret %struct.float64x1x3_t %.fca.0.2.insert
+}
+
+define %struct.int8x16x4_t @test_vld4q_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld4q_dup_s8
+; CHECK: ld4r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
+entry:
+ %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
+ %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
+ %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
+ %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
+ %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
+ %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
+ %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
+ %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 3
+ %lane3 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %lane3, 0, 3
+ ret %struct.int8x16x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x8x4_t @test_vld4q_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld4q_dup_s16
+; CHECK: ld4r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
+entry:
+ %0 = bitcast i16* %a to i8*
+ %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
+ %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
+ %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
+ %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
+ %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
+ %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
+ %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
+ %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 3
+ %lane3 = shufflevector <8 x i16> %4, <8 x i16> undef, <8 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %lane3, 0, 3
+ ret %struct.int16x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x4x4_t @test_vld4q_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld4q_dup_s32
+; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+ %0 = bitcast i32* %a to i8*
+ %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
+ %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
+ %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
+ %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
+ %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
+ %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
+ %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
+ %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 3
+ %lane3 = shufflevector <4 x i32> %4, <4 x i32> undef, <4 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %lane3, 0, 3
+ ret %struct.int32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld4q_dup_s64
+; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+ %0 = bitcast i64* %a to i8*
+ %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
+ %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
+ %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
+ %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
+ %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
+ %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
+ %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 3
+ %lane3 = shufflevector <2 x i64> %4, <2 x i64> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %lane3, 0, 3
+ ret %struct.int64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x4x4_t @test_vld4q_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld4q_dup_f32
+; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+ %0 = bitcast float* %a to i8*
+ %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
+ %1 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
+ %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
+ %2 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
+ %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
+ %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
+ %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
+ %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 3
+ %lane3 = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %lane3, 0, 3
+ ret %struct.float32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld4q_dup_f64
+; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+ %0 = bitcast double* %a to i8*
+ %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
+ %1 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
+ %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
+ %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
+ %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
+ %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
+ %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 3
+ %lane3 = shufflevector <2 x double> %4, <2 x double> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %lane3, 0, 3
+ ret %struct.float64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld4_dup_s8
+; CHECK: ld4r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
+entry:
+ %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+ %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
+ %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
+ %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
+ %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+ %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
+ %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+ %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 3
+ %lane3 = shufflevector <8 x i8> %3, <8 x i8> undef, <8 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %lane3, 0, 3
+ ret %struct.int8x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld4_dup_s16
+; CHECK: ld4r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
+entry:
+ %0 = bitcast i16* %a to i8*
+ %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+ %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
+ %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
+ %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
+ %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
+ %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
+ %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
+ %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 3
+ %lane3 = shufflevector <4 x i16> %4, <4 x i16> undef, <4 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %lane3, 0, 3
+ ret %struct.int16x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld4_dup_s32
+; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+ %0 = bitcast i32* %a to i8*
+ %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
+ %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
+ %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
+ %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
+ %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
+ %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
+ %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 3
+ %lane3 = shufflevector <2 x i32> %4, <2 x i32> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %lane3, 0, 3
+ ret %struct.int32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld4_dup_s64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+ %0 = bitcast i64* %a to i8*
+ %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %0, i32 8)
+ %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
+ %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
+ %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
+ %vld_dup.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 3
+ %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld_dup.fca.3.extract, 0, 3
+ ret %struct.int64x1x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld4_dup_f32
+; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+ %0 = bitcast float* %a to i8*
+ %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
+ %1 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
+ %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
+ %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
+ %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
+ %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
+ %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 3
+ %lane3 = shufflevector <2 x float> %4, <2 x float> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %lane3, 0, 3
+ ret %struct.float32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld4_dup_f64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+ %0 = bitcast double* %a to i8*
+ %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %0, i32 8)
+ %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
+ %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
+ %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
+ %vld_dup.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 3
+ %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld_dup.fca.3.extract, 0, 3
+ ret %struct.float64x1x4_t %.fca.0.3.insert
+}
+
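+; The vld1_lane tests use a scalar load plus insertelement into a fixed
+; lane, which should select the lane-indexed LD1; the single-element
+; 64-bit cases keep the LD1R form, since lane 0 is the whole vector.
+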
+define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vld1q_lane_s8
+; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i8* %a, align 1
+ %vld1_lane = insertelement <16 x i8> %b, i8 %0, i32 15
+ ret <16 x i8> %vld1_lane
+}
+
+define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vld1q_lane_s16
+; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i16* %a, align 2
+ %vld1_lane = insertelement <8 x i16> %b, i16 %0, i32 7
+ ret <8 x i16> %vld1_lane
+}
+
+define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vld1q_lane_s32
+; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i32* %a, align 4
+ %vld1_lane = insertelement <4 x i32> %b, i32 %0, i32 3
+ ret <4 x i32> %vld1_lane
+}
+
+define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vld1q_lane_s64
+; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i64* %a, align 8
+ %vld1_lane = insertelement <2 x i64> %b, i64 %0, i32 1
+ ret <2 x i64> %vld1_lane
+}
+
+define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) {
+; CHECK-LABEL: test_vld1q_lane_f32
+; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %0 = load float* %a, align 4
+ %vld1_lane = insertelement <4 x float> %b, float %0, i32 3
+ ret <4 x float> %vld1_lane
+}
+
+define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) {
+; CHECK-LABEL: test_vld1q_lane_f64
+; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %0 = load double* %a, align 8
+ %vld1_lane = insertelement <2 x double> %b, double %0, i32 1
+ ret <2 x double> %vld1_lane
+}
+
+define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) {
+; CHECK-LABEL: test_vld1_lane_s8
+; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i8* %a, align 1
+ %vld1_lane = insertelement <8 x i8> %b, i8 %0, i32 7
+ ret <8 x i8> %vld1_lane
+}
+
+define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) {
+; CHECK-LABEL: test_vld1_lane_s16
+; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i16* %a, align 2
+ %vld1_lane = insertelement <4 x i16> %b, i16 %0, i32 3
+ ret <4 x i16> %vld1_lane
+}
+
+define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) {
+; CHECK-LABEL: test_vld1_lane_s32
+; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i32* %a, align 4
+ %vld1_lane = insertelement <2 x i32> %b, i32 %0, i32 1
+ ret <2 x i32> %vld1_lane
+}
+
+define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) {
+; CHECK-LABEL: test_vld1_lane_s64
+; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
+entry:
+ %0 = load i64* %a, align 8
+ %vld1_lane = insertelement <1 x i64> undef, i64 %0, i32 0
+ ret <1 x i64> %vld1_lane
+}
+
+define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) {
+; CHECK-LABEL: test_vld1_lane_f32
+; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %0 = load float* %a, align 4
+ %vld1_lane = insertelement <2 x float> %b, float %0, i32 1
+ ret <2 x float> %vld1_lane
+}
+
+define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) {
+; CHECK-LABEL: test_vld1_lane_f64
+; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
+entry:
+ %0 = load double* %a, align 8
+ %vld1_lane = insertelement <1 x double> undef, double %0, i32 0
+ ret <1 x double> %vld1_lane
+}
+
+define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_s16
+; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
+ %0 = bitcast i16* %a to i8*
+ %vld2_lane = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
+ %vld2_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.int16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_s32
+; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
+ %0 = bitcast i32* %a to i8*
+ %vld2_lane = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
+ %vld2_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.int32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_s64
+; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
+ %0 = bitcast i64* %a to i8*
+ %vld2_lane = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
+ %vld2_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.int64x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_f32
+; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
+ %0 = bitcast float* %a to i8*
+ %vld2_lane = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
+ %vld2_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.float32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_f64
+; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
+ %0 = bitcast double* %a to i8*
+ %vld2_lane = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
+ %vld2_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.float64x2x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_s8
+; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
+ %vld2_lane = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
+ %vld2_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.int8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_s16
+; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
+ %0 = bitcast i16* %a to i8*
+ %vld2_lane = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
+ %vld2_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.int16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_s32
+; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
+ %0 = bitcast i32* %a to i8*
+ %vld2_lane = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4)
+ %vld2_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.int32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_s64
+; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
+ %0 = bitcast i64* %a to i8*
+ %vld2_lane = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8)
+ %vld2_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.int64x1x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_f32
+; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
+ %0 = bitcast float* %a to i8*
+ %vld2_lane = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4)
+ %vld2_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.float32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_f64
+; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
+ %0 = bitcast double* %a to i8*
+ %vld2_lane = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8)
+ %vld2_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.float64x1x2_t %.fca.0.1.insert
+}
+
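+; vld3/vld3q lane tests: one lane of each of the three result registers is loaded from [x0]; codegen should select a single ld3 (single structure) instruction.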
+define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld3q_lane_s16
+; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
+ %0 = bitcast i16* %a to i8*
+ %vld3_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2)
+ %vld3_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.int16x8x3_t %.fca.0.2.insert
+}
+
+define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld3q_lane_s32
+; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
+ %0 = bitcast i32* %a to i8*
+ %vld3_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4)
+ %vld3_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.int32x4x3_t %.fca.0.2.insert
+}
+
+define %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld3q_lane_s64
+; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
+ %0 = bitcast i64* %a to i8*
+ %vld3_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8)
+ %vld3_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.int64x2x3_t %.fca.0.2.insert
+}
+
+define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld3q_lane_f32
+; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
+ %0 = bitcast float* %a to i8*
+ %vld3_lane = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4)
+ %vld3_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.float32x4x3_t %.fca.0.2.insert
+}
+
+define %struct.float64x2x3_t @test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld3q_lane_f64
+; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
+ %0 = bitcast double* %a to i8*
+ %vld3_lane = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8)
+ %vld3_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.float64x2x3_t %.fca.0.2.insert
+}
+
+define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_s8
+; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
+ %vld3_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1)
+ %vld3_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.int8x8x3_t %.fca.0.2.insert
+}
+
+define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_s16
+; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
+ %0 = bitcast i16* %a to i8*
+ %vld3_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2)
+ %vld3_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.int16x4x3_t %.fca.0.2.insert
+}
+
+define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_s32
+; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
+ %0 = bitcast i32* %a to i8*
+ %vld3_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4)
+ %vld3_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.int32x2x3_t %.fca.0.2.insert
+}
+
+define %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_s64
+; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
+ %0 = bitcast i64* %a to i8*
+ %vld3_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8)
+ %vld3_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.int64x1x3_t %.fca.0.2.insert
+}
+
+define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_f32
+; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
+ %0 = bitcast float* %a to i8*
+ %vld3_lane = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4)
+ %vld3_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.float32x2x3_t %.fca.0.2.insert
+}
+
+define %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_f64
+; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
+ %0 = bitcast double* %a to i8*
+ %vld3_lane = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8)
+ %vld3_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.float64x1x3_t %.fca.0.2.insert
+}
+
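+; vld4/vld4q lane tests: one lane of each of the four result registers is loaded from [x0]; codegen should select a single ld4 (single structure) instruction.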
+define %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_s8
+; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
+ %vld4_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 1)
+ %vld4_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.int8x16x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_s16
+; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
+ %0 = bitcast i16* %a to i8*
+ %vld4_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
+ %vld4_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.int16x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_s32
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
+ %0 = bitcast i32* %a to i8*
+ %vld4_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
+ %vld4_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.int32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_s64
+; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
+ %0 = bitcast i64* %a to i8*
+ %vld4_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
+ %vld4_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.int64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_f32
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
+ %0 = bitcast float* %a to i8*
+ %vld4_lane = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
+ %vld4_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.float32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_f64
+; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
+ %0 = bitcast double* %a to i8*
+ %vld4_lane = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
+ %vld4_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.float64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_s8
+; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
+ %vld4_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
+ %vld4_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.int8x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_s16
+; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
+ %0 = bitcast i16* %a to i8*
+ %vld4_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
+ %vld4_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.int16x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_s32
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
+ %0 = bitcast i32* %a to i8*
+ %vld4_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
+ %vld4_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.int32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_s64
+; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
+ %0 = bitcast i64* %a to i8*
+ %vld4_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8)
+ %vld4_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.int64x1x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_f32
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
+ %0 = bitcast float* %a to i8*
+ %vld4_lane = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
+ %vld4_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.float32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_f64
+; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
+ %0 = bitcast double* %a to i8*
+ %vld4_lane = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
+ %vld4_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 0
+ %vld4_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 1
+ %vld4_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 2
+ %vld4_lane.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld4_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld4_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld4_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld4_lane.fca.3.extract, 0, 3
+ ret %struct.float64x1x4_t %.fca.0.3.insert
+}
+
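+; st1 lane tests: storing a single extracted element should select st1 (single structure) with a lane index.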
+define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vst1q_lane_s8
+; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <16 x i8> %b, i32 15
+ store i8 %0, i8* %a, align 1
+ ret void
+}
+
+define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vst1q_lane_s16
+; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <8 x i16> %b, i32 7
+ store i16 %0, i16* %a, align 2
+ ret void
+}
+
+define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vst1q_lane_s32
+; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <4 x i32> %b, i32 3
+ store i32 %0, i32* %a, align 4
+ ret void
+}
+
+define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vst1q_lane_s64
+; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <2 x i64> %b, i32 1
+ store i64 %0, i64* %a, align 8
+ ret void
+}
+
+define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) {
+; CHECK-LABEL: test_vst1q_lane_f32
+; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <4 x float> %b, i32 3
+ store float %0, float* %a, align 4
+ ret void
+}
+
+define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) {
+; CHECK-LABEL: test_vst1q_lane_f64
+; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <2 x double> %b, i32 1
+ store double %0, double* %a, align 8
+ ret void
+}
+
+define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) {
+; CHECK-LABEL: test_vst1_lane_s8
+; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <8 x i8> %b, i32 7
+ store i8 %0, i8* %a, align 1
+ ret void
+}
+
+define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) {
+; CHECK-LABEL: test_vst1_lane_s16
+; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <4 x i16> %b, i32 3
+ store i16 %0, i16* %a, align 2
+ ret void
+}
+
+define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) {
+; CHECK-LABEL: test_vst1_lane_s32
+; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <2 x i32> %b, i32 1
+ store i32 %0, i32* %a, align 4
+ ret void
+}
+
+define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) {
+; CHECK-LABEL: test_vst1_lane_s64
+; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <1 x i64> %b, i32 0
+ store i64 %0, i64* %a, align 8
+ ret void
+}
+
+define void @test_vst1_lane_f32(float* %a, <2 x float> %b) {
+; CHECK-LABEL: test_vst1_lane_f32
+; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <2 x float> %b, i32 1
+ store float %0, float* %a, align 4
+ ret void
+}
+
+define void @test_vst1_lane_f64(double* %a, <1 x double> %b) {
+; CHECK-LABEL: test_vst1_lane_f64
+; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <1 x double> %b, i32 0
+ store double %0, double* %a, align 8
+ ret void
+}
+
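+; st2/st2q lane tests: one lane from each of the two source registers is stored with a single st2 (single structure) instruction.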
+define void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_s8
+; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1
+ tail call void @llvm.arm.neon.vst2lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 15, i32 1)
+ ret void
+}
+
+define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_s16
+; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
+ %0 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
+ ret void
+}
+
+define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_s32
+; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
+ %0 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
+ ret void
+}
+
+define void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_s64
+; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
+ %0 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
+ ret void
+}
+
+define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_f32
+; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
+ %0 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
+ ret void
+}
+
+define void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_f64
+; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
+ %0 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
+ ret void
+}
+
+define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_s8
+; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
+ tail call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
+ ret void
+}
+
+define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_s16
+; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
+ %0 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
+ ret void
+}
+
+define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_s32
+; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
+ %0 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4)
+ ret void
+}
+
+define void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_s64
+; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
+ %0 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8)
+ ret void
+}
+
+define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_f32
+; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
+ %0 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4)
+ ret void
+}
+
+define void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_f64
+; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
+ %0 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8)
+ ret void
+}
+
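+; st3/st3q lane tests: one lane from each of the three source registers is stored with a single st3 (single structure) instruction.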
+define void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_s8
+; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2
+ tail call void @llvm.arm.neon.vst3lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 15, i32 1)
+ ret void
+}
+
+define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_s16
+; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
+ %0 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2)
+ ret void
+}
+
+define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_s32
+; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
+ %0 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4)
+ ret void
+}
+
+define void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_s64
+; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
+ %0 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8)
+ ret void
+}
+
+define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_f32
+; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
+ %0 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4)
+ ret void
+}
+
+define void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_f64
+; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
+ %0 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8)
+ ret void
+}
+
+define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_s8
+; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
+ tail call void @llvm.arm.neon.vst3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1)
+ ret void
+}
+
+define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_s16
+; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
+ %0 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2)
+ ret void
+}
+
+define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_s32
+; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
+ %0 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4)
+ ret void
+}
+
+define void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_s64
+; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
+ %0 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8)
+ ret void
+}
+
+define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_f32
+; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
+ %0 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4)
+ ret void
+}
+
+define void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_f64
+; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
+ %0 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8)
+ ret void
+}
+
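+; st4/st4q lane tests: one lane from each of the four source registers is stored with a single st4 (single structure) instruction.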
+define void @test_vst4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_s8
+; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
+ tail call void @llvm.arm.neon.vst4lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 1)
+ ret void
+}
+
+define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_s16
+; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
+ %0 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
+ ret void
+}
+
+define void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_s32
+; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
+ %0 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
+ ret void
+}
+
+define void @test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_s64
+; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
+ %0 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
+ ret void
+}
+
+define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_f32
+; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
+ %0 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
+ ret void
+}
+
+define void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_f64
+; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
+ %0 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
+ ret void
+}
+
+define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_s8
+; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
+ tail call void @llvm.arm.neon.vst4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
+ ret void
+}
+
+define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_s16
+; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
+ %0 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
+ ret void
+}
+
+define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_s32
+; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
+ %0 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
+ ret void
+}
+
+define void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_s64
+; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
+ %0 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8)
+ ret void
+}
+
+define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_f32
+; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
+ %0 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
+ ret void
+}
+
+define void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_f64
+; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
+ %0 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
+ ret void
+}
+
+declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
+declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
+declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
+declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
+declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
+declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
+declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
+declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
+declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
+declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32)
+declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
+declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32)
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
+declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
+declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
+declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
+declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32)
+declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
+declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32)
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
+declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
+declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
+declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32)
+declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
+declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32)
+declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
+declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
+declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
+declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
+declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
+declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32) \ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-simd-ldst.ll b/test/CodeGen/AArch64/neon-simd-ldst.ll
new file mode 100644
index 0000000..afc0901
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-simd-ldst.ll
@@ -0,0 +1,164 @@
+; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define void @test_ldstq_4v(i8* noalias %io, i32 %count) {
+; CHECK-LABEL: test_ldstq_4v
+; CHECK: ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
+; CHECK: st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
+entry:
+ %tobool62 = icmp eq i32 %count, 0
+ br i1 %tobool62, label %while.end, label %while.body
+
+while.body: ; preds = %entry, %while.body
+ %count.addr.063 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
+ %dec = add i32 %count.addr.063, -1
+ %vld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8* %io, i32 1)
+ %vld4.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 0
+ %vld4.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 1
+ %vld4.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 2
+ %vld4.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 3
+ tail call void @llvm.arm.neon.vst4.v16i8(i8* %io, <16 x i8> %vld4.fca.0.extract, <16 x i8> %vld4.fca.1.extract, <16 x i8> %vld4.fca.2.extract, <16 x i8> %vld4.fca.3.extract, i32 1)
+ %tobool = icmp eq i32 %dec, 0
+ br i1 %tobool, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32)
+
+declare void @llvm.arm.neon.vst4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32)
+
+define void @test_ldstq_3v(i8* noalias %io, i32 %count) {
+; CHECK-LABEL: test_ldstq_3v
+; CHECK: ld3 {v0.16b, v1.16b, v2.16b}, [x0]
+; CHECK: st3 {v0.16b, v1.16b, v2.16b}, [x0]
+entry:
+ %tobool47 = icmp eq i32 %count, 0
+ br i1 %tobool47, label %while.end, label %while.body
+
+while.body: ; preds = %entry, %while.body
+ %count.addr.048 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
+ %dec = add i32 %count.addr.048, -1
+ %vld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %io, i32 1)
+ %vld3.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 0
+ %vld3.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 1
+ %vld3.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 2
+ tail call void @llvm.arm.neon.vst3.v16i8(i8* %io, <16 x i8> %vld3.fca.0.extract, <16 x i8> %vld3.fca.1.extract, <16 x i8> %vld3.fca.2.extract, i32 1)
+ %tobool = icmp eq i32 %dec, 0
+ br i1 %tobool, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32)
+
+declare void @llvm.arm.neon.vst3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32)
+
+define void @test_ldstq_2v(i8* noalias %io, i32 %count) {
+; CHECK-LABEL: test_ldstq_2v
+; CHECK: ld2 {v0.16b, v1.16b}, [x0]
+; CHECK: st2 {v0.16b, v1.16b}, [x0]
+entry:
+ %tobool22 = icmp eq i32 %count, 0
+ br i1 %tobool22, label %while.end, label %while.body
+
+while.body: ; preds = %entry, %while.body
+ %count.addr.023 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
+ %dec = add i32 %count.addr.023, -1
+ %vld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8* %io, i32 1)
+ %vld2.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 0
+ %vld2.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 1
+ tail call void @llvm.arm.neon.vst2.v16i8(i8* %io, <16 x i8> %vld2.fca.0.extract, <16 x i8> %vld2.fca.1.extract, i32 1)
+ %tobool = icmp eq i32 %dec, 0
+ br i1 %tobool, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8*, i32)
+
+declare void @llvm.arm.neon.vst2.v16i8(i8*, <16 x i8>, <16 x i8>, i32)
+
+define void @test_ldst_4v(i8* noalias %io, i32 %count) {
+; CHECK-LABEL: test_ldst_4v
+; CHECK: ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0]
+; CHECK: st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0]
+entry:
+ %tobool42 = icmp eq i32 %count, 0
+ br i1 %tobool42, label %while.end, label %while.body
+
+while.body: ; preds = %entry, %while.body
+ %count.addr.043 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
+ %dec = add i32 %count.addr.043, -1
+ %vld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8* %io, i32 1)
+ %vld4.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 0
+ %vld4.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 1
+ %vld4.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 2
+ %vld4.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 3
+ tail call void @llvm.arm.neon.vst4.v8i8(i8* %io, <8 x i8> %vld4.fca.0.extract, <8 x i8> %vld4.fca.1.extract, <8 x i8> %vld4.fca.2.extract, <8 x i8> %vld4.fca.3.extract, i32 1)
+ %tobool = icmp eq i32 %dec, 0
+ br i1 %tobool, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8*, i32)
+
+declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32)
+
+define void @test_ldst_3v(i8* noalias %io, i32 %count) {
+; CHECK-LABEL: test_ldst_3v
+; CHECK: ld3 {v0.8b, v1.8b, v2.8b}, [x0]
+; CHECK: st3 {v0.8b, v1.8b, v2.8b}, [x0]
+entry:
+ %tobool32 = icmp eq i32 %count, 0
+ br i1 %tobool32, label %while.end, label %while.body
+
+while.body: ; preds = %entry, %while.body
+ %count.addr.033 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
+ %dec = add i32 %count.addr.033, -1
+ %vld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8* %io, i32 1)
+ %vld3.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 0
+ %vld3.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 1
+ %vld3.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 2
+ tail call void @llvm.arm.neon.vst3.v8i8(i8* %io, <8 x i8> %vld3.fca.0.extract, <8 x i8> %vld3.fca.1.extract, <8 x i8> %vld3.fca.2.extract, i32 1)
+ %tobool = icmp eq i32 %dec, 0
+ br i1 %tobool, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8*, i32)
+
+declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
+
+define void @test_ldst_2v(i8* noalias %io, i32 %count) {
+; CHECK-LABEL: test_ldst_2v
+; CHECK: ld2 {v0.8b, v1.8b}, [x0]
+; CHECK: st2 {v0.8b, v1.8b}, [x0]
+entry:
+ %tobool22 = icmp eq i32 %count, 0
+ br i1 %tobool22, label %while.end, label %while.body
+
+while.body: ; preds = %entry, %while.body
+ %count.addr.023 = phi i32 [ %dec, %while.body ], [ %count, %entry ]
+ %dec = add i32 %count.addr.023, -1
+ %vld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8* %io, i32 1)
+ %vld2.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 0
+ %vld2.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 1
+ tail call void @llvm.arm.neon.vst2.v8i8(i8* %io, <8 x i8> %vld2.fca.0.extract, <8 x i8> %vld2.fca.1.extract, i32 1)
+ %tobool = icmp eq i32 %dec, 0
+ br i1 %tobool, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8*, i32)
+
+declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32)
+
diff --git a/test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll b/test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll
new file mode 100644
index 0000000..156fe1d
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll
@@ -0,0 +1,354 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+; Check for a post-increment updating load.
+define <4 x i16> @test_vld1_fx_update(i16** %ptr) nounwind {
+; CHECK: test_vld1_fx_update
+; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}], #8
+ %A = load i16** %ptr
+ %tmp0 = bitcast i16* %A to i8*
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %tmp0, i32 2)
+ %tmp2 = getelementptr i16* %A, i32 4
+ store i16* %tmp2, i16** %ptr
+ ret <4 x i16> %tmp1
+}
+
+; Check for a post-increment updating load with register increment.
+define <2 x i32> @test_vld1_reg_update(i32** %ptr, i32 %inc) nounwind {
+; CHECK: test_vld1_reg_update
+; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %A = load i32** %ptr
+ %tmp0 = bitcast i32* %A to i8*
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %tmp0, i32 4)
+ %tmp2 = getelementptr i32* %A, i32 %inc
+ store i32* %tmp2, i32** %ptr
+ ret <2 x i32> %tmp1
+}
+
+define <2 x float> @test_vld2_fx_update(float** %ptr) nounwind {
+; CHECK: test_vld2_fx_update
+; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}], #16
+ %A = load float** %ptr
+ %tmp0 = bitcast float* %A to i8*
+ %tmp1 = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8* %tmp0, i32 4)
+ %tmp2 = extractvalue { <2 x float>, <2 x float> } %tmp1, 0
+ %tmp3 = getelementptr float* %A, i32 4
+ store float* %tmp3, float** %ptr
+ ret <2 x float> %tmp2
+}
+
+define <16 x i8> @test_vld2_reg_update(i8** %ptr, i32 %inc) nounwind {
+; CHECK: test_vld2_reg_update
+; CHECK: ld2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %A = load i8** %ptr
+ %tmp0 = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8* %A, i32 1)
+ %tmp1 = extractvalue { <16 x i8>, <16 x i8> } %tmp0, 0
+ %tmp2 = getelementptr i8* %A, i32 %inc
+ store i8* %tmp2, i8** %ptr
+ ret <16 x i8> %tmp1
+}
+
+define <4 x i32> @test_vld3_fx_update(i32** %ptr) nounwind {
+; CHECK: test_vld3_fx_update
+; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}], #48
+ %A = load i32** %ptr
+ %tmp0 = bitcast i32* %A to i8*
+ %tmp1 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8* %tmp0, i32 4)
+ %tmp2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %tmp1, 0
+ %tmp3 = getelementptr i32* %A, i32 12
+ store i32* %tmp3, i32** %ptr
+ ret <4 x i32> %tmp2
+}
+
+define <4 x i16> @test_vld3_reg_update(i16** %ptr, i32 %inc) nounwind {
+; CHECK: test_vld3_reg_update
+; CHECK: ld3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %A = load i16** %ptr
+ %tmp0 = bitcast i16* %A to i8*
+ %tmp1 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8* %tmp0, i32 2)
+ %tmp2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %tmp1, 0
+ %tmp3 = getelementptr i16* %A, i32 %inc
+ store i16* %tmp3, i16** %ptr
+ ret <4 x i16> %tmp2
+}
+
+define <8 x i16> @test_vld4_fx_update(i16** %ptr) nounwind {
+; CHECK: test_vld4_fx_update
+; CHECK: ld4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}], #64
+ %A = load i16** %ptr
+ %tmp0 = bitcast i16* %A to i8*
+ %tmp1 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8* %tmp0, i32 8)
+ %tmp2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %tmp1, 0
+ %tmp3 = getelementptr i16* %A, i32 32
+ store i16* %tmp3, i16** %ptr
+ ret <8 x i16> %tmp2
+}
+
+define <8 x i8> @test_vld4_reg_update(i8** %ptr, i32 %inc) nounwind {
+; CHECK: test_vld4_reg_update
+; CHECK: ld4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %A = load i8** %ptr
+ %tmp0 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8* %A, i32 1)
+ %tmp1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %tmp0, 0
+ %tmp2 = getelementptr i8* %A, i32 %inc
+ store i8* %tmp2, i8** %ptr
+ ret <8 x i8> %tmp1
+}
+
+define void @test_vst1_fx_update(float** %ptr, <2 x float> %B) nounwind {
+; CHECK: test_vst1_fx_update
+; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}], #8
+ %A = load float** %ptr
+ %tmp0 = bitcast float* %A to i8*
+ call void @llvm.arm.neon.vst1.v2f32(i8* %tmp0, <2 x float> %B, i32 4)
+ %tmp2 = getelementptr float* %A, i32 2
+ store float* %tmp2, float** %ptr
+ ret void
+}
+
+define void @test_vst1_reg_update(i16** %ptr, <8 x i16> %B, i32 %inc) nounwind {
+; CHECK: test_vst1_reg_update
+; CHECK: st1 {v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}], x{{[0-9]+}}
+ %A = load i16** %ptr
+ %tmp0 = bitcast i16* %A to i8*
+ call void @llvm.arm.neon.vst1.v8i16(i8* %tmp0, <8 x i16> %B, i32 2)
+ %tmp1 = getelementptr i16* %A, i32 %inc
+ store i16* %tmp1, i16** %ptr
+ ret void
+}
+
+define void @test_vst2_fx_update(i64** %ptr, <1 x i64> %B) nounwind {
+; CHECK: test_vst2_fx_update
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}], #16
+ %A = load i64** %ptr
+ %tmp0 = bitcast i64* %A to i8*
+ call void @llvm.arm.neon.vst2.v1i64(i8* %tmp0, <1 x i64> %B, <1 x i64> %B, i32 8)
+ %tmp1 = getelementptr i64* %A, i32 2
+ store i64* %tmp1, i64** %ptr
+ ret void
+}
+
+define void @test_vst2_reg_update(i8** %ptr, <8 x i8> %B, i32 %inc) nounwind {
+; CHECK: test_vst2_reg_update
+; CHECK: st2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}], x{{[0-9]+}}
+ %A = load i8** %ptr
+ call void @llvm.arm.neon.vst2.v8i8(i8* %A, <8 x i8> %B, <8 x i8> %B, i32 4)
+ %tmp0 = getelementptr i8* %A, i32 %inc
+ store i8* %tmp0, i8** %ptr
+ ret void
+}
+
+define void @test_vst3_fx_update(i32** %ptr, <2 x i32> %B) nounwind {
+; CHECK: test_vst3_fx_update
+; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}], #24
+ %A = load i32** %ptr
+ %tmp0 = bitcast i32* %A to i8*
+ call void @llvm.arm.neon.vst3.v2i32(i8* %tmp0, <2 x i32> %B, <2 x i32> %B, <2 x i32> %B, i32 4)
+ %tmp1 = getelementptr i32* %A, i32 6
+ store i32* %tmp1, i32** %ptr
+ ret void
+}
+
+define void @test_vst3_reg_update(i16** %ptr, <8 x i16> %B, i32 %inc) nounwind {
+; CHECK: test_vst3_reg_update
+; CHECK: st3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}], x{{[0-9]+}}
+ %A = load i16** %ptr
+ %tmp0 = bitcast i16* %A to i8*
+ call void @llvm.arm.neon.vst3.v8i16(i8* %tmp0, <8 x i16> %B, <8 x i16> %B, <8 x i16> %B, i32 2)
+ %tmp1 = getelementptr i16* %A, i32 %inc
+ store i16* %tmp1, i16** %ptr
+ ret void
+}
+
+define void @test_vst4_fx_update(float** %ptr, <4 x float> %B) nounwind {
+; CHECK: test_vst4_fx_update
+; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}], #64
+ %A = load float** %ptr
+ %tmp0 = bitcast float* %A to i8*
+ call void @llvm.arm.neon.vst4.v4f32(i8* %tmp0, <4 x float> %B, <4 x float> %B, <4 x float> %B, <4 x float> %B, i32 4)
+ %tmp1 = getelementptr float* %A, i32 16
+ store float* %tmp1, float** %ptr
+ ret void
+}
+
+define void @test_vst4_reg_update(i8** %ptr, <8 x i8> %B, i32 %inc) nounwind {
+; CHECK: test_vst4_reg_update
+; CHECK: st4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}], x{{[0-9]+}}
+ %A = load i8** %ptr
+ call void @llvm.arm.neon.vst4.v8i8(i8* %A, <8 x i8> %B, <8 x i8> %B, <8 x i8> %B, <8 x i8> %B, i32 1)
+ %tmp0 = getelementptr i8* %A, i32 %inc
+ store i8* %tmp0, i8** %ptr
+ ret void
+}
+
+
+declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*, i32)
+declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32)
+declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8*, i32)
+declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8*, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8*, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8*, i32)
+declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8*, i32)
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8*, i32)
+
+declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32)
+declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32)
+declare void @llvm.arm.neon.vst2.v1i64(i8*, <1 x i64>, <1 x i64>, i32)
+declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32)
+declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32)
+declare void @llvm.arm.neon.vst3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32)
+declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32)
+declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32)
+
+define <16 x i8> @test_vld1x2_fx_update(i8* %a, i8** %ptr) {
+; CHECK: test_vld1x2_fx_update
+; CHECK: ld1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}], #32
+ %1 = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8* %a, i32 1)
+ %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0
+ %tmp1 = getelementptr i8* %a, i32 32
+ store i8* %tmp1, i8** %ptr
+ ret <16 x i8> %2
+}
+
+define <8 x i16> @test_vld1x2_reg_update(i16* %a, i16** %ptr, i32 %inc) {
+; CHECK: test_vld1x2_reg_update
+; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = bitcast i16* %a to i8*
+ %2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8* %1, i32 2)
+ %3 = extractvalue { <8 x i16>, <8 x i16> } %2, 0
+ %tmp1 = getelementptr i16* %a, i32 %inc
+ store i16* %tmp1, i16** %ptr
+ ret <8 x i16> %3
+}
+
+define <2 x i64> @test_vld1x3_fx_update(i64* %a, i64** %ptr) {
+; CHECK: test_vld1x3_fx_update
+; CHECK: ld1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}], #48
+ %1 = bitcast i64* %a to i8*
+ %2 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8* %1, i32 8)
+ %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %2, 0
+ %tmp1 = getelementptr i64* %a, i32 6
+ store i64* %tmp1, i64** %ptr
+ ret <2 x i64> %3
+}
+
+define <8 x i16> @test_vld1x3_reg_update(i16* %a, i16** %ptr, i32 %inc) {
+; CHECK: test_vld1x3_reg_update
+; CHECK: ld1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = bitcast i16* %a to i8*
+ %2 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8* %1, i32 2)
+ %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %2, 0
+ %tmp1 = getelementptr i16* %a, i32 %inc
+ store i16* %tmp1, i16** %ptr
+ ret <8 x i16> %3
+}
+
+define <4 x float> @test_vld1x4_fx_update(float* %a, float** %ptr) {
+; CHECK: test_vld1x4_fx_update
+; CHECK: ld1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}], #64
+ %1 = bitcast float* %a to i8*
+ %2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8* %1, i32 4)
+ %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 0
+ %tmp1 = getelementptr float* %a, i32 16
+ store float* %tmp1, float** %ptr
+ ret <4 x float> %3
+}
+
+define <8 x i8> @test_vld1x4_reg_update(i8* readonly %a, i8** %ptr, i32 %inc) #0 {
+; CHECK: test_vld1x4_reg_update
+; CHECK: ld1 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8* %a, i32 1)
+ %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
+ %tmp1 = getelementptr i8* %a, i32 %inc
+ store i8* %tmp1, i8** %ptr
+ ret <8 x i8> %2
+}
+
+define void @test_vst1x2_fx_update(i8* %a, [2 x <16 x i8>] %b.coerce, i8** %ptr) #2 {
+; CHECK: test_vst1x2_fx_update
+; CHECK: st1 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}], #32
+ %1 = extractvalue [2 x <16 x i8>] %b.coerce, 0
+ %2 = extractvalue [2 x <16 x i8>] %b.coerce, 1
+ tail call void @llvm.aarch64.neon.vst1x2.v16i8(i8* %a, <16 x i8> %1, <16 x i8> %2, i32 1)
+ %tmp1 = getelementptr i8* %a, i32 32
+ store i8* %tmp1, i8** %ptr
+ ret void
+}
+
+define void @test_vst1x2_reg_update(i16* %a, [2 x <8 x i16>] %b.coerce, i16** %ptr, i32 %inc) #2 {
+; CHECK: test_vst1x2_reg_update
+; CHECK: st1 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = extractvalue [2 x <8 x i16>] %b.coerce, 0
+ %2 = extractvalue [2 x <8 x i16>] %b.coerce, 1
+ %3 = bitcast i16* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x2.v8i16(i8* %3, <8 x i16> %1, <8 x i16> %2, i32 2)
+ %tmp1 = getelementptr i16* %a, i32 %inc
+ store i16* %tmp1, i16** %ptr
+ ret void
+}
+
+define void @test_vst1x3_fx_update(i32* %a, [3 x <2 x i32>] %b.coerce, i32** %ptr) #2 {
+; CHECK: test_vst1x3_fx_update
+; CHECK: st1 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}], #24
+ %1 = extractvalue [3 x <2 x i32>] %b.coerce, 0
+ %2 = extractvalue [3 x <2 x i32>] %b.coerce, 1
+ %3 = extractvalue [3 x <2 x i32>] %b.coerce, 2
+ %4 = bitcast i32* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x3.v2i32(i8* %4, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, i32 4)
+ %tmp1 = getelementptr i32* %a, i32 6
+ store i32* %tmp1, i32** %ptr
+ ret void
+}
+
+define void @test_vst1x3_reg_update(i64* %a, [3 x <1 x i64>] %b.coerce, i64** %ptr, i32 %inc) #2 {
+; CHECK: test_vst1x3_reg_update
+; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = extractvalue [3 x <1 x i64>] %b.coerce, 0
+ %2 = extractvalue [3 x <1 x i64>] %b.coerce, 1
+ %3 = extractvalue [3 x <1 x i64>] %b.coerce, 2
+ %4 = bitcast i64* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x3.v1i64(i8* %4, <1 x i64> %1, <1 x i64> %2, <1 x i64> %3, i32 8)
+ %tmp1 = getelementptr i64* %a, i32 %inc
+ store i64* %tmp1, i64** %ptr
+ ret void
+}
+
+define void @test_vst1x4_fx_update(float* %a, [4 x <4 x float>] %b.coerce, float** %ptr) #2 {
+; CHECK: test_vst1x4_fx_update
+; CHECK: st1 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}], #64
+ %1 = extractvalue [4 x <4 x float>] %b.coerce, 0
+ %2 = extractvalue [4 x <4 x float>] %b.coerce, 1
+ %3 = extractvalue [4 x <4 x float>] %b.coerce, 2
+ %4 = extractvalue [4 x <4 x float>] %b.coerce, 3
+ %5 = bitcast float* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x4.v4f32(i8* %5, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, i32 4)
+ %tmp1 = getelementptr float* %a, i32 16
+ store float* %tmp1, float** %ptr
+ ret void
+}
+
+define void @test_vst1x4_reg_update(double* %a, [4 x <2 x double>] %b.coerce, double** %ptr, i32 %inc) #2 {
+; CHECK: test_vst1x4_reg_update
+; CHECK: st1 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = extractvalue [4 x <2 x double>] %b.coerce, 0
+ %2 = extractvalue [4 x <2 x double>] %b.coerce, 1
+ %3 = extractvalue [4 x <2 x double>] %b.coerce, 2
+ %4 = extractvalue [4 x <2 x double>] %b.coerce, 3
+ %5 = bitcast double* %a to i8*
+ tail call void @llvm.aarch64.neon.vst1x4.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 8)
+ %tmp1 = getelementptr double* %a, i32 %inc
+ store double* %tmp1, double** %ptr
+ ret void
+}
+
+declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.vld1x2.v16i8(i8*, i32)
+declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x2.v8i16(i8*, i32)
+declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.vld1x3.v2i64(i8*, i32)
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.vld1x3.v8i16(i8*, i32)
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.vld1x4.v4f32(i8*, i32)
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.vld1x4.v8i8(i8*, i32)
+declare void @llvm.aarch64.neon.vst1x2.v16i8(i8*, <16 x i8>, <16 x i8>, i32)
+declare void @llvm.aarch64.neon.vst1x2.v8i16(i8*, <8 x i16>, <8 x i16>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32)
+declare void @llvm.aarch64.neon.vst1x3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32)
+declare void @llvm.aarch64.neon.vst1x4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) #3
+declare void @llvm.aarch64.neon.vst1x4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32) #3
diff --git a/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll b/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll
new file mode 100644
index 0000000..80a9347
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll
@@ -0,0 +1,319 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define { [2 x <16 x i8>] } @test_vld2q_dup_fx_update(i8* %a, i8** %ptr) {
+; CHECK-LABEL: test_vld2q_dup_fx_update
+; CHECK: ld2r {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}], #2
+ %1 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
+ %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0
+ %3 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
+ %4 = extractvalue { <16 x i8>, <16 x i8> } %1, 1
+ %5 = shufflevector <16 x i8> %4, <16 x i8> undef, <16 x i32> zeroinitializer
+ %6 = insertvalue { [2 x <16 x i8>] } undef, <16 x i8> %3, 0, 0
+ %7 = insertvalue { [2 x <16 x i8>] } %6, <16 x i8> %5, 0, 1
+ %tmp1 = getelementptr i8* %a, i32 2
+ store i8* %tmp1, i8** %ptr
+ ret { [2 x <16 x i8>] } %7
+}
+
+define { [2 x <4 x i32>] } @test_vld2q_dup_reg_update(i32* %a, i32** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vld2q_dup_reg_update
+; CHECK: ld2r {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = bitcast i32* %a to i8*
+ %2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %1, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
+ %3 = extractvalue { <4 x i32>, <4 x i32> } %2, 0
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
+ %5 = extractvalue { <4 x i32>, <4 x i32> } %2, 1
+ %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> zeroinitializer
+ %7 = insertvalue { [2 x <4 x i32>] } undef, <4 x i32> %4, 0, 0
+ %8 = insertvalue { [2 x <4 x i32>] } %7, <4 x i32> %6, 0, 1
+ %tmp1 = getelementptr i32* %a, i32 %inc
+ store i32* %tmp1, i32** %ptr
+ ret { [2 x <4 x i32>] } %8
+}
+
+define { [3 x <4 x i16>] } @test_vld3_dup_fx_update(i16* %a, i16** %ptr) {
+; CHECK-LABEL: test_vld3_dup_fx_update
+; CHECK: ld3r {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}], #6
+ %1 = bitcast i16* %a to i8*
+ %2 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %1, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+ %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 0
+ %4 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
+ %5 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 1
+ %6 = shufflevector <4 x i16> %5, <4 x i16> undef, <4 x i32> zeroinitializer
+ %7 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 2
+ %8 = shufflevector <4 x i16> %7, <4 x i16> undef, <4 x i32> zeroinitializer
+ %9 = insertvalue { [3 x <4 x i16>] } undef, <4 x i16> %4, 0, 0
+ %10 = insertvalue { [3 x <4 x i16>] } %9, <4 x i16> %6, 0, 1
+ %11 = insertvalue { [3 x <4 x i16>] } %10, <4 x i16> %8, 0, 2
+ %tmp1 = getelementptr i16* %a, i32 3
+ store i16* %tmp1, i16** %ptr
+ ret { [3 x <4 x i16>] } %11
+}
+
+define { [3 x <8 x i8>] } @test_vld3_dup_reg_update(i8* %a, i8** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vld3_dup_reg_update
+; CHECK: ld3r {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+ %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
+ %3 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+ %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 1
+ %5 = shufflevector <8 x i8> %4, <8 x i8> undef, <8 x i32> zeroinitializer
+ %6 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 2
+ %7 = shufflevector <8 x i8> %6, <8 x i8> undef, <8 x i32> zeroinitializer
+ %8 = insertvalue { [3 x <8 x i8>] } undef, <8 x i8> %3, 0, 0
+ %9 = insertvalue { [3 x <8 x i8>] } %8, <8 x i8> %5, 0, 1
+ %10 = insertvalue { [3 x <8 x i8>] } %9, <8 x i8> %7, 0, 2
+ %tmp1 = getelementptr i8* %a, i32 %inc
+ store i8* %tmp1, i8** %ptr
+ ret { [3 x <8 x i8>] } %10
+}
+
+define { [4 x <2 x i32>] } @test_vld4_dup_fx_update(i32* %a, i32** %ptr) #0 {
+; CHECK-LABEL: test_vld4_dup_fx_update
+; CHECK: ld4r {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}], #16
+ %1 = bitcast i32* %a to i8*
+ %2 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %1, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
+ %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 0
+ %4 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
+ %5 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 1
+ %6 = shufflevector <2 x i32> %5, <2 x i32> undef, <2 x i32> zeroinitializer
+ %7 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 2
+ %8 = shufflevector <2 x i32> %7, <2 x i32> undef, <2 x i32> zeroinitializer
+ %9 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 3
+ %10 = shufflevector <2 x i32> %9, <2 x i32> undef, <2 x i32> zeroinitializer
+ %11 = insertvalue { [4 x <2 x i32>] } undef, <2 x i32> %4, 0, 0
+ %12 = insertvalue { [4 x <2 x i32>] } %11, <2 x i32> %6, 0, 1
+ %13 = insertvalue { [4 x <2 x i32>] } %12, <2 x i32> %8, 0, 2
+ %14 = insertvalue { [4 x <2 x i32>] } %13, <2 x i32> %10, 0, 3
+ %tmp1 = getelementptr i32* %a, i32 4
+ store i32* %tmp1, i32** %ptr
+ ret { [4 x <2 x i32>] } %14
+}
+
+define { [4 x <2 x double>] } @test_vld4_dup_reg_update(double* %a, double** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vld4_dup_reg_update
+; CHECK: ld4r {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = bitcast double* %a to i8*
+ %2 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %1, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
+ %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 0
+ %4 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
+ %5 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 1
+ %6 = shufflevector <2 x double> %5, <2 x double> undef, <2 x i32> zeroinitializer
+ %7 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 2
+ %8 = shufflevector <2 x double> %7, <2 x double> undef, <2 x i32> zeroinitializer
+ %9 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 3
+ %10 = shufflevector <2 x double> %9, <2 x double> undef, <2 x i32> zeroinitializer
+ %11 = insertvalue { [4 x <2 x double>] } undef, <2 x double> %4, 0, 0
+ %12 = insertvalue { [4 x <2 x double>] } %11, <2 x double> %6, 0, 1
+ %13 = insertvalue { [4 x <2 x double>] } %12, <2 x double> %8, 0, 2
+ %14 = insertvalue { [4 x <2 x double>] } %13, <2 x double> %10, 0, 3
+ %tmp1 = getelementptr double* %a, i32 %inc
+ store double* %tmp1, double** %ptr
+ ret { [4 x <2 x double>] } %14
+}
+
+define { [2 x <8 x i8>] } @test_vld2_lane_fx_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr) {
+; CHECK-LABEL: test_vld2_lane_fx_update
+; CHECK: ld2 {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[7], [x{{[0-9]+|sp}}], #2
+ %1 = extractvalue [2 x <8 x i8>] %b, 0
+ %2 = extractvalue [2 x <8 x i8>] %b, 1
+ %3 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1)
+ %4 = extractvalue { <8 x i8>, <8 x i8> } %3, 0
+ %5 = extractvalue { <8 x i8>, <8 x i8> } %3, 1
+ %6 = insertvalue { [2 x <8 x i8>] } undef, <8 x i8> %4, 0, 0
+ %7 = insertvalue { [2 x <8 x i8>] } %6, <8 x i8> %5, 0, 1
+ %tmp1 = getelementptr i8* %a, i32 2
+ store i8* %tmp1, i8** %ptr
+ ret { [2 x <8 x i8>] } %7
+}
+
+define { [2 x <8 x i8>] } @test_vld2_lane_reg_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vld2_lane_reg_update
+; CHECK: ld2 {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[6], [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = extractvalue [2 x <8 x i8>] %b, 0
+ %2 = extractvalue [2 x <8 x i8>] %b, 1
+ %3 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 6, i32 1)
+ %4 = extractvalue { <8 x i8>, <8 x i8> } %3, 0
+ %5 = extractvalue { <8 x i8>, <8 x i8> } %3, 1
+ %6 = insertvalue { [2 x <8 x i8>] } undef, <8 x i8> %4, 0, 0
+ %7 = insertvalue { [2 x <8 x i8>] } %6, <8 x i8> %5, 0, 1
+ %tmp1 = getelementptr i8* %a, i32 %inc
+ store i8* %tmp1, i8** %ptr
+ ret { [2 x <8 x i8>] } %7
+}
+
+define { [3 x <2 x float>] } @test_vld3_lane_fx_update(float* %a, [3 x <2 x float>] %b, float** %ptr) {
+; CHECK-LABEL: test_vld3_lane_fx_update
+; CHECK: ld3 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], #12
+ %1 = extractvalue [3 x <2 x float>] %b, 0
+ %2 = extractvalue [3 x <2 x float>] %b, 1
+ %3 = extractvalue [3 x <2 x float>] %b, 2
+ %4 = bitcast float* %a to i8*
+ %5 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %4, <2 x float> %1, <2 x float> %2, <2 x float> %3, i32 1, i32 4)
+ %6 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 0
+ %7 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 1
+ %8 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 2
+ %9 = insertvalue { [3 x <2 x float>] } undef, <2 x float> %6, 0, 0
+ %10 = insertvalue { [3 x <2 x float>] } %9, <2 x float> %7, 0, 1
+ %11 = insertvalue { [3 x <2 x float>] } %10, <2 x float> %8, 0, 2
+ %tmp1 = getelementptr float* %a, i32 3
+ store float* %tmp1, float** %ptr
+ ret { [3 x <2 x float>] } %11
+}
+
+define { [3 x <4 x i16>] } @test_vld3_lane_reg_update(i16* %a, [3 x <4 x i16>] %b, i16** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vld3_lane_reg_update
+; CHECK: ld3 {v{{[0-9]+}}.h, v{{[0-9]+}}.h, v{{[0-9]+}}.h}[3], [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = extractvalue [3 x <4 x i16>] %b, 0
+ %2 = extractvalue [3 x <4 x i16>] %b, 1
+ %3 = extractvalue [3 x <4 x i16>] %b, 2
+ %4 = bitcast i16* %a to i8*
+ %5 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2)
+ %6 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 0
+ %7 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 1
+ %8 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 2
+ %9 = insertvalue { [3 x <4 x i16>] } undef, <4 x i16> %6, 0, 0
+ %10 = insertvalue { [3 x <4 x i16>] } %9, <4 x i16> %7, 0, 1
+ %11 = insertvalue { [3 x <4 x i16>] } %10, <4 x i16> %8, 0, 2
+ %tmp1 = getelementptr i16* %a, i32 %inc
+ store i16* %tmp1, i16** %ptr
+ ret { [3 x <4 x i16>] } %11
+}
+
+define { [4 x <2 x i32>] } @test_vld4_lane_fx_update(i32* readonly %a, [4 x <2 x i32>] %b, i32** %ptr) {
+; CHECK-LABEL: test_vld4_lane_fx_update
+; CHECK: ld4 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], #16
+ %1 = extractvalue [4 x <2 x i32>] %b, 0
+ %2 = extractvalue [4 x <2 x i32>] %b, 1
+ %3 = extractvalue [4 x <2 x i32>] %b, 2
+ %4 = extractvalue [4 x <2 x i32>] %b, 3
+ %5 = bitcast i32* %a to i8*
+ %6 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %5, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, <2 x i32> %4, i32 1, i32 4)
+ %7 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 0
+ %8 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 1
+ %9 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 2
+ %10 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 3
+ %11 = insertvalue { [4 x <2 x i32>] } undef, <2 x i32> %7, 0, 0
+ %12 = insertvalue { [4 x <2 x i32>] } %11, <2 x i32> %8, 0, 1
+ %13 = insertvalue { [4 x <2 x i32>] } %12, <2 x i32> %9, 0, 2
+ %14 = insertvalue { [4 x <2 x i32>] } %13, <2 x i32> %10, 0, 3
+ %tmp1 = getelementptr i32* %a, i32 4
+ store i32* %tmp1, i32** %ptr
+ ret { [4 x <2 x i32>] } %14
+}
+
+define { [4 x <2 x double>] } @test_vld4_lane_reg_update(double* readonly %a, [4 x <2 x double>] %b, double** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vld4_lane_reg_update
+; CHECK: ld4 {v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = extractvalue [4 x <2 x double>] %b, 0
+ %2 = extractvalue [4 x <2 x double>] %b, 1
+ %3 = extractvalue [4 x <2 x double>] %b, 2
+ %4 = extractvalue [4 x <2 x double>] %b, 3
+ %5 = bitcast double* %a to i8*
+ %6 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 1, i32 8)
+ %7 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 0
+ %8 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 1
+ %9 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 2
+ %10 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 3
+ %11 = insertvalue { [4 x <2 x double>] } undef, <2 x double> %7, 0, 0
+ %12 = insertvalue { [4 x <2 x double>] } %11, <2 x double> %8, 0, 1
+ %13 = insertvalue { [4 x <2 x double>] } %12, <2 x double> %9, 0, 2
+ %14 = insertvalue { [4 x <2 x double>] } %13, <2 x double> %10, 0, 3
+ %tmp1 = getelementptr double* %a, i32 %inc
+ store double* %tmp1, double** %ptr
+ ret { [4 x <2 x double>] } %14
+}
+
+define void @test_vst2_lane_fx_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr) {
+; CHECK-LABEL: test_vst2_lane_fx_update
+; CHECK: st2 {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[7], [x{{[0-9]+|sp}}], #2
+ %1 = extractvalue [2 x <8 x i8>] %b, 0
+ %2 = extractvalue [2 x <8 x i8>] %b, 1
+ call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1)
+ %tmp1 = getelementptr i8* %a, i32 2
+ store i8* %tmp1, i8** %ptr
+ ret void
+}
+
+define void @test_vst2_lane_reg_update(i32* %a, [2 x <2 x i32>] %b.coerce, i32** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vst2_lane_reg_update
+; CHECK: st2 {v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = extractvalue [2 x <2 x i32>] %b.coerce, 0
+ %2 = extractvalue [2 x <2 x i32>] %b.coerce, 1
+ %3 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %3, <2 x i32> %1, <2 x i32> %2, i32 1, i32 4)
+ %tmp1 = getelementptr i32* %a, i32 %inc
+ store i32* %tmp1, i32** %ptr
+ ret void
+}
+
+define void @test_vst3_lane_fx_update(float* %a, [3 x <4 x float>] %b, float** %ptr) {
+; CHECK-LABEL: test_vst3_lane_fx_update
+; CHECK: st3 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[3], [x{{[0-9]+|sp}}], #12
+ %1 = extractvalue [3 x <4 x float>] %b, 0
+ %2 = extractvalue [3 x <4 x float>] %b, 1
+ %3 = extractvalue [3 x <4 x float>] %b, 2
+ %4 = bitcast float* %a to i8*
+ call void @llvm.arm.neon.vst3lane.v4f32(i8* %4, <4 x float> %1, <4 x float> %2, <4 x float> %3, i32 3, i32 4)
+ %tmp1 = getelementptr float* %a, i32 3
+ store float* %tmp1, float** %ptr
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @test_vst3_lane_reg_update(i16* %a, [3 x <4 x i16>] %b, i16** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vst3_lane_reg_update
+; CHECK: st3 {v{{[0-9]+}}.h, v{{[0-9]+}}.h, v{{[0-9]+}}.h}[3], [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = extractvalue [3 x <4 x i16>] %b, 0
+ %2 = extractvalue [3 x <4 x i16>] %b, 1
+ %3 = extractvalue [3 x <4 x i16>] %b, 2
+ %4 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2)
+ %tmp1 = getelementptr i16* %a, i32 %inc
+ store i16* %tmp1, i16** %ptr
+ ret void
+}
+
+define void @test_vst4_lane_fx_update(double* %a, [4 x <2 x double>] %b.coerce, double** %ptr) {
+; CHECK-LABEL: test_vst4_lane_fx_update
+; CHECK: st4 {v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d}[1], [x{{[0-9]+|sp}}], #32
+ %1 = extractvalue [4 x <2 x double>] %b.coerce, 0
+ %2 = extractvalue [4 x <2 x double>] %b.coerce, 1
+ %3 = extractvalue [4 x <2 x double>] %b.coerce, 2
+ %4 = extractvalue [4 x <2 x double>] %b.coerce, 3
+ %5 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 1, i32 8)
+ %tmp1 = getelementptr double* %a, i32 4
+ store double* %tmp1, double** %ptr
+ ret void
+}
+
+
+define void @test_vst4_lane_reg_update(float* %a, [4 x <2 x float>] %b.coerce, float** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vst4_lane_reg_update
+; CHECK: st4 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = extractvalue [4 x <2 x float>] %b.coerce, 0
+ %2 = extractvalue [4 x <2 x float>] %b.coerce, 1
+ %3 = extractvalue [4 x <2 x float>] %b.coerce, 2
+ %4 = extractvalue [4 x <2 x float>] %b.coerce, 3
+ %5 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %5, <2 x float> %1, <2 x float> %2, <2 x float> %3, <2 x float> %4, i32 1, i32 4)
+ %tmp1 = getelementptr float* %a, i32 %inc
+ store float* %tmp1, float** %ptr
+ ret void
+}
+
+declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
+declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
+declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
+declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
diff --git a/test/CodeGen/AArch64/neon-simd-shift.ll b/test/CodeGen/AArch64/neon-simd-shift.ll
new file mode 100644
index 0000000..fd76265
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-simd-shift.ll
@@ -0,0 +1,1556 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <8 x i8> @test_vshr_n_s8(<8 x i8> %a) {
+; CHECK: test_vshr_n_s8
+; CHECK: sshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %vshr_n = ashr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ ret <8 x i8> %vshr_n
+}
+
+define <4 x i16> @test_vshr_n_s16(<4 x i16> %a) {
+; CHECK: test_vshr_n_s16
+; CHECK: sshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+ %vshr_n = ashr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
+ ret <4 x i16> %vshr_n
+}
+
+define <2 x i32> @test_vshr_n_s32(<2 x i32> %a) {
+; CHECK: test_vshr_n_s32
+; CHECK: sshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+ %vshr_n = ashr <2 x i32> %a, <i32 3, i32 3>
+ ret <2 x i32> %vshr_n
+}
+
+define <16 x i8> @test_vshrq_n_s8(<16 x i8> %a) {
+; CHECK: test_vshrq_n_s8
+; CHECK: sshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %vshr_n = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ ret <16 x i8> %vshr_n
+}
+
+define <8 x i16> @test_vshrq_n_s16(<8 x i16> %a) {
+; CHECK: test_vshrq_n_s16
+; CHECK: sshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+ %vshr_n = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ ret <8 x i16> %vshr_n
+}
+
+define <4 x i32> @test_vshrq_n_s32(<4 x i32> %a) {
+; CHECK: test_vshrq_n_s32
+; CHECK: sshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+ %vshr_n = ashr <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
+ ret <4 x i32> %vshr_n
+}
+
+define <2 x i64> @test_vshrq_n_s64(<2 x i64> %a) {
+; CHECK: test_vshrq_n_s64
+; CHECK: sshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+ %vshr_n = ashr <2 x i64> %a, <i64 3, i64 3>
+ ret <2 x i64> %vshr_n
+}
+
+define <8 x i8> @test_vshr_n_u8(<8 x i8> %a) {
+; CHECK: test_vshr_n_u8
+; CHECK: ushr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %vshr_n = lshr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ ret <8 x i8> %vshr_n
+}
+
+define <4 x i16> @test_vshr_n_u16(<4 x i16> %a) {
+; CHECK: test_vshr_n_u16
+; CHECK: ushr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+ %vshr_n = lshr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
+ ret <4 x i16> %vshr_n
+}
+
+define <2 x i32> @test_vshr_n_u32(<2 x i32> %a) {
+; CHECK: test_vshr_n_u32
+; CHECK: ushr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+ %vshr_n = lshr <2 x i32> %a, <i32 3, i32 3>
+ ret <2 x i32> %vshr_n
+}
+
+define <16 x i8> @test_vshrq_n_u8(<16 x i8> %a) {
+; CHECK: test_vshrq_n_u8
+; CHECK: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %vshr_n = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ ret <16 x i8> %vshr_n
+}
+
+define <8 x i16> @test_vshrq_n_u16(<8 x i16> %a) {
+; CHECK: test_vshrq_n_u16
+; CHECK: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+ %vshr_n = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ ret <8 x i16> %vshr_n
+}
+
+define <4 x i32> @test_vshrq_n_u32(<4 x i32> %a) {
+; CHECK: test_vshrq_n_u32
+; CHECK: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+ %vshr_n = lshr <4 x i32> %a, <i32 3, i32 3, i32 3, i32 3>
+ ret <4 x i32> %vshr_n
+}
+
+define <2 x i64> @test_vshrq_n_u64(<2 x i64> %a) {
+; CHECK: test_vshrq_n_u64
+; CHECK: ushr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+ %vshr_n = lshr <2 x i64> %a, <i64 3, i64 3>
+ ret <2 x i64> %vshr_n
+}
+
+define <8 x i8> @test_vsra_n_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vsra_n_s8
+; CHECK: ssra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %vsra_n = ashr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ %1 = add <8 x i8> %vsra_n, %a
+ ret <8 x i8> %1
+}
+
+define <4 x i16> @test_vsra_n_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vsra_n_s16
+; CHECK: ssra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+ %vsra_n = ashr <4 x i16> %b, <i16 3, i16 3, i16 3, i16 3>
+ %1 = add <4 x i16> %vsra_n, %a
+ ret <4 x i16> %1
+}
+
+define <2 x i32> @test_vsra_n_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vsra_n_s32
+; CHECK: ssra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+ %vsra_n = ashr <2 x i32> %b, <i32 3, i32 3>
+ %1 = add <2 x i32> %vsra_n, %a
+ ret <2 x i32> %1
+}
+
+define <16 x i8> @test_vsraq_n_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vsraq_n_s8
+; CHECK: ssra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %vsra_n = ashr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ %1 = add <16 x i8> %vsra_n, %a
+ ret <16 x i8> %1
+}
+
+define <8 x i16> @test_vsraq_n_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsraq_n_s16
+; CHECK: ssra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+ %vsra_n = ashr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %1 = add <8 x i16> %vsra_n, %a
+ ret <8 x i16> %1
+}
+
+define <4 x i32> @test_vsraq_n_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsraq_n_s32
+; CHECK: ssra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+ %vsra_n = ashr <4 x i32> %b, <i32 3, i32 3, i32 3, i32 3>
+ %1 = add <4 x i32> %vsra_n, %a
+ ret <4 x i32> %1
+}
+
+define <2 x i64> @test_vsraq_n_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vsraq_n_s64
+; CHECK: ssra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+ %vsra_n = ashr <2 x i64> %b, <i64 3, i64 3>
+ %1 = add <2 x i64> %vsra_n, %a
+ ret <2 x i64> %1
+}
+
+define <8 x i8> @test_vsra_n_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vsra_n_u8
+; CHECK: usra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %vsra_n = lshr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ %1 = add <8 x i8> %vsra_n, %a
+ ret <8 x i8> %1
+}
+
+define <4 x i16> @test_vsra_n_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vsra_n_u16
+; CHECK: usra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+ %vsra_n = lshr <4 x i16> %b, <i16 3, i16 3, i16 3, i16 3>
+ %1 = add <4 x i16> %vsra_n, %a
+ ret <4 x i16> %1
+}
+
+define <2 x i32> @test_vsra_n_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vsra_n_u32
+; CHECK: usra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+ %vsra_n = lshr <2 x i32> %b, <i32 3, i32 3>
+ %1 = add <2 x i32> %vsra_n, %a
+ ret <2 x i32> %1
+}
+
+define <16 x i8> @test_vsraq_n_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vsraq_n_u8
+; CHECK: usra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %vsra_n = lshr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ %1 = add <16 x i8> %vsra_n, %a
+ ret <16 x i8> %1
+}
+
+define <8 x i16> @test_vsraq_n_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsraq_n_u16
+; CHECK: usra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+ %vsra_n = lshr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %1 = add <8 x i16> %vsra_n, %a
+ ret <8 x i16> %1
+}
+
+define <4 x i32> @test_vsraq_n_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsraq_n_u32
+; CHECK: usra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+ %vsra_n = lshr <4 x i32> %b, <i32 3, i32 3, i32 3, i32 3>
+ %1 = add <4 x i32> %vsra_n, %a
+ ret <4 x i32> %1
+}
+
+define <2 x i64> @test_vsraq_n_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vsraq_n_u64
+; CHECK: usra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+ %vsra_n = lshr <2 x i64> %b, <i64 3, i64 3>
+ %1 = add <2 x i64> %vsra_n, %a
+ ret <2 x i64> %1
+}
+
+define <8 x i8> @test_vrshr_n_s8(<8 x i8> %a) {
+; CHECK: test_vrshr_n_s8
+; CHECK: srshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %vrshr_n = tail call <8 x i8> @llvm.aarch64.neon.vsrshr.v8i8(<8 x i8> %a, i32 3)
+ ret <8 x i8> %vrshr_n
+}
+
+
+define <4 x i16> @test_vrshr_n_s16(<4 x i16> %a) {
+; CHECK: test_vrshr_n_s16
+; CHECK: srshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+ %vrshr_n = tail call <4 x i16> @llvm.aarch64.neon.vsrshr.v4i16(<4 x i16> %a, i32 3)
+ ret <4 x i16> %vrshr_n
+}
+
+
+define <2 x i32> @test_vrshr_n_s32(<2 x i32> %a) {
+; CHECK: test_vrshr_n_s32
+; CHECK: srshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+ %vrshr_n = tail call <2 x i32> @llvm.aarch64.neon.vsrshr.v2i32(<2 x i32> %a, i32 3)
+ ret <2 x i32> %vrshr_n
+}
+
+
+define <16 x i8> @test_vrshrq_n_s8(<16 x i8> %a) {
+; CHECK: test_vrshrq_n_s8
+; CHECK: srshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %vrshr_n = tail call <16 x i8> @llvm.aarch64.neon.vsrshr.v16i8(<16 x i8> %a, i32 3)
+ ret <16 x i8> %vrshr_n
+}
+
+
+define <8 x i16> @test_vrshrq_n_s16(<8 x i16> %a) {
+; CHECK: test_vrshrq_n_s16
+; CHECK: srshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+ %vrshr_n = tail call <8 x i16> @llvm.aarch64.neon.vsrshr.v8i16(<8 x i16> %a, i32 3)
+ ret <8 x i16> %vrshr_n
+}
+
+
+define <4 x i32> @test_vrshrq_n_s32(<4 x i32> %a) {
+; CHECK: test_vrshrq_n_s32
+; CHECK: srshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+ %vrshr_n = tail call <4 x i32> @llvm.aarch64.neon.vsrshr.v4i32(<4 x i32> %a, i32 3)
+ ret <4 x i32> %vrshr_n
+}
+
+
+define <2 x i64> @test_vrshrq_n_s64(<2 x i64> %a) {
+; CHECK: test_vrshrq_n_s64
+; CHECK: srshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+ %vrshr_n = tail call <2 x i64> @llvm.aarch64.neon.vsrshr.v2i64(<2 x i64> %a, i32 3)
+ ret <2 x i64> %vrshr_n
+}
+
+
+define <8 x i8> @test_vrshr_n_u8(<8 x i8> %a) {
+; CHECK: test_vrshr_n_u8
+; CHECK: urshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %vrshr_n = tail call <8 x i8> @llvm.aarch64.neon.vurshr.v8i8(<8 x i8> %a, i32 3)
+ ret <8 x i8> %vrshr_n
+}
+
+
+define <4 x i16> @test_vrshr_n_u16(<4 x i16> %a) {
+; CHECK: test_vrshr_n_u16
+; CHECK: urshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+ %vrshr_n = tail call <4 x i16> @llvm.aarch64.neon.vurshr.v4i16(<4 x i16> %a, i32 3)
+ ret <4 x i16> %vrshr_n
+}
+
+
+define <2 x i32> @test_vrshr_n_u32(<2 x i32> %a) {
+; CHECK: test_vrshr_n_u32
+; CHECK: urshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+ %vrshr_n = tail call <2 x i32> @llvm.aarch64.neon.vurshr.v2i32(<2 x i32> %a, i32 3)
+ ret <2 x i32> %vrshr_n
+}
+
+
+define <16 x i8> @test_vrshrq_n_u8(<16 x i8> %a) {
+; CHECK: test_vrshrq_n_u8
+; CHECK: urshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %vrshr_n = tail call <16 x i8> @llvm.aarch64.neon.vurshr.v16i8(<16 x i8> %a, i32 3)
+ ret <16 x i8> %vrshr_n
+}
+
+
+define <8 x i16> @test_vrshrq_n_u16(<8 x i16> %a) {
+; CHECK: test_vrshrq_n_u16
+; CHECK: urshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+ %vrshr_n = tail call <8 x i16> @llvm.aarch64.neon.vurshr.v8i16(<8 x i16> %a, i32 3)
+ ret <8 x i16> %vrshr_n
+}
+
+
+define <4 x i32> @test_vrshrq_n_u32(<4 x i32> %a) {
+; CHECK: test_vrshrq_n_u32
+; CHECK: urshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+ %vrshr_n = tail call <4 x i32> @llvm.aarch64.neon.vurshr.v4i32(<4 x i32> %a, i32 3)
+ ret <4 x i32> %vrshr_n
+}
+
+
+define <2 x i64> @test_vrshrq_n_u64(<2 x i64> %a) {
+; CHECK: test_vrshrq_n_u64
+; CHECK: urshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+ %vrshr_n = tail call <2 x i64> @llvm.aarch64.neon.vurshr.v2i64(<2 x i64> %a, i32 3)
+ ret <2 x i64> %vrshr_n
+}
+
+
+define <8 x i8> @test_vrsra_n_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vrsra_n_s8
+; CHECK: srsra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %1 = tail call <8 x i8> @llvm.aarch64.neon.vsrshr.v8i8(<8 x i8> %b, i32 3)
+ %vrsra_n = add <8 x i8> %1, %a
+ ret <8 x i8> %vrsra_n
+}
+
+define <4 x i16> @test_vrsra_n_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vrsra_n_s16
+; CHECK: srsra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+ %1 = tail call <4 x i16> @llvm.aarch64.neon.vsrshr.v4i16(<4 x i16> %b, i32 3)
+ %vrsra_n = add <4 x i16> %1, %a
+ ret <4 x i16> %vrsra_n
+}
+
+define <2 x i32> @test_vrsra_n_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vrsra_n_s32
+; CHECK: srsra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+ %1 = tail call <2 x i32> @llvm.aarch64.neon.vsrshr.v2i32(<2 x i32> %b, i32 3)
+ %vrsra_n = add <2 x i32> %1, %a
+ ret <2 x i32> %vrsra_n
+}
+
+define <16 x i8> @test_vrsraq_n_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vrsraq_n_s8
+; CHECK: srsra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %1 = tail call <16 x i8> @llvm.aarch64.neon.vsrshr.v16i8(<16 x i8> %b, i32 3)
+ %vrsra_n = add <16 x i8> %1, %a
+ ret <16 x i8> %vrsra_n
+}
+
+define <8 x i16> @test_vrsraq_n_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vrsraq_n_s16
+; CHECK: srsra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+ %1 = tail call <8 x i16> @llvm.aarch64.neon.vsrshr.v8i16(<8 x i16> %b, i32 3)
+ %vrsra_n = add <8 x i16> %1, %a
+ ret <8 x i16> %vrsra_n
+}
+
+define <4 x i32> @test_vrsraq_n_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vrsraq_n_s32
+; CHECK: srsra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+ %1 = tail call <4 x i32> @llvm.aarch64.neon.vsrshr.v4i32(<4 x i32> %b, i32 3)
+ %vrsra_n = add <4 x i32> %1, %a
+ ret <4 x i32> %vrsra_n
+}
+
+define <2 x i64> @test_vrsraq_n_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vrsraq_n_s64
+; CHECK: srsra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+ %1 = tail call <2 x i64> @llvm.aarch64.neon.vsrshr.v2i64(<2 x i64> %b, i32 3)
+ %vrsra_n = add <2 x i64> %1, %a
+ ret <2 x i64> %vrsra_n
+}
+
+define <8 x i8> @test_vrsra_n_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vrsra_n_u8
+; CHECK: ursra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %1 = tail call <8 x i8> @llvm.aarch64.neon.vurshr.v8i8(<8 x i8> %b, i32 3)
+ %vrsra_n = add <8 x i8> %1, %a
+ ret <8 x i8> %vrsra_n
+}
+
+define <4 x i16> @test_vrsra_n_u16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vrsra_n_u16
+; CHECK: ursra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+ %1 = tail call <4 x i16> @llvm.aarch64.neon.vurshr.v4i16(<4 x i16> %b, i32 3)
+ %vrsra_n = add <4 x i16> %1, %a
+ ret <4 x i16> %vrsra_n
+}
+
+define <2 x i32> @test_vrsra_n_u32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vrsra_n_u32
+; CHECK: ursra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+ %1 = tail call <2 x i32> @llvm.aarch64.neon.vurshr.v2i32(<2 x i32> %b, i32 3)
+ %vrsra_n = add <2 x i32> %1, %a
+ ret <2 x i32> %vrsra_n
+}
+
+define <16 x i8> @test_vrsraq_n_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vrsraq_n_u8
+; CHECK: ursra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %1 = tail call <16 x i8> @llvm.aarch64.neon.vurshr.v16i8(<16 x i8> %b, i32 3)
+ %vrsra_n = add <16 x i8> %1, %a
+ ret <16 x i8> %vrsra_n
+}
+
+define <8 x i16> @test_vrsraq_n_u16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vrsraq_n_u16
+; CHECK: ursra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+ %1 = tail call <8 x i16> @llvm.aarch64.neon.vurshr.v8i16(<8 x i16> %b, i32 3)
+ %vrsra_n = add <8 x i16> %1, %a
+ ret <8 x i16> %vrsra_n
+}
+
+define <4 x i32> @test_vrsraq_n_u32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vrsraq_n_u32
+; CHECK: ursra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+ %1 = tail call <4 x i32> @llvm.aarch64.neon.vurshr.v4i32(<4 x i32> %b, i32 3)
+ %vrsra_n = add <4 x i32> %1, %a
+ ret <4 x i32> %vrsra_n
+}
+
+define <2 x i64> @test_vrsraq_n_u64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vrsraq_n_u64
+; CHECK: ursra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+ %1 = tail call <2 x i64> @llvm.aarch64.neon.vurshr.v2i64(<2 x i64> %b, i32 3)
+ %vrsra_n = add <2 x i64> %1, %a
+ ret <2 x i64> %vrsra_n
+}
+
+define <8 x i8> @test_vsri_n_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vsri_n_s8
+; CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %vsri_n = tail call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
+ ret <8 x i8> %vsri_n
+}
+
+
+define <4 x i16> @test_vsri_n_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vsri_n_s16
+; CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+ %vsri = tail call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> %a, <4 x i16> %b, i32 3)
+ ret <4 x i16> %vsri
+}
+
+
+define <2 x i32> @test_vsri_n_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vsri_n_s32
+; CHECK: sri {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+ %vsri = tail call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> %a, <2 x i32> %b, i32 3)
+ ret <2 x i32> %vsri
+}
+
+
+define <16 x i8> @test_vsriq_n_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vsriq_n_s8
+; CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %vsri_n = tail call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
+ ret <16 x i8> %vsri_n
+}
+
+
+define <8 x i16> @test_vsriq_n_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsriq_n_s16
+; CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+ %vsri = tail call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> %a, <8 x i16> %b, i32 3)
+ ret <8 x i16> %vsri
+}
+
+
+define <4 x i32> @test_vsriq_n_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsriq_n_s32
+; CHECK: sri {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+ %vsri = tail call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> %a, <4 x i32> %b, i32 3)
+ ret <4 x i32> %vsri
+}
+
+
+define <2 x i64> @test_vsriq_n_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vsriq_n_s64
+; CHECK: sri {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+ %vsri = tail call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> %a, <2 x i64> %b, i32 3)
+ ret <2 x i64> %vsri
+}
+
+define <8 x i8> @test_vsri_n_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vsri_n_p8
+; CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %vsri_n = tail call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
+ ret <8 x i8> %vsri_n
+}
+
+define <4 x i16> @test_vsri_n_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vsri_n_p16
+; CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15
+ %vsri = tail call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> %a, <4 x i16> %b, i32 15)
+ ret <4 x i16> %vsri
+}
+
+define <16 x i8> @test_vsriq_n_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vsriq_n_p8
+; CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %vsri_n = tail call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
+ ret <16 x i8> %vsri_n
+}
+
+define <8 x i16> @test_vsriq_n_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsriq_n_p16
+; CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15
+ %vsri = tail call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> %a, <8 x i16> %b, i32 15)
+ ret <8 x i16> %vsri
+}
+
+define <8 x i8> @test_vsli_n_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vsli_n_s8
+; CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %vsli_n = tail call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
+ ret <8 x i8> %vsli_n
+}
+
+define <4 x i16> @test_vsli_n_s16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vsli_n_s16
+; CHECK: sli {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+ %vsli = tail call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> %a, <4 x i16> %b, i32 3)
+ ret <4 x i16> %vsli
+}
+
+define <2 x i32> @test_vsli_n_s32(<2 x i32> %a, <2 x i32> %b) {
+; CHECK: test_vsli_n_s32
+; CHECK: sli {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+ %vsli = tail call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> %a, <2 x i32> %b, i32 3)
+ ret <2 x i32> %vsli
+}
+
+define <16 x i8> @test_vsliq_n_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vsliq_n_s8
+; CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %vsli_n = tail call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
+ ret <16 x i8> %vsli_n
+}
+
+define <8 x i16> @test_vsliq_n_s16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsliq_n_s16
+; CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+ %vsli = tail call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> %a, <8 x i16> %b, i32 3)
+ ret <8 x i16> %vsli
+}
+
+define <4 x i32> @test_vsliq_n_s32(<4 x i32> %a, <4 x i32> %b) {
+; CHECK: test_vsliq_n_s32
+; CHECK: sli {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+ %vsli = tail call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> %a, <4 x i32> %b, i32 3)
+ ret <4 x i32> %vsli
+}
+
+define <2 x i64> @test_vsliq_n_s64(<2 x i64> %a, <2 x i64> %b) {
+; CHECK: test_vsliq_n_s64
+; CHECK: sli {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+ %vsli = tail call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> %a, <2 x i64> %b, i32 3)
+ ret <2 x i64> %vsli
+}
+
+define <8 x i8> @test_vsli_n_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vsli_n_p8
+; CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %vsli_n = tail call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
+ ret <8 x i8> %vsli_n
+}
+
+define <4 x i16> @test_vsli_n_p16(<4 x i16> %a, <4 x i16> %b) {
+; CHECK: test_vsli_n_p16
+; CHECK: sli {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15
+ %vsli = tail call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> %a, <4 x i16> %b, i32 15)
+ ret <4 x i16> %vsli
+}
+
+define <16 x i8> @test_vsliq_n_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vsliq_n_p8
+; CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %vsli_n = tail call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
+ ret <16 x i8> %vsli_n
+}
+
+define <8 x i16> @test_vsliq_n_p16(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: test_vsliq_n_p16
+; CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15
+ %vsli = tail call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> %a, <8 x i16> %b, i32 15)
+ ret <8 x i16> %vsli
+}
+
+define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) {
+; CHECK: test_vqshl_n_s8
+; CHECK: sqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %vqshl = tail call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+ ret <8 x i8> %vqshl
+}
+
+
+define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) {
+; CHECK: test_vqshl_n_s16
+; CHECK: sqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+ %vqshl = tail call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %a, <4 x i16> <i16 3, i16 3, i16 3, i16 3>)
+ ret <4 x i16> %vqshl
+}
+
+
+define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) {
+; CHECK: test_vqshl_n_s32
+; CHECK: sqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+ %vqshl = tail call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %a, <2 x i32> <i32 3, i32 3>)
+ ret <2 x i32> %vqshl
+}
+
+
+define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) {
+; CHECK: test_vqshlq_n_s8
+; CHECK: sqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+ ret <16 x i8> %vqshl_n
+}
+
+
+define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) {
+; CHECK: test_vqshlq_n_s16
+; CHECK: sqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+ %vqshl = tail call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %a, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+ ret <8 x i16> %vqshl
+}
+
+
+define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) {
+; CHECK: test_vqshlq_n_s32
+; CHECK: sqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+ %vqshl = tail call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
+ ret <4 x i32> %vqshl
+}
+
+
+define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) {
+; CHECK: test_vqshlq_n_s64
+; CHECK: sqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+ %vqshl = tail call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %a, <2 x i64> <i64 3, i64 3>)
+ ret <2 x i64> %vqshl
+}
+
+
+define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) {
+; CHECK: test_vqshl_n_u8
+; CHECK: uqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %vqshl_n = tail call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+ ret <8 x i8> %vqshl_n
+}
+
+
+define <4 x i16> @test_vqshl_n_u16(<4 x i16> %a) {
+; CHECK: test_vqshl_n_u16
+; CHECK: uqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+ %vqshl = tail call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %a, <4 x i16> <i16 3, i16 3, i16 3, i16 3>)
+ ret <4 x i16> %vqshl
+}
+
+
+define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) {
+; CHECK: test_vqshl_n_u32
+; CHECK: uqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+ %vqshl = tail call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %a, <2 x i32> <i32 3, i32 3>)
+ ret <2 x i32> %vqshl
+}
+
+
+define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) {
+; CHECK: test_vqshlq_n_u8
+; CHECK: uqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %vqshl_n = tail call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+ ret <16 x i8> %vqshl_n
+}
+
+
+define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) {
+; CHECK: test_vqshlq_n_u16
+; CHECK: uqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+ %vqshl = tail call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %a, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+ ret <8 x i16> %vqshl
+}
+
+
+define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) {
+; CHECK: test_vqshlq_n_u32
+; CHECK: uqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+ %vqshl = tail call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
+ ret <4 x i32> %vqshl
+}
+
+
+define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) {
+; CHECK: test_vqshlq_n_u64
+; CHECK: uqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+ %vqshl = tail call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %a, <2 x i64> <i64 3, i64 3>)
+ ret <2 x i64> %vqshl
+}
+
+define <8 x i8> @test_vqshlu_n_s8(<8 x i8> %a) {
+; CHECK: test_vqshlu_n_s8
+; CHECK: sqshlu {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
+ %vqshlu = tail call <8 x i8> @llvm.aarch64.neon.vsqshlu.v8i8(<8 x i8> %a, i32 3)
+ ret <8 x i8> %vqshlu
+}
+
+
+define <4 x i16> @test_vqshlu_n_s16(<4 x i16> %a) {
+; CHECK: test_vqshlu_n_s16
+; CHECK: sqshlu {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
+ %vqshlu = tail call <4 x i16> @llvm.aarch64.neon.vsqshlu.v4i16(<4 x i16> %a, i32 3)
+ ret <4 x i16> %vqshlu
+}
+
+
+define <2 x i32> @test_vqshlu_n_s32(<2 x i32> %a) {
+; CHECK: test_vqshlu_n_s32
+; CHECK: sqshlu {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
+ %vqshlu = tail call <2 x i32> @llvm.aarch64.neon.vsqshlu.v2i32(<2 x i32> %a, i32 3)
+ ret <2 x i32> %vqshlu
+}
+
+
+define <16 x i8> @test_vqshluq_n_s8(<16 x i8> %a) {
+; CHECK: test_vqshluq_n_s8
+; CHECK: sqshlu {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
+ %vqshlu = tail call <16 x i8> @llvm.aarch64.neon.vsqshlu.v16i8(<16 x i8> %a, i32 3)
+ ret <16 x i8> %vqshlu
+}
+
+
+define <8 x i16> @test_vqshluq_n_s16(<8 x i16> %a) {
+; CHECK: test_vqshluq_n_s16
+; CHECK: sqshlu {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
+ %vqshlu = tail call <8 x i16> @llvm.aarch64.neon.vsqshlu.v8i16(<8 x i16> %a, i32 3)
+ ret <8 x i16> %vqshlu
+}
+
+
+define <4 x i32> @test_vqshluq_n_s32(<4 x i32> %a) {
+; CHECK: test_vqshluq_n_s32
+; CHECK: sqshlu {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
+ %vqshlu = tail call <4 x i32> @llvm.aarch64.neon.vsqshlu.v4i32(<4 x i32> %a, i32 3)
+ ret <4 x i32> %vqshlu
+}
+
+
+define <2 x i64> @test_vqshluq_n_s64(<2 x i64> %a) {
+; CHECK: test_vqshluq_n_s64
+; CHECK: sqshlu {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
+ %vqshlu = tail call <2 x i64> @llvm.aarch64.neon.vsqshlu.v2i64(<2 x i64> %a, i32 3)
+ ret <2 x i64> %vqshlu
+}
+
+
+define <8 x i8> @test_vshrn_n_s16(<8 x i16> %a) {
+; CHECK: test_vshrn_n_s16
+; CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
+ %1 = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
+ ret <8 x i8> %vshrn_n
+}
+
+define <4 x i16> @test_vshrn_n_s32(<4 x i32> %a) {
+; CHECK: test_vshrn_n_s32
+; CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
+ %1 = ashr <4 x i32> %a, <i32 9, i32 9, i32 9, i32 9>
+ %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
+ ret <4 x i16> %vshrn_n
+}
+
+define <2 x i32> @test_vshrn_n_s64(<2 x i64> %a) {
+; CHECK: test_vshrn_n_s64
+; CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
+ %1 = ashr <2 x i64> %a, <i64 19, i64 19>
+ %vshrn_n = trunc <2 x i64> %1 to <2 x i32>
+ ret <2 x i32> %vshrn_n
+}
+
+define <8 x i8> @test_vshrn_n_u16(<8 x i16> %a) {
+; CHECK: test_vshrn_n_u16
+; CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
+ %1 = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
+ ret <8 x i8> %vshrn_n
+}
+
+define <4 x i16> @test_vshrn_n_u32(<4 x i32> %a) {
+; CHECK: test_vshrn_n_u32
+; CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
+ %1 = lshr <4 x i32> %a, <i32 9, i32 9, i32 9, i32 9>
+ %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
+ ret <4 x i16> %vshrn_n
+}
+
+define <2 x i32> @test_vshrn_n_u64(<2 x i64> %a) {
+; CHECK: test_vshrn_n_u64
+; CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
+ %1 = lshr <2 x i64> %a, <i64 19, i64 19>
+ %vshrn_n = trunc <2 x i64> %1 to <2 x i32>
+ ret <2 x i32> %vshrn_n
+}
+
+define <16 x i8> @test_vshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vshrn_high_n_s16
+; CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %1 = ashr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
+ %2 = bitcast <8 x i8> %a to <1 x i64>
+ %3 = bitcast <8 x i8> %vshrn_n to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+ %4 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %4
+}
+
+define <8 x i16> @test_vshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vshrn_high_n_s32
+; CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %1 = ashr <4 x i32> %b, <i32 9, i32 9, i32 9, i32 9>
+ %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
+ %2 = bitcast <4 x i16> %a to <1 x i64>
+ %3 = bitcast <4 x i16> %vshrn_n to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+ %4 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %4
+}
+
+define <4 x i32> @test_vshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vshrn_high_n_s64
+; CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %2 = ashr <2 x i64> %b, <i64 19, i64 19>
+ %vshrn_n = trunc <2 x i64> %2 to <2 x i32>
+ %3 = bitcast <2 x i32> %vshrn_n to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+ %4 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %4
+}
+
+define <16 x i8> @test_vshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vshrn_high_n_u16
+; CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %1 = lshr <8 x i16> %b, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ %vshrn_n = trunc <8 x i16> %1 to <8 x i8>
+ %2 = bitcast <8 x i8> %a to <1 x i64>
+ %3 = bitcast <8 x i8> %vshrn_n to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+ %4 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %4
+}
+
+define <8 x i16> @test_vshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vshrn_high_n_u32
+; CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %1 = lshr <4 x i32> %b, <i32 9, i32 9, i32 9, i32 9>
+ %vshrn_n = trunc <4 x i32> %1 to <4 x i16>
+ %2 = bitcast <4 x i16> %a to <1 x i64>
+ %3 = bitcast <4 x i16> %vshrn_n to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+ %4 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %4
+}
+
+define <4 x i32> @test_vshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vshrn_high_n_u64
+; CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %2 = lshr <2 x i64> %b, <i64 19, i64 19>
+ %vshrn_n = trunc <2 x i64> %2 to <2 x i32>
+ %3 = bitcast <2 x i32> %vshrn_n to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %3, <2 x i32> <i32 0, i32 1>
+ %4 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %4
+}
+
+define <8 x i8> @test_vqshrun_n_s16(<8 x i16> %a) {
+; CHECK: test_vqshrun_n_s16
+; CHECK: sqshrun {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
+ %vqshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqshrun.v8i8(<8 x i16> %a, i32 3)
+ ret <8 x i8> %vqshrun
+}
+
+
+define <4 x i16> @test_vqshrun_n_s32(<4 x i32> %a) {
+; CHECK: test_vqshrun_n_s32
+; CHECK: sqshrun {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
+ %vqshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqshrun.v4i16(<4 x i32> %a, i32 9)
+ ret <4 x i16> %vqshrun
+}
+
+define <2 x i32> @test_vqshrun_n_s64(<2 x i64> %a) {
+; CHECK: test_vqshrun_n_s64
+; CHECK: sqshrun {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
+ %vqshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqshrun.v2i32(<2 x i64> %a, i32 19)
+ ret <2 x i32> %vqshrun
+}
+
+define <16 x i8> @test_vqshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqshrun_high_n_s16
+; CHECK: sqshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %vqshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqshrun.v8i8(<8 x i16> %b, i32 3)
+ %1 = bitcast <8 x i8> %a to <1 x i64>
+ %2 = bitcast <8 x i8> %vqshrun to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqshrun_high_n_s32
+; CHECK: sqshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %vqshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqshrun.v4i16(<4 x i32> %b, i32 9)
+ %1 = bitcast <4 x i16> %a to <1 x i64>
+ %2 = bitcast <4 x i16> %vqshrun to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqshrun_high_n_s64
+; CHECK: sqshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %vqshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqshrun.v2i32(<2 x i64> %b, i32 19)
+ %2 = bitcast <2 x i32> %vqshrun to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %3
+}
+
+define <8 x i8> @test_vrshrn_n_s16(<8 x i16> %a) {
+; CHECK: test_vrshrn_n_s16
+; CHECK: rshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
+ %vrshrn = tail call <8 x i8> @llvm.aarch64.neon.vrshrn.v8i8(<8 x i16> %a, i32 3)
+ ret <8 x i8> %vrshrn
+}
+
+
+define <4 x i16> @test_vrshrn_n_s32(<4 x i32> %a) {
+; CHECK: test_vrshrn_n_s32
+; CHECK: rshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
+ %vrshrn = tail call <4 x i16> @llvm.aarch64.neon.vrshrn.v4i16(<4 x i32> %a, i32 9)
+ ret <4 x i16> %vrshrn
+}
+
+
+define <2 x i32> @test_vrshrn_n_s64(<2 x i64> %a) {
+; CHECK: test_vrshrn_n_s64
+; CHECK: rshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
+ %vrshrn = tail call <2 x i32> @llvm.aarch64.neon.vrshrn.v2i32(<2 x i64> %a, i32 19)
+ ret <2 x i32> %vrshrn
+}
+
+define <16 x i8> @test_vrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vrshrn_high_n_s16
+; CHECK: rshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %vrshrn = tail call <8 x i8> @llvm.aarch64.neon.vrshrn.v8i8(<8 x i16> %b, i32 3)
+ %1 = bitcast <8 x i8> %a to <1 x i64>
+ %2 = bitcast <8 x i8> %vrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vrshrn_high_n_s32
+; CHECK: rshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %vrshrn = tail call <4 x i16> @llvm.aarch64.neon.vrshrn.v4i16(<4 x i32> %b, i32 9)
+ %1 = bitcast <4 x i16> %a to <1 x i64>
+ %2 = bitcast <4 x i16> %vrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vrshrn_high_n_s64
+; CHECK: rshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %vrshrn = tail call <2 x i32> @llvm.aarch64.neon.vrshrn.v2i32(<2 x i64> %b, i32 19)
+ %2 = bitcast <2 x i32> %vrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %3
+}
+
+define <8 x i8> @test_vqrshrun_n_s16(<8 x i16> %a) {
+; CHECK: test_vqrshrun_n_s16
+; CHECK: sqrshrun {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
+ %vqrshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrun.v8i8(<8 x i16> %a, i32 3)
+ ret <8 x i8> %vqrshrun
+}
+
+define <4 x i16> @test_vqrshrun_n_s32(<4 x i32> %a) {
+; CHECK: test_vqrshrun_n_s32
+; CHECK: sqrshrun {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
+ %vqrshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrun.v4i16(<4 x i32> %a, i32 9)
+ ret <4 x i16> %vqrshrun
+}
+
+define <2 x i32> @test_vqrshrun_n_s64(<2 x i64> %a) {
+; CHECK: test_vqrshrun_n_s64
+; CHECK: sqrshrun {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
+ %vqrshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrun.v2i32(<2 x i64> %a, i32 19)
+ ret <2 x i32> %vqrshrun
+}
+
+define <16 x i8> @test_vqrshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqrshrun_high_n_s16
+; CHECK: sqrshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %vqrshrun = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrun.v8i8(<8 x i16> %b, i32 3)
+ %1 = bitcast <8 x i8> %a to <1 x i64>
+ %2 = bitcast <8 x i8> %vqrshrun to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqrshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqrshrun_high_n_s32
+; CHECK: sqrshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %vqrshrun = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrun.v4i16(<4 x i32> %b, i32 9)
+ %1 = bitcast <4 x i16> %a to <1 x i64>
+ %2 = bitcast <4 x i16> %vqrshrun to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqrshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqrshrun_high_n_s64
+; CHECK: sqrshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %vqrshrun = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrun.v2i32(<2 x i64> %b, i32 19)
+ %2 = bitcast <2 x i32> %vqrshrun to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %3
+}
+
+define <8 x i8> @test_vqshrn_n_s16(<8 x i16> %a) {
+; CHECK: test_vqshrn_n_s16
+; CHECK: sqshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
+ %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqshrn.v8i8(<8 x i16> %a, i32 3)
+ ret <8 x i8> %vqshrn
+}
+
+
+define <4 x i16> @test_vqshrn_n_s32(<4 x i32> %a) {
+; CHECK: test_vqshrn_n_s32
+; CHECK: sqshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
+ %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqshrn.v4i16(<4 x i32> %a, i32 9)
+ ret <4 x i16> %vqshrn
+}
+
+
+define <2 x i32> @test_vqshrn_n_s64(<2 x i64> %a) {
+; CHECK: test_vqshrn_n_s64
+; CHECK: sqshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
+ %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqshrn.v2i32(<2 x i64> %a, i32 19)
+ ret <2 x i32> %vqshrn
+}
+
+
+define <8 x i8> @test_vqshrn_n_u16(<8 x i16> %a) {
+; CHECK: test_vqshrn_n_u16
+; CHECK: uqshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
+ %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqshrn.v8i8(<8 x i16> %a, i32 3)
+ ret <8 x i8> %vqshrn
+}
+
+
+define <4 x i16> @test_vqshrn_n_u32(<4 x i32> %a) {
+; CHECK: test_vqshrn_n_u32
+; CHECK: uqshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
+ %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqshrn.v4i16(<4 x i32> %a, i32 9)
+ ret <4 x i16> %vqshrn
+}
+
+
+define <2 x i32> @test_vqshrn_n_u64(<2 x i64> %a) {
+; CHECK: test_vqshrn_n_u64
+; CHECK: uqshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
+ %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqshrn.v2i32(<2 x i64> %a, i32 19)
+ ret <2 x i32> %vqshrn
+}
+
+
+define <16 x i8> @test_vqshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqshrn_high_n_s16
+; CHECK: sqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqshrn.v8i8(<8 x i16> %b, i32 3)
+ %1 = bitcast <8 x i8> %a to <1 x i64>
+ %2 = bitcast <8 x i8> %vqshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqshrn_high_n_s32
+; CHECK: sqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqshrn.v4i16(<4 x i32> %b, i32 9)
+ %1 = bitcast <4 x i16> %a to <1 x i64>
+ %2 = bitcast <4 x i16> %vqshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqshrn_high_n_s64
+; CHECK: sqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqshrn.v2i32(<2 x i64> %b, i32 19)
+ %2 = bitcast <2 x i32> %vqshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %3
+}
+
+define <16 x i8> @test_vqshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqshrn_high_n_u16
+; CHECK: uqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqshrn.v8i8(<8 x i16> %b, i32 3)
+ %1 = bitcast <8 x i8> %a to <1 x i64>
+ %2 = bitcast <8 x i8> %vqshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqshrn_high_n_u32
+; CHECK: uqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqshrn.v4i16(<4 x i32> %b, i32 9)
+ %1 = bitcast <4 x i16> %a to <1 x i64>
+ %2 = bitcast <4 x i16> %vqshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqshrn_high_n_u64
+; CHECK: uqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqshrn.v2i32(<2 x i64> %b, i32 19)
+ %2 = bitcast <2 x i32> %vqshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %3
+}
+
+define <8 x i8> @test_vqrshrn_n_s16(<8 x i16> %a) {
+; CHECK: test_vqrshrn_n_s16
+; CHECK: sqrshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
+ %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrn.v8i8(<8 x i16> %a, i32 3)
+ ret <8 x i8> %vqrshrn
+}
+
+
+define <4 x i16> @test_vqrshrn_n_s32(<4 x i32> %a) {
+; CHECK: test_vqrshrn_n_s32
+; CHECK: sqrshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
+ %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrn.v4i16(<4 x i32> %a, i32 9)
+ ret <4 x i16> %vqrshrn
+}
+
+
+define <2 x i32> @test_vqrshrn_n_s64(<2 x i64> %a) {
+; CHECK: test_vqrshrn_n_s64
+; CHECK: sqrshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
+ %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrn.v2i32(<2 x i64> %a, i32 19)
+ ret <2 x i32> %vqrshrn
+}
+
+
+define <8 x i8> @test_vqrshrn_n_u16(<8 x i16> %a) {
+; CHECK: test_vqrshrn_n_u16
+; CHECK: uqrshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
+ %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqrshrn.v8i8(<8 x i16> %a, i32 3)
+ ret <8 x i8> %vqrshrn
+}
+
+
+define <4 x i16> @test_vqrshrn_n_u32(<4 x i32> %a) {
+; CHECK: test_vqrshrn_n_u32
+; CHECK: uqrshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
+ %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqrshrn.v4i16(<4 x i32> %a, i32 9)
+ ret <4 x i16> %vqrshrn
+}
+
+
+define <2 x i32> @test_vqrshrn_n_u64(<2 x i64> %a) {
+; CHECK: test_vqrshrn_n_u64
+; CHECK: uqrshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
+ %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqrshrn.v2i32(<2 x i64> %a, i32 19)
+ ret <2 x i32> %vqrshrn
+}
+
+
+define <16 x i8> @test_vqrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqrshrn_high_n_s16
+; CHECK: sqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vsqrshrn.v8i8(<8 x i16> %b, i32 3)
+ %1 = bitcast <8 x i8> %a to <1 x i64>
+ %2 = bitcast <8 x i8> %vqrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqrshrn_high_n_s32
+; CHECK: sqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vsqrshrn.v4i16(<4 x i32> %b, i32 9)
+ %1 = bitcast <4 x i16> %a to <1 x i64>
+ %2 = bitcast <4 x i16> %vqrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqrshrn_high_n_s64
+; CHECK: sqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vsqrshrn.v2i32(<2 x i64> %b, i32 19)
+ %2 = bitcast <2 x i32> %vqrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %3
+}
+
+define <16 x i8> @test_vqrshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) {
+; CHECK: test_vqrshrn_high_n_u16
+; CHECK: uqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
+ %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.vuqrshrn.v8i8(<8 x i16> %b, i32 3)
+ %1 = bitcast <8 x i8> %a to <1 x i64>
+ %2 = bitcast <8 x i8> %vqrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <16 x i8>
+ ret <16 x i8> %3
+}
+
+define <8 x i16> @test_vqrshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) {
+; CHECK: test_vqrshrn_high_n_u32
+; CHECK: uqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
+ %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.vuqrshrn.v4i16(<4 x i32> %b, i32 9)
+ %1 = bitcast <4 x i16> %a to <1 x i64>
+ %2 = bitcast <4 x i16> %vqrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <8 x i16>
+ ret <8 x i16> %3
+}
+
+define <4 x i32> @test_vqrshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) {
+; CHECK: test_vqrshrn_high_n_u64
+; CHECK: uqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
+ %1 = bitcast <2 x i32> %a to <1 x i64>
+ %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.vuqrshrn.v2i32(<2 x i64> %b, i32 19)
+ %2 = bitcast <2 x i32> %vqrshrn to <1 x i64>
+ %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
+ %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
+ ret <4 x i32> %3
+}
+
+define <2 x float> @test_vcvt_n_f32_s32(<2 x i32> %a) {
+; CHECK: test_vcvt_n_f32_s32
+; CHECK: scvtf {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
+ %vcvt = tail call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %a, i32 31)
+ ret <2 x float> %vcvt
+}
+
+define <4 x float> @test_vcvtq_n_f32_s32(<4 x i32> %a) {
+; CHECK: test_vcvtq_n_f32_s32
+; CHECK: scvtf {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
+ %vcvt = tail call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %a, i32 31)
+ ret <4 x float> %vcvt
+}
+
+define <2 x double> @test_vcvtq_n_f64_s64(<2 x i64> %a) {
+; CHECK: test_vcvtq_n_f64_s64
+; CHECK: scvtf {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
+ %vcvt = tail call <2 x double> @llvm.arm.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %a, i32 50)
+ ret <2 x double> %vcvt
+}
+
+define <2 x float> @test_vcvt_n_f32_u32(<2 x i32> %a) {
+; CHECK: test_vcvt_n_f32_u32
+; CHECK: ucvtf {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
+ %vcvt = tail call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %a, i32 31)
+ ret <2 x float> %vcvt
+}
+
+define <4 x float> @test_vcvtq_n_f32_u32(<4 x i32> %a) {
+; CHECK: test_vcvtq_n_f32_u32
+; CHECK: ucvtf {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
+ %vcvt = tail call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %a, i32 31)
+ ret <4 x float> %vcvt
+}
+
+define <2 x double> @test_vcvtq_n_f64_u64(<2 x i64> %a) {
+; CHECK: test_vcvtq_n_f64_u64
+; CHECK: ucvtf {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
+ %vcvt = tail call <2 x double> @llvm.arm.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %a, i32 50)
+ ret <2 x double> %vcvt
+}
+
+define <2 x i32> @test_vcvt_n_s32_f32(<2 x float> %a) {
+; CHECK: test_vcvt_n_s32_f32
+; CHECK: fcvtzs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
+ %vcvt = tail call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %a, i32 31)
+ ret <2 x i32> %vcvt
+}
+
+define <4 x i32> @test_vcvtq_n_s32_f32(<4 x float> %a) {
+; CHECK: test_vcvtq_n_s32_f32
+; CHECK: fcvtzs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
+ %vcvt = tail call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %a, i32 31)
+ ret <4 x i32> %vcvt
+}
+
+define <2 x i64> @test_vcvtq_n_s64_f64(<2 x double> %a) {
+; CHECK: test_vcvtq_n_s64_f64
+; CHECK: fcvtzs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
+ %vcvt = tail call <2 x i64> @llvm.arm.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> %a, i32 50)
+ ret <2 x i64> %vcvt
+}
+
+define <2 x i32> @test_vcvt_n_u32_f32(<2 x float> %a) {
+; CHECK: test_vcvt_n_u32_f32
+; CHECK: fcvtzu {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
+ %vcvt = tail call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %a, i32 31)
+ ret <2 x i32> %vcvt
+}
+
+define <4 x i32> @test_vcvtq_n_u32_f32(<4 x float> %a) {
+; CHECK: test_vcvtq_n_u32_f32
+; CHECK: fcvtzu {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
+ %vcvt = tail call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %a, i32 31)
+ ret <4 x i32> %vcvt
+}
+
+define <2 x i64> @test_vcvtq_n_u64_f64(<2 x double> %a) {
+; CHECK: test_vcvtq_n_u64_f64
+; CHECK: fcvtzu {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
+ %vcvt = tail call <2 x i64> @llvm.arm.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> %a, i32 50)
+ ret <2 x i64> %vcvt
+}
+
+declare <8 x i8> @llvm.aarch64.neon.vsrshr.v8i8(<8 x i8>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vsrshr.v4i16(<4 x i16>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vsrshr.v2i32(<2 x i32>, i32)
+
+declare <16 x i8> @llvm.aarch64.neon.vsrshr.v16i8(<16 x i8>, i32)
+
+declare <8 x i16> @llvm.aarch64.neon.vsrshr.v8i16(<8 x i16>, i32)
+
+declare <4 x i32> @llvm.aarch64.neon.vsrshr.v4i32(<4 x i32>, i32)
+
+declare <2 x i64> @llvm.aarch64.neon.vsrshr.v2i64(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.vurshr.v8i8(<8 x i8>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vurshr.v4i16(<4 x i16>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vurshr.v2i32(<2 x i32>, i32)
+
+declare <16 x i8> @llvm.aarch64.neon.vurshr.v16i8(<16 x i8>, i32)
+
+declare <8 x i16> @llvm.aarch64.neon.vurshr.v8i16(<8 x i16>, i32)
+
+declare <4 x i32> @llvm.aarch64.neon.vurshr.v4i32(<4 x i32>, i32)
+
+declare <2 x i64> @llvm.aarch64.neon.vurshr.v2i64(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8>, <8 x i8>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16>, <4 x i16>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32>, <2 x i32>, i32)
+
+declare <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8>, <16 x i8>, i32)
+
+declare <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16>, <8 x i16>, i32)
+
+declare <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32>, <4 x i32>, i32)
+
+declare <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64>, <2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8>, <8 x i8>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16>, <4 x i16>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32>, <2 x i32>, i32)
+
+declare <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8>, <16 x i8>, i32)
+
+declare <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16>, <8 x i16>, i32)
+
+declare <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32>, <4 x i32>, i32)
+
+declare <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64>, <2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.vsqshlu.v8i8(<8 x i8>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vsqshlu.v4i16(<4 x i16>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vsqshlu.v2i32(<2 x i32>, i32)
+
+declare <16 x i8> @llvm.aarch64.neon.vsqshlu.v16i8(<16 x i8>, i32)
+
+declare <8 x i16> @llvm.aarch64.neon.vsqshlu.v8i16(<8 x i16>, i32)
+
+declare <4 x i32> @llvm.aarch64.neon.vsqshlu.v4i32(<4 x i32>, i32)
+
+declare <2 x i64> @llvm.aarch64.neon.vsqshlu.v2i64(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8>, <8 x i8>)
+
+declare <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16>, <4 x i16>)
+
+declare <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32>, <2 x i32>)
+
+declare <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8>, <16 x i8>)
+
+declare <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16>, <8 x i16>)
+
+declare <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32>, <4 x i32>)
+
+declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>)
+
+declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>)
+
+declare <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16>, <4 x i16>)
+
+declare <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32>, <2 x i32>)
+
+declare <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8>, <16 x i8>)
+
+declare <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16>, <8 x i16>)
+
+declare <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32>, <4 x i32>)
+
+declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>)
+
+declare <8 x i8> @llvm.aarch64.neon.vsqshrun.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vsqshrun.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vsqshrun.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.vrshrn.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vrshrn.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vrshrn.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.vsqrshrun.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vsqrshrun.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vsqrshrun.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.vsqshrn.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vsqshrn.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vsqshrn.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.vuqshrn.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vuqshrn.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vuqshrn.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.vsqrshrn.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vsqrshrn.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vsqrshrn.v2i32(<2 x i64>, i32)
+
+declare <8 x i8> @llvm.aarch64.neon.vuqrshrn.v8i8(<8 x i16>, i32)
+
+declare <4 x i16> @llvm.aarch64.neon.vuqrshrn.v4i16(<4 x i32>, i32)
+
+declare <2 x i32> @llvm.aarch64.neon.vuqrshrn.v2i32(<2 x i64>, i32)
+
+declare <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32)
+
+declare <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32)
+
+declare <2 x double> @llvm.arm.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32)
+
+declare <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32)
+
+declare <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32)
+
+declare <2 x double> @llvm.arm.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32)
+
+declare <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32)
+
+declare <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32)
+
+declare <2 x i64> @llvm.arm.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double>, i32)
+
+declare <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32)
+
+declare <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32)
+
+declare <2 x i64> @llvm.arm.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double>, i32)
+
+define <1 x i64> @test_vcvt_n_s64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvt_n_s64_f64
+; CHECK: fcvtzs d{{[0-9]+}}, d{{[0-9]+}}, #64
+ %1 = tail call <1 x i64> @llvm.arm.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> %a, i32 64)
+ ret <1 x i64> %1
+}
+
+define <1 x i64> @test_vcvt_n_u64_f64(<1 x double> %a) {
+; CHECK-LABEL: test_vcvt_n_u64_f64
+; CHECK: fcvtzu d{{[0-9]+}}, d{{[0-9]+}}, #64
+ %1 = tail call <1 x i64> @llvm.arm.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> %a, i32 64)
+ ret <1 x i64> %1
+}
+
+define <1 x double> @test_vcvt_n_f64_s64(<1 x i64> %a) {
+; CHECK-LABEL: test_vcvt_n_f64_s64
+; CHECK: scvtf d{{[0-9]+}}, d{{[0-9]+}}, #64
+ %1 = tail call <1 x double> @llvm.arm.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> %a, i32 64)
+ ret <1 x double> %1
+}
+
+define <1 x double> @test_vcvt_n_f64_u64(<1 x i64> %a) {
+; CHECK-LABEL: test_vcvt_n_f64_u64
+; CHECK: ucvtf d{{[0-9]+}}, d{{[0-9]+}}, #64
+ %1 = tail call <1 x double> @llvm.arm.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> %a, i32 64)
+ ret <1 x double> %1
+}
+
+declare <1 x i64> @llvm.arm.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double>, i32)
+declare <1 x i64> @llvm.arm.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double>, i32)
+declare <1 x double> @llvm.arm.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64>, i32)
+declare <1 x double> @llvm.arm.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64>, i32) \ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-simd-tbl.ll b/test/CodeGen/AArch64/neon-simd-tbl.ll
new file mode 100644
index 0000000..8eac1e8
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-simd-tbl.ll
@@ -0,0 +1,828 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+declare <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8>, <16 x i8>, <8 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8>, <16 x i8>, <8 x i8>)
+
+declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8>, <8 x i8>)
+
+declare <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
+
+declare <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8>, <16 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
+
+declare <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>)
+
+define <8 x i8> @test_vtbl1_s8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtbl1_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
+ ret <8 x i8> %vtbl11.i
+}
+
+define <8 x i8> @test_vqtbl1_s8(<16 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vqtbl1_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %a, <8 x i8> %b)
+ ret <8 x i8> %vtbl1.i
+}
+
+define <8 x i8> @test_vtbl2_s8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl2_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1
+ %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
+ ret <8 x i8> %vtbl17.i
+}
+
+define <8 x i8> @test_vqtbl2_s8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl2_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
+ %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b)
+ ret <8 x i8> %vtbl2.i
+}
+
+define <8 x i8> @test_vtbl3_s8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl3_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 1
+ %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2
+ %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b)
+ ret <8 x i8> %vtbl212.i
+}
+
+define <8 x i8> @test_vqtbl3_s8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl3_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
+ %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
+ %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b)
+ ret <8 x i8> %vtbl3.i
+}
+
+define <8 x i8> @test_vtbl4_s8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl4_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 1
+ %__a.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 2
+ %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3
+ %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b)
+ ret <8 x i8> %vtbl216.i
+}
+
+define <8 x i8> @test_vqtbl4_s8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl4_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
+ %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
+ %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
+ %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b)
+ ret <8 x i8> %vtbl4.i
+}
+
+define <16 x i8> @test_vqtbl1q_s8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vqtbl1q_s8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b)
+ ret <16 x i8> %vtbl1.i
+}
+
+define <16 x i8> @test_vqtbl2q_s8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl2q_s8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
+ %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b)
+ ret <16 x i8> %vtbl2.i
+}
+
+define <16 x i8> @test_vqtbl3q_s8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl3q_s8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
+ %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
+ %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b)
+ ret <16 x i8> %vtbl3.i
+}
+
+define <16 x i8> @test_vqtbl4q_s8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl4q_s8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
+ %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
+ %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
+ %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b)
+ ret <16 x i8> %vtbl4.i
+}
+
+define <8 x i8> @test_vtbx1_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vtbx1_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %c)
+ %0 = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+ %1 = sext <8 x i1> %0 to <8 x i8>
+ %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i)
+ ret <8 x i8> %vbsl.i
+}
+
+define <8 x i8> @test_vtbx2_s8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx2_s8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1
+ %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c)
+ ret <8 x i8> %vtbx17.i
+}
+
+define <8 x i8> @test_vtbx3_s8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx3_s8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 1
+ %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2
+ %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c)
+ %0 = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
+ %1 = sext <8 x i1> %0 to <8 x i8>
+ %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i)
+ ret <8 x i8> %vbsl.i
+}
+
+define <8 x i8> @test_vtbx4_s8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx4_s8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 1
+ %__b.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 2
+ %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3
+ %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c)
+ ret <8 x i8> %vtbx216.i
+}
+
+define <8 x i8> @test_vqtbx1_s8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vqtbx1_s8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c)
+ ret <8 x i8> %vtbx1.i
+}
+
+define <8 x i8> @test_vqtbx2_s8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx2_s8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
+ %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c)
+ ret <8 x i8> %vtbx2.i
+}
+
+define <8 x i8> @test_vqtbx3_s8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx3_s8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
+ %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
+ %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c)
+ ret <8 x i8> %vtbx3.i
+}
+
+define <8 x i8> @test_vqtbx4_s8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx4_s8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
+ %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
+ %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
+ %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c)
+ ret <8 x i8> %vtbx4.i
+}
+
+define <16 x i8> @test_vqtbx1q_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK: test_vqtbx1q_s8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
+ ret <16 x i8> %vtbx1.i
+}
+
+define <16 x i8> @test_vqtbx2q_s8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx2q_s8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
+ %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c)
+ ret <16 x i8> %vtbx2.i
+}
+
+define <16 x i8> @test_vqtbx3q_s8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx3q_s8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
+ %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
+ %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c)
+ ret <16 x i8> %vtbx3.i
+}
+
+define <16 x i8> @test_vqtbx4q_s8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx4q_s8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
+ %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
+ %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
+ %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c)
+ ret <16 x i8> %vtbx4.i
+}
+
+define <8 x i8> @test_vtbl1_u8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtbl1_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
+ ret <8 x i8> %vtbl11.i
+}
+
+define <8 x i8> @test_vqtbl1_u8(<16 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vqtbl1_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %a, <8 x i8> %b)
+ ret <8 x i8> %vtbl1.i
+}
+
+define <8 x i8> @test_vtbl2_u8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl2_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1
+ %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
+ ret <8 x i8> %vtbl17.i
+}
+
+define <8 x i8> @test_vqtbl2_u8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl2_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
+ %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b)
+ ret <8 x i8> %vtbl2.i
+}
+
+define <8 x i8> @test_vtbl3_u8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl3_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 1
+ %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2
+ %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b)
+ ret <8 x i8> %vtbl212.i
+}
+
+define <8 x i8> @test_vqtbl3_u8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl3_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
+ %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
+ %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b)
+ ret <8 x i8> %vtbl3.i
+}
+
+define <8 x i8> @test_vtbl4_u8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl4_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 1
+ %__a.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 2
+ %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3
+ %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b)
+ ret <8 x i8> %vtbl216.i
+}
+
+define <8 x i8> @test_vqtbl4_u8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl4_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
+ %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
+ %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
+ %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b)
+ ret <8 x i8> %vtbl4.i
+}
+
+define <16 x i8> @test_vqtbl1q_u8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vqtbl1q_u8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b)
+ ret <16 x i8> %vtbl1.i
+}
+
+define <16 x i8> @test_vqtbl2q_u8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl2q_u8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
+ %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b)
+ ret <16 x i8> %vtbl2.i
+}
+
+define <16 x i8> @test_vqtbl3q_u8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl3q_u8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
+ %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
+ %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b)
+ ret <16 x i8> %vtbl3.i
+}
+
+define <16 x i8> @test_vqtbl4q_u8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl4q_u8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
+ %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
+ %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
+ %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b)
+ ret <16 x i8> %vtbl4.i
+}
+
+define <8 x i8> @test_vtbx1_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vtbx1_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %c)
+ %0 = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+ %1 = sext <8 x i1> %0 to <8 x i8>
+ %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i)
+ ret <8 x i8> %vbsl.i
+}
+
+define <8 x i8> @test_vtbx2_u8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx2_u8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1
+ %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c)
+ ret <8 x i8> %vtbx17.i
+}
+
+define <8 x i8> @test_vtbx3_u8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx3_u8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 1
+ %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2
+ %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c)
+ %0 = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
+ %1 = sext <8 x i1> %0 to <8 x i8>
+ %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i)
+ ret <8 x i8> %vbsl.i
+}
+
+define <8 x i8> @test_vtbx4_u8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx4_u8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 1
+ %__b.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 2
+ %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3
+ %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c)
+ ret <8 x i8> %vtbx216.i
+}
+
+define <8 x i8> @test_vqtbx1_u8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vqtbx1_u8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c)
+ ret <8 x i8> %vtbx1.i
+}
+
+define <8 x i8> @test_vqtbx2_u8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx2_u8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
+ %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c)
+ ret <8 x i8> %vtbx2.i
+}
+
+define <8 x i8> @test_vqtbx3_u8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx3_u8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
+ %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
+ %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c)
+ ret <8 x i8> %vtbx3.i
+}
+
+define <8 x i8> @test_vqtbx4_u8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx4_u8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
+ %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
+ %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
+ %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c)
+ ret <8 x i8> %vtbx4.i
+}
+
+define <16 x i8> @test_vqtbx1q_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK: test_vqtbx1q_u8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
+ ret <16 x i8> %vtbx1.i
+}
+
+define <16 x i8> @test_vqtbx2q_u8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx2q_u8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
+ %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c)
+ ret <16 x i8> %vtbx2.i
+}
+
+define <16 x i8> @test_vqtbx3q_u8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx3q_u8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
+ %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
+ %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c)
+ ret <16 x i8> %vtbx3.i
+}
+
+define <16 x i8> @test_vqtbx4q_u8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx4q_u8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
+ %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
+ %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
+ %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c)
+ ret <16 x i8> %vtbx4.i
+}
+
+define <8 x i8> @test_vtbl1_p8(<8 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vtbl1_p8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %vtbl1.i = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
+ ret <8 x i8> %vtbl11.i
+}
+
+define <8 x i8> @test_vqtbl1_p8(<16 x i8> %a, <8 x i8> %b) {
+; CHECK: test_vqtbl1_p8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %vtbl1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %a, <8 x i8> %b)
+ ret <8 x i8> %vtbl1.i
+}
+
+define <8 x i8> @test_vtbl2_p8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl2_p8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %a.coerce, 1
+ %vtbl1.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %b)
+ ret <8 x i8> %vtbl17.i
+}
+
+define <8 x i8> @test_vqtbl2_p8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl2_p8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
+ %vtbl2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <8 x i8> %b)
+ ret <8 x i8> %vtbl2.i
+}
+
+define <8 x i8> @test_vtbl3_p8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl3_p8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 1
+ %__a.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %a.coerce, 2
+ %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl211.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %b)
+ ret <8 x i8> %vtbl212.i
+}
+
+define <8 x i8> @test_vqtbl3_p8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl3_p8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
+ %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
+ %vtbl3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl3.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %b)
+ ret <8 x i8> %vtbl3.i
+}
+
+define <8 x i8> @test_vtbl4_p8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vtbl4_p8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 1
+ %__a.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 2
+ %__a.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %a.coerce, 3
+ %vtbl2.i = shufflevector <8 x i8> %__a.coerce.fca.0.extract.i, <8 x i8> %__a.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl215.i = shufflevector <8 x i8> %__a.coerce.fca.2.extract.i, <8 x i8> %__a.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl215.i, <8 x i8> %b)
+ ret <8 x i8> %vtbl216.i
+}
+
+define <8 x i8> @test_vqtbl4_p8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) {
+; CHECK: test_vqtbl4_p8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
+ %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
+ %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
+ %vtbl4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl4.v8i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <8 x i8> %b)
+ ret <8 x i8> %vtbl4.i
+}
+
+define <16 x i8> @test_vqtbl1q_p8(<16 x i8> %a, <16 x i8> %b) {
+; CHECK: test_vqtbl1q_p8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %vtbl1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b)
+ ret <16 x i8> %vtbl1.i
+}
+
+define <16 x i8> @test_vqtbl2q_p8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl2q_p8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %a.coerce, 1
+ %vtbl2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl2.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %b)
+ ret <16 x i8> %vtbl2.i
+}
+
+define <16 x i8> @test_vqtbl3q_p8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl3q_p8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 1
+ %__a.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %a.coerce, 2
+ %vtbl3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl3.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %b)
+ ret <16 x i8> %vtbl3.i
+}
+
+define <16 x i8> @test_vqtbl4q_p8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) {
+; CHECK: test_vqtbl4q_p8:
+; CHECK: tbl {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %__a.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 0
+ %__a.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 1
+ %__a.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 2
+ %__a.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %a.coerce, 3
+ %vtbl4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbl4.v16i8.v16i8(<16 x i8> %__a.coerce.fca.0.extract.i, <16 x i8> %__a.coerce.fca.1.extract.i, <16 x i8> %__a.coerce.fca.2.extract.i, <16 x i8> %__a.coerce.fca.3.extract.i, <16 x i8> %b)
+ ret <16 x i8> %vtbl4.i
+}
+
+define <8 x i8> @test_vtbx1_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vtbx1_p8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %vtbl1.i = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl11.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl1.v8i8.v16i8(<16 x i8> %vtbl1.i, <8 x i8> %c)
+ %0 = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+ %1 = sext <8 x i1> %0 to <8 x i8>
+ %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl11.i)
+ ret <8 x i8> %vbsl.i
+}
+
+define <8 x i8> @test_vtbx2_p8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx2_p8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [2 x <8 x i8>] %b.coerce, 1
+ %vtbx1.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbx17.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx1.i, <8 x i8> %c)
+ ret <8 x i8> %vtbx17.i
+}
+
+define <8 x i8> @test_vtbx3_p8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx3_p8:
+; CHECK: tbl {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 1
+ %__b.coerce.fca.2.extract.i = extractvalue [3 x <8 x i8>] %b.coerce, 2
+ %vtbl2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl211.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbl212.i = tail call <8 x i8> @llvm.aarch64.neon.vtbl2.v8i8.v16i8(<16 x i8> %vtbl2.i, <16 x i8> %vtbl211.i, <8 x i8> %c)
+ %0 = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
+ %1 = sext <8 x i1> %0 to <8 x i8>
+ %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %1, <8 x i8> %a, <8 x i8> %vtbl212.i)
+ ret <8 x i8> %vbsl.i
+}
+
+define <8 x i8> @test_vtbx4_p8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vtbx4_p8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 1
+ %__b.coerce.fca.2.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 2
+ %__b.coerce.fca.3.extract.i = extractvalue [4 x <8 x i8>] %b.coerce, 3
+ %vtbx2.i = shufflevector <8 x i8> %__b.coerce.fca.0.extract.i, <8 x i8> %__b.coerce.fca.1.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbx215.i = shufflevector <8 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %__b.coerce.fca.3.extract.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vtbx216.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %vtbx2.i, <16 x i8> %vtbx215.i, <8 x i8> %c)
+ ret <8 x i8> %vtbx216.i
+}
+
+define <8 x i8> @test_vqtbx1_p8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) {
+; CHECK: test_vqtbx1_p8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %vtbx1.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx1.v8i8.v16i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c)
+ ret <8 x i8> %vtbx1.i
+}
+
+define <8 x i8> @test_vqtbx2_p8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx2_p8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
+ %vtbx2.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx2.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <8 x i8> %c)
+ ret <8 x i8> %vtbx2.i
+}
+
+define <8 x i8> @test_vqtbx3_p8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx3_p8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
+ %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
+ %vtbx3.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx3.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <8 x i8> %c)
+ ret <8 x i8> %vtbx3.i
+}
+
+define <8 x i8> @test_vqtbx4_p8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) {
+; CHECK: test_vqtbx4_p8:
+; CHECK: tbx {{v[0-9]+}}.8b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.8b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
+ %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
+ %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
+ %vtbx4.i = tail call <8 x i8> @llvm.aarch64.neon.vtbx4.v8i8.v16i8(<8 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <8 x i8> %c)
+ ret <8 x i8> %vtbx4.i
+}
+
+define <16 x i8> @test_vqtbx1q_p8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK: test_vqtbx1q_p8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %vtbx1.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx1.v16i8.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
+ ret <16 x i8> %vtbx1.i
+}
+
+define <16 x i8> @test_vqtbx2q_p8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx2q_p8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [2 x <16 x i8>] %b.coerce, 1
+ %vtbx2.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx2.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %c)
+ ret <16 x i8> %vtbx2.i
+}
+
+define <16 x i8> @test_vqtbx3q_p8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx3q_p8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 1
+ %__b.coerce.fca.2.extract.i = extractvalue [3 x <16 x i8>] %b.coerce, 2
+ %vtbx3.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx3.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %c)
+ ret <16 x i8> %vtbx3.i
+}
+
+define <16 x i8> @test_vqtbx4q_p8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) {
+; CHECK: test_vqtbx4q_p8:
+; CHECK: tbx {{v[0-9]+}}.16b, {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, {{v[0-9]+}}.16b
+entry:
+ %__b.coerce.fca.0.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 0
+ %__b.coerce.fca.1.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 1
+ %__b.coerce.fca.2.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 2
+ %__b.coerce.fca.3.extract.i = extractvalue [4 x <16 x i8>] %b.coerce, 3
+ %vtbx4.i = tail call <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8.v16i8(<16 x i8> %a, <16 x i8> %__b.coerce.fca.0.extract.i, <16 x i8> %__b.coerce.fca.1.extract.i, <16 x i8> %__b.coerce.fca.2.extract.i, <16 x i8> %__b.coerce.fca.3.extract.i, <16 x i8> %c)
+ ret <16 x i8> %vtbx4.i
+}
+
diff --git a/test/CodeGen/AArch64/neon-simd-vget.ll b/test/CodeGen/AArch64/neon-simd-vget.ll
new file mode 100644
index 0000000..6474499
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-simd-vget.ll
@@ -0,0 +1,225 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <8 x i8> @test_vget_high_s8(<16 x i8> %a) {
+; CHECK-LABEL: test_vget_high_s8:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vget_high_s16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_high_s16:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vget_high_s32(<4 x i32> %a) {
+; CHECK-LABEL: test_vget_high_s32:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ ret <2 x i32> %shuffle.i
+}
+
+define <1 x i64> @test_vget_high_s64(<2 x i64> %a) {
+; CHECK-LABEL: test_vget_high_s64:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+ %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1>
+ ret <1 x i64> %shuffle.i
+}
+
+define <8 x i8> @test_vget_high_u8(<16 x i8> %a) {
+; CHECK-LABEL: test_vget_high_u8:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vget_high_u16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_high_u16:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vget_high_u32(<4 x i32> %a) {
+; CHECK-LABEL: test_vget_high_u32:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+ ret <2 x i32> %shuffle.i
+}
+
+define <1 x i64> @test_vget_high_u64(<2 x i64> %a) {
+; CHECK-LABEL: test_vget_high_u64:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+ %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1>
+ ret <1 x i64> %shuffle.i
+}
+
+define <1 x i64> @test_vget_high_p64(<2 x i64> %a) {
+; CHECK-LABEL: test_vget_high_p64:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+ %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1>
+ ret <1 x i64> %shuffle.i
+}
+
+define <4 x i16> @test_vget_high_f16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_high_f16:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <2 x float> @test_vget_high_f32(<4 x float> %a) {
+; CHECK-LABEL: test_vget_high_f32:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+ %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+ ret <2 x float> %shuffle.i
+}
+
+define <8 x i8> @test_vget_high_p8(<16 x i8> %a) {
+; CHECK-LABEL: test_vget_high_p8:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <8 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vget_high_p16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_high_p16:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ ret <4 x i16> %shuffle.i
+}
+
+define <1 x double> @test_vget_high_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vget_high_f64:
+; CHECK: dup d0, {{v[0-9]+}}.d[1]
+entry:
+ %shuffle.i = shufflevector <2 x double> %a, <2 x double> undef, <1 x i32> <i32 1>
+ ret <1 x double> %shuffle.i
+}
+
+define <8 x i8> @test_vget_low_s8(<16 x i8> %a) {
+; CHECK-LABEL: test_vget_low_s8:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vget_low_s16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_low_s16:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vget_low_s32(<4 x i32> %a) {
+; CHECK-LABEL: test_vget_low_s32:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ ret <2 x i32> %shuffle.i
+}
+
+define <1 x i64> @test_vget_low_s64(<2 x i64> %a) {
+; CHECK-LABEL: test_vget_low_s64:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer
+ ret <1 x i64> %shuffle.i
+}
+
+define <8 x i8> @test_vget_low_u8(<16 x i8> %a) {
+; CHECK-LABEL: test_vget_low_u8:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vget_low_u16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_low_u16:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vget_low_u32(<4 x i32> %a) {
+; CHECK-LABEL: test_vget_low_u32:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+ ret <2 x i32> %shuffle.i
+}
+
+define <1 x i64> @test_vget_low_u64(<2 x i64> %a) {
+; CHECK-LABEL: test_vget_low_u64:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer
+ ret <1 x i64> %shuffle.i
+}
+
+define <1 x i64> @test_vget_low_p64(<2 x i64> %a) {
+; CHECK-LABEL: test_vget_low_p64:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer
+ ret <1 x i64> %shuffle.i
+}
+
+define <4 x i16> @test_vget_low_f16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_low_f16:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i16> %shuffle.i
+}
+
+define <2 x float> @test_vget_low_f32(<4 x float> %a) {
+; CHECK-LABEL: test_vget_low_f32:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+ ret <2 x float> %shuffle.i
+}
+
+define <8 x i8> @test_vget_low_p8(<16 x i8> %a) {
+; CHECK-LABEL: test_vget_low_p8:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vget_low_p16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_low_p16:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i16> %shuffle.i
+}
+
+define <1 x double> @test_vget_low_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vget_low_f64:
+; CHECK: ret
+entry:
+ %shuffle.i = shufflevector <2 x double> %a, <2 x double> undef, <1 x i32> zeroinitializer
+ ret <1 x double> %shuffle.i
+}
diff --git a/test/CodeGen/AArch64/regress-fp128-livein.ll b/test/CodeGen/AArch64/regress-fp128-livein.ll
new file mode 100644
index 0000000..cb8432a
--- /dev/null
+++ b/test/CodeGen/AArch64/regress-fp128-livein.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s
+
+; Regression test for NZCV reg live-in not being added to fp128csel IfTrue BB,
+; causing a crash during live range calc.
+define void @fp128_livein(i64 %a) {
+ %tobool = icmp ne i64 %a, 0
+ %conv = zext i1 %tobool to i32
+ %conv2 = sitofp i32 %conv to fp128
+ %conv6 = sitofp i32 %conv to double
+ %call3 = tail call i32 @g(fp128 %conv2)
+ %call8 = tail call i32 @h(double %conv6)
+ ret void
+}
+
+declare i32 @f()
+declare i32 @g(fp128)
+declare i32 @h(double)
diff --git a/test/CodeGen/AArch64/returnaddr.ll b/test/CodeGen/AArch64/returnaddr.ll
new file mode 100644
index 0000000..c85f9ec
--- /dev/null
+++ b/test/CodeGen/AArch64/returnaddr.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+
+define i8* @rt0(i32 %x) nounwind readnone {
+entry:
+; CHECK-LABEL: rt0:
+; CHECK: mov x0, x30
+ %0 = tail call i8* @llvm.returnaddress(i32 0)
+ ret i8* %0
+}
+
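+; For a non-zero depth the return address is recovered by walking the chain of
+; saved frame pointers: each AArch64 frame record holds the previous FP at
+; [x29] and the saved LR at offset #8, which is what the loads below expect.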
+define i8* @rt2() nounwind readnone {
+entry:
+; CHECK-LABEL: rt2:
+; CHECK: ldr x[[reg:[0-9]+]], [x29]
+; CHECK: ldr x[[reg]], [x[[reg]]]
+; CHECK: ldr x0, [x[[reg]], #8]
+ %0 = tail call i8* @llvm.returnaddress(i32 2)
+ ret i8* %0
+}
+
+declare i8* @llvm.returnaddress(i32) nounwind readnone
diff --git a/test/CodeGen/AArch64/tls-dynamics.ll b/test/CodeGen/AArch64/tls-dynamics.ll
index 887d2f8..68c481c 100644
--- a/test/CodeGen/AArch64/tls-dynamics.ll
+++ b/test/CodeGen/AArch64/tls-dynamics.ll
@@ -10,8 +10,8 @@ define i32 @test_generaldynamic() {
ret i32 %val
; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:general_dynamic_var
-; CHECK: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var
-; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var]
+; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var
+; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var]
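+; The add that materialises x0 and the load of the descriptor function are
+; independent, so they may be emitted in either order; hence CHECK-DAG.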
; CHECK: .tlsdesccall general_dynamic_var
; CHECK-NEXT: blr [[CALLEE]]
@@ -19,8 +19,8 @@ define i32 @test_generaldynamic() {
; CHECK: ldr w0, [x[[TP]], x0]
; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
-; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
-; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC
; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
}
@@ -31,8 +31,8 @@ define i32* @test_generaldynamic_addr() {
ret i32* @general_dynamic_var
; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:general_dynamic_var
-; CHECK: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var
-; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var]
+; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var
+; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:general_dynamic_var]
; CHECK: .tlsdesccall general_dynamic_var
; CHECK-NEXT: blr [[CALLEE]]
@@ -40,8 +40,8 @@ define i32* @test_generaldynamic_addr() {
; CHECK: add x0, [[TP]], x0
; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
-; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
-; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC
; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
}
@@ -55,8 +55,8 @@ define i32 @test_localdynamic() {
ret i32 %val
; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
-; CHECK: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_
-; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_]
+; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_
+; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_]
; CHECK: .tlsdesccall _TLS_MODULE_BASE_
; CHECK-NEXT: blr [[CALLEE]]
@@ -66,8 +66,8 @@ define i32 @test_localdynamic() {
; CHECK: ldr w0, [x0, [[DTP_OFFSET]]]
; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
-; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
-; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC
; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
}
@@ -78,8 +78,8 @@ define i32* @test_localdynamic_addr() {
ret i32* @local_dynamic_var
; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
-; CHECK: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_
-; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_]
+; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_
+; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_]
; CHECK: .tlsdesccall _TLS_MODULE_BASE_
; CHECK-NEXT: blr [[CALLEE]]
@@ -89,8 +89,8 @@ define i32* @test_localdynamic_addr() {
; CHECK: add x0, x0, [[DTP_OFFSET]]
; CHECK-RELOC: R_AARCH64_TLSDESC_ADR_PAGE
-; CHECK-RELOC: R_AARCH64_TLSDESC_ADD_LO12_NC
-; CHECK-RELOC: R_AARCH64_TLSDESC_LD64_LO12_NC
+; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_ADD_LO12_NC
+; CHECK-RELOC-DAG: R_AARCH64_TLSDESC_LD64_LO12_NC
; CHECK-RELOC: R_AARCH64_TLSDESC_CALL
}
@@ -110,8 +110,8 @@ define i32 @test_localdynamic_deduplicate() {
ret i32 %sum
; CHECK: adrp x[[TLSDESC_HI:[0-9]+]], :tlsdesc:_TLS_MODULE_BASE_
-; CHECK: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_
-; CHECK: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_]
+; CHECK-DAG: add x0, x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_
+; CHECK-DAG: ldr [[CALLEE:x[0-9]+]], [x[[TLSDESC_HI]], #:tlsdesc_lo12:_TLS_MODULE_BASE_]
; CHECK: .tlsdesccall _TLS_MODULE_BASE_
; CHECK-NEXT: blr [[CALLEE]]
diff --git a/test/CodeGen/AArch64/variadic.ll b/test/CodeGen/AArch64/variadic.ll
index 1c7e3bf..f3d376b 100644
--- a/test/CodeGen/AArch64/variadic.ll
+++ b/test/CodeGen/AArch64/variadic.ll
@@ -1,4 +1,5 @@
; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 < %s | FileCheck --check-prefix=CHECK-NOFP %s
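+; With the FP/SIMD registers disabled (-fp-armv8) no Q registers are dumped to
+; the register save area, so the CHECK-NOFP lines expect no q-register stores
+; and a zero __vr_offs field in the va_list.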
%va_list = type {i8*, i8*, i8*, i32, i32}
@@ -9,19 +10,28 @@ declare void @llvm.va_start(i8*)
define void @test_simple(i32 %n, ...) {
; CHECK-LABEL: test_simple:
; CHECK: sub sp, sp, #[[STACKSIZE:[0-9]+]]
+; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
; CHECK: mov x[[FPRBASE:[0-9]+]], sp
; CHECK: str q7, [x[[FPRBASE]], #112]
; CHECK: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]]
; CHECK: str x7, [x[[GPRBASE]], #48]
+; CHECK-NOFP: sub sp, sp, #[[STACKSIZE:[0-9]+]]
+; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
+; CHECK-NOFP: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]]
+; CHECK-NOFP: str x7, [x[[GPRBASE]], #48]
+; CHECK-NOFP-NOT: str q7,
+; CHECK-NOFP: str x1, [sp, #[[GPRFROMSP]]]
+
; Omit the middle ones
; CHECK: str q0, [sp]
; CHECK: str x1, [sp, #[[GPRFROMSP]]]
+; CHECK-NOFP-NOT: str q0, [sp]
+
%addr = bitcast %va_list* @var to i8*
call void @llvm.va_start(i8* %addr)
-; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
; CHECK: movn [[VR_OFFS:w[0-9]+]], #127
; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28]
; CHECK: movn [[GR_OFFS:w[0-9]+]], #55
@@ -33,6 +43,14 @@ define void @test_simple(i32 %n, ...) {
; CHECK: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]]
; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
+; CHECK-NOFP: str wzr, [x[[VA_LIST]], #28]
+; CHECK-NOFP: movn [[GR_OFFS:w[0-9]+]], #55
+; CHECK-NOFP: str [[GR_OFFS]], [x[[VA_LIST]], #24]
+; CHECK-NOFP: add [[GR_TOP:x[0-9]+]], x[[GPRBASE]], #56
+; CHECK-NOFP: str [[GR_TOP]], [x[[VA_LIST]], #8]
+; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]]
+; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
+
ret void
}
@@ -44,11 +62,19 @@ define void @test_fewargs(i32 %n, i32 %n1, i32 %n2, float %m, ...) {
; CHECK: add x[[GPRBASE:[0-9]+]], sp, #[[GPRFROMSP:[0-9]+]]
; CHECK: str x7, [x[[GPRBASE]], #32]
+; CHECK-NOFP: sub sp, sp, #[[STACKSIZE:[0-9]+]]
+; CHECK-NOFP-NOT: str q7,
+; CHECK-NOFP: mov x[[GPRBASE:[0-9]+]], sp
+; CHECK-NOFP: str x7, [x[[GPRBASE]], #24]
+
; Omit the middle ones
; CHECK: str q1, [sp]
; CHECK: str x3, [sp, #[[GPRFROMSP]]]
+; CHECK-NOFP-NOT: str q1, [sp]
+; CHECK-NOFP: str x4, [sp]
+
%addr = bitcast %va_list* @var to i8*
call void @llvm.va_start(i8* %addr)
; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
@@ -63,6 +89,15 @@ define void @test_fewargs(i32 %n, i32 %n1, i32 %n2, float %m, ...) {
; CHECK: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]]
; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
+; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
+; CHECK-NOFP: str wzr, [x[[VA_LIST]], #28]
+; CHECK-NOFP: movn [[GR_OFFS:w[0-9]+]], #31
+; CHECK-NOFP: str [[GR_OFFS]], [x[[VA_LIST]], #24]
+; CHECK-NOFP: add [[GR_TOP:x[0-9]+]], x[[GPRBASE]], #32
+; CHECK-NOFP: str [[GR_TOP]], [x[[VA_LIST]], #8]
+; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #[[STACKSIZE]]
+; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
+
ret void
}
@@ -75,6 +110,9 @@ define void @test_nospare([8 x i64], [8 x float], ...) {
; CHECK: mov [[STACK:x[0-9]+]], sp
; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
+; CHECK-NOFP-NOT: sub sp, sp
+; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #64
+; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
ret void
}
@@ -87,6 +125,10 @@ define void @test_offsetstack([10 x i64], [3 x float], ...) {
; CHECK: str q7, [x[[FPRBASE]], #64]
; CHECK-NOT: str x{{[0-9]+}},
+
+; CHECK-NOFP-NOT: str q7,
+; CHECK-NOFP-NOT: str x7,
+
; Omit the middle ones
; CHECK: str q3, [sp]
@@ -102,6 +144,11 @@ define void @test_offsetstack([10 x i64], [3 x float], ...) {
; CHECK: add [[STACK:x[0-9]+]], sp, #96
; CHECK: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
+; CHECK-NOFP: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
+; CHECK-NOFP: add [[STACK:x[0-9]+]], sp, #40
+; CHECK-NOFP: str [[STACK]], [{{x[0-9]+}}, #:lo12:var]
+; CHECK-NOFP: str wzr, [x[[VA_LIST]], #28]
+; CHECK-NOFP: str wzr, [x[[VA_LIST]], #24]
ret void
}
@@ -110,12 +157,14 @@ declare void @llvm.va_end(i8*)
define void @test_va_end() nounwind {
; CHECK-LABEL: test_va_end:
; CHECK-NEXT: BB#0
+; CHECK-NOFP: BB#0
%addr = bitcast %va_list* @var to i8*
call void @llvm.va_end(i8* %addr)
ret void
; CHECK-NEXT: ret
+; CHECK-NOFP-NEXT: ret
}
declare void @llvm.va_copy(i8* %dest, i8* %src)
@@ -131,14 +180,25 @@ define void @test_va_copy() {
; Check beginning and end again:
; CHECK: ldr [[BLOCK:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var]
+; CHECK: add x[[SRC_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
+; CHECK-NOFP: ldr [[BLOCK:x[0-9]+]], [{{x[0-9]+}}, #:lo12:var]
+; CHECK-NOFP: add x[[SRC_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
+
; CHECK: str [[BLOCK]], [{{x[0-9]+}}, #:lo12:second_list]
+; CHECK: ldr [[BLOCK:x[0-9]+]], [x[[SRC_LIST]], #24]
; CHECK: add x[[DEST_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:second_list
-; CHECK: add x[[SRC_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:var
-; CHECK: ldr [[BLOCK:x[0-9]+]], [x[[SRC_LIST]], #24]
; CHECK: str [[BLOCK]], [x[[DEST_LIST]], #24]
+; CHECK-NOFP: str [[BLOCK]], [{{x[0-9]+}}, #:lo12:second_list]
+
+; CHECK-NOFP: ldr [[BLOCK:x[0-9]+]], [x[[SRC_LIST]], #24]
+; CHECK-NOFP: add x[[DEST_LIST:[0-9]+]], {{x[0-9]+}}, #:lo12:second_list
+
+; CHECK-NOFP: str [[BLOCK]], [x[[DEST_LIST]], #24]
+
ret void
; CHECK: ret
+; CHECK-NOFP: ret
}
diff --git a/test/CodeGen/ARM/2009-10-16-Scope.ll b/test/CodeGen/ARM/2009-10-16-Scope.ll
index dd08b56..570fcf9 100644
--- a/test/CodeGen/ARM/2009-10-16-Scope.ll
+++ b/test/CodeGen/ARM/2009-10-16-Scope.ll
@@ -24,8 +24,7 @@ declare i32 @foo(i32) ssp
!0 = metadata !{i32 5, i32 2, metadata !1, null}
!1 = metadata !{i32 458763, null, metadata !2, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ]
-!2 = metadata !{i32 458798, i32 0, metadata !3, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, null, i1 false, i1 true,
- i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0}; [DW_TAG_subprogram ]
+!2 = metadata !{i32 458798, i32 0, metadata !3, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, null, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0}; [DW_TAG_subprogram ]
!3 = metadata !{i32 458769, metadata !8, i32 12, metadata !"clang 1.1", i1 true, metadata !"", i32 0, null, metadata !9, null, null, null, metadata !""}; [DW_TAG_compile_unit ]
!4 = metadata !{i32 459008, metadata !5, metadata !"count_", metadata !3, i32 5, metadata !6}; [ DW_TAG_auto_variable ]
!5 = metadata !{i32 458763, null, metadata !1, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ]
diff --git a/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll b/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll
index 89f468a..35739d7 100644
--- a/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll
+++ b/test/CodeGen/ARM/2010-04-15-ScavengerDebugValue.ll
@@ -13,12 +13,13 @@ entry:
declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!15}
!0 = metadata !{i32 524545, metadata !1, metadata !"b", metadata !2, i32 93, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
!1 = metadata !{i32 524334, metadata !12, null, metadata !"__addvsi3", metadata !"__addvsi3", metadata !"__addvsi3", i32 94, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!2 = metadata !{i32 524329, metadata !12} ; [ DW_TAG_file_type ]
!12 = metadata !{metadata !"libgcc2.c", metadata !"/Users/bwilson/local/nightly/test-2010-04-14/build/llvmgcc.roots/llvmgcc~obj/src/gcc"}
!3 = metadata !{i32 524305, metadata !12, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)", i1 true, metadata !"", i32 0, metadata !13, metadata !13, metadata !14, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 524309, metadata !12, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{i32 524309, metadata !12, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!5 = metadata !{metadata !6, metadata !6, metadata !6}
!6 = metadata !{i32 524310, metadata !12, null, metadata !"SItype", i32 152, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_typedef ]
!7 = metadata !{i32 524329, metadata !"libgcc2.h", metadata !"/Users/bwilson/local/nightly/test-2010-04-14/build/llvmgcc.roots/llvmgcc~obj/src/gcc", metadata !3} ; [ DW_TAG_file_type ]
@@ -28,3 +29,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!11 = metadata !{i32 100, i32 0, metadata !10, null}
!13 = metadata !{i32 0}
!14 = metadata !{metadata !1}
+!15 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/2010-08-04-StackVariable.ll b/test/CodeGen/ARM/2010-08-04-StackVariable.ll
index f4ad4bc..7aacd1a 100644
--- a/test/CodeGen/ARM/2010-08-04-StackVariable.ll
+++ b/test/CodeGen/ARM/2010-08-04-StackVariable.ll
@@ -75,9 +75,10 @@ return: ; preds = %entry
declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!49}
!0 = metadata !{i32 786478, metadata !48, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"", i32 11, metadata !14, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786451, metadata !48, null, metadata !"SVal", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_structure_type ]
+!1 = metadata !{i32 786451, metadata !48, null, metadata !"SVal", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ]
!2 = metadata !{i32 786473, metadata !48} ; [ DW_TAG_file_type ]
!3 = metadata !{i32 786449, metadata !48, i32 4, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !47, metadata !47, metadata !46, metadata !47, metadata !47, metadata !""} ; [ DW_TAG_compile_unit ]
!4 = metadata !{metadata !5, metadata !7, metadata !0, metadata !9}
@@ -86,18 +87,18 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!7 = metadata !{i32 786445, metadata !48, metadata !1, metadata !"Kind", i32 8, i64 32, i64 32, i64 64, i32 0, metadata !8} ; [ DW_TAG_member ]
!8 = metadata !{i32 786468, metadata !48, null, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
!9 = metadata !{i32 786478, metadata !48, metadata !1, metadata !"~SVal", metadata !"~SVal", metadata !"", i32 12, metadata !10, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!10 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!11 = metadata !{null, metadata !12, metadata !13}
!12 = metadata !{i32 786447, metadata !48, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !1} ; [ DW_TAG_pointer_type ]
!13 = metadata !{i32 786468, metadata !48, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!14 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!15 = metadata !{null, metadata !12}
!16 = metadata !{i32 786478, metadata !48, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"_ZN4SValC1Ev", i32 11, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, void (%struct.SVal*)* @_ZN4SValC1Ev, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!17 = metadata !{i32 786478, metadata !48, metadata !2, metadata !"foo", metadata !"foo", metadata !"_Z3fooi4SVal", i32 16, metadata !18, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 (i32, %struct.SVal*)* @_Z3fooi4SVal, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!18 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !19, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!18 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!19 = metadata !{metadata !13, metadata !13, metadata !1}
!20 = metadata !{i32 786478, metadata !48, metadata !2, metadata !"main", metadata !"main", metadata !"main", i32 23, metadata !21, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @main, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!21 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!21 = metadata !{i32 786453, metadata !48, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!22 = metadata !{metadata !13}
!23 = metadata !{i32 786689, metadata !17, metadata !"i", metadata !2, i32 16, metadata !13, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
!24 = metadata !{i32 16, i32 0, metadata !17, null}
@@ -125,3 +126,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!46 = metadata !{metadata !0, metadata !9, metadata !16, metadata !17, metadata !20}
!47 = metadata !{i32 0}
!48 = metadata !{metadata !"small.cc", metadata !"/Users/manav/R8248330"}
+!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/2010-09-29-mc-asm-header-test.ll b/test/CodeGen/ARM/2010-09-29-mc-asm-header-test.ll
index e6d1518..3053694 100644
--- a/test/CodeGen/ARM/2010-09-29-mc-asm-header-test.ll
+++ b/test/CodeGen/ARM/2010-09-29-mc-asm-header-test.ll
@@ -1,10 +1,79 @@
+; This tests that MC/asm header conversion is smooth and that the
+; build attributes are correct
+
+; RUN: llc < %s -mtriple=armv6-linux-gnueabi | FileCheck %s --check-prefix=V6
+; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi | FileCheck %s --check-prefix=V6M
+; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mcpu=arm1156t2f-s | FileCheck %s --check-prefix=ARM1156T2F-S
+; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi | FileCheck %s --check-prefix=V7M
; RUN: llc < %s -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=V7
; RUN: llc < %s -mtriple=armv8-linux-gnueabi | FileCheck %s --check-prefix=V8
; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi | FileCheck %s --check-prefix=Vt8
-; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mattr=+v8fp | FileCheck %s --check-prefix=V8-V8FP
-; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mattr=+neon | FileCheck %s --check-prefix=V8-NEON
-; This tests that MC/asm header conversion is smooth
-;
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mattr=-neon,-crypto | FileCheck %s --check-prefix=V8-FPARMv8
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mattr=-fp-armv8,-crypto | FileCheck %s --check-prefix=V8-NEON
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mattr=-crypto | FileCheck %s --check-prefix=V8-FPARMv8-NEON
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi | FileCheck %s --check-prefix=V8-FPARMv8-NEON-CRYPTO
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-A9-SOFT
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard | FileCheck %s --check-prefix=CORTEX-A9-HARD
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9-mp | FileCheck %s --check-prefix=CORTEX-A9-MP
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a15 | FileCheck %s --check-prefix=CORTEX-A15
+; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0 | FileCheck %s --check-prefix=CORTEX-M0
+; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-M4-SOFT
+; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=hard | FileCheck %s --check-prefix=CORTEX-M4-HARD
+; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r5 | FileCheck %s --check-prefix=CORTEX-R5
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 | FileCheck %s --check-prefix=CORTEX-A53
+; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a57 | FileCheck %s --check-prefix=CORTEX-A57
+
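+; The attribute numbers below are ARM EABI build attribute tags, e.g.
+; 6 = Tag_CPU_arch, 7 = Tag_CPU_arch_profile, 8/9 = ARM/Thumb ISA use,
+; 12 = Tag_Advanced_SIMD_arch, 27 = Tag_ABI_HardFP_use, 28 = Tag_ABI_VFP_args,
+; 36 = Tag_FP_HP_extension, 42 = Tag_MPextension_use, 44 = Tag_DIV_use and
+; 68 = Tag_Virtualization_use.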
+; V6: .eabi_attribute 6, 6
+; V6: .eabi_attribute 8, 1
+; V6: .eabi_attribute 24, 1
+; V6: .eabi_attribute 25, 1
+; V6-NOT: .eabi_attribute 27
+; V6-NOT: .eabi_attribute 28
+; V6-NOT: .eabi_attribute 36
+; V6-NOT: .eabi_attribute 42
+; V6-NOT: .eabi_attribute 68
+
+; V6M: .eabi_attribute 6, 12
+; V6M: .eabi_attribute 7, 77
+; V6M: .eabi_attribute 8, 0
+; V6M: .eabi_attribute 9, 1
+; V6M: .eabi_attribute 24, 1
+; V6M: .eabi_attribute 25, 1
+; V6M-NOT: .eabi_attribute 27
+; V6M-NOT: .eabi_attribute 28
+; V6M-NOT: .eabi_attribute 36
+; V6M-NOT: .eabi_attribute 42
+; V6M-NOT: .eabi_attribute 68
+
+; ARM1156T2F-S: .cpu arm1156t2f-s
+; ARM1156T2F-S: .eabi_attribute 6, 8
+; ARM1156T2F-S: .eabi_attribute 8, 1
+; ARM1156T2F-S: .eabi_attribute 9, 2
+; ARM1156T2F-S: .fpu vfpv2
+; ARM1156T2F-S: .eabi_attribute 20, 1
+; ARM1156T2F-S: .eabi_attribute 21, 1
+; ARM1156T2F-S: .eabi_attribute 23, 3
+; ARM1156T2F-S: .eabi_attribute 24, 1
+; ARM1156T2F-S: .eabi_attribute 25, 1
+; ARM1156T2F-S-NOT: .eabi_attribute 27
+; ARM1156T2F-S-NOT: .eabi_attribute 28
+; ARM1156T2F-S-NOT: .eabi_attribute 36
+; ARM1156T2F-S-NOT: .eabi_attribute 42
+; ARM1156T2F-S-NOT: .eabi_attribute 68
+
+; V7M: .eabi_attribute 6, 10
+; V7M: .eabi_attribute 7, 77
+; V7M: .eabi_attribute 8, 0
+; V7M: .eabi_attribute 9, 2
+; V7M: .eabi_attribute 24, 1
+; V7M: .eabi_attribute 25, 1
+; V7M-NOT: .eabi_attribute 27
+; V7M-NOT: .eabi_attribute 28
+; V7M-NOT: .eabi_attribute 36
+; V7M-NOT: .eabi_attribute 42
+; V7M: .eabi_attribute 44, 0
+; V7M-NOT: .eabi_attribute 68
+
; V7: .syntax unified
; V7: .eabi_attribute 6, 10
; V7: .eabi_attribute 20, 1
@@ -12,6 +81,11 @@
; V7: .eabi_attribute 23, 3
; V7: .eabi_attribute 24, 1
; V7: .eabi_attribute 25, 1
+; V7-NOT: .eabi_attribute 27
+; V7-NOT: .eabi_attribute 28
+; V7-NOT: .eabi_attribute 36
+; V7-NOT: .eabi_attribute 42
+; V7-NOT: .eabi_attribute 68
; V8: .syntax unified
; V8: .eabi_attribute 6, 14
@@ -19,14 +93,193 @@
; Vt8: .syntax unified
; Vt8: .eabi_attribute 6, 14
-; V8-V8FP: .syntax unified
-; V8-V8FP: .eabi_attribute 6, 14
-; V8-V8FP: .eabi_attribute 10, 7
+; V8-FPARMv8: .syntax unified
+; V8-FPARMv8: .eabi_attribute 6, 14
+; V8-FPARMv8: .fpu fp-armv8
; V8-NEON: .syntax unified
; V8-NEON: .eabi_attribute 6, 14
+; V8-NEON: .fpu neon
; V8-NEON: .eabi_attribute 12, 3
+; V8-FPARMv8-NEON: .syntax unified
+; V8-FPARMv8-NEON: .eabi_attribute 6, 14
+; V8-FPARMv8-NEON: .fpu neon-fp-armv8
+; V8-FPARMv8-NEON: .eabi_attribute 12, 3
+
+; V8-FPARMv8-NEON-CRYPTO: .syntax unified
+; V8-FPARMv8-NEON-CRYPTO: .eabi_attribute 6, 14
+; V8-FPARMv8-NEON-CRYPTO: .fpu crypto-neon-fp-armv8
+; V8-FPARMv8-NEON-CRYPTO: .eabi_attribute 12, 3
+
+; CORTEX-A9-SOFT: .cpu cortex-a9
+; CORTEX-A9-SOFT: .eabi_attribute 6, 10
+; CORTEX-A9-SOFT: .eabi_attribute 7, 65
+; CORTEX-A9-SOFT: .eabi_attribute 8, 1
+; CORTEX-A9-SOFT: .eabi_attribute 9, 2
+; CORTEX-A9-SOFT: .fpu neon
+; CORTEX-A9-SOFT: .eabi_attribute 20, 1
+; CORTEX-A9-SOFT: .eabi_attribute 21, 1
+; CORTEX-A9-SOFT: .eabi_attribute 23, 3
+; CORTEX-A9-SOFT: .eabi_attribute 24, 1
+; CORTEX-A9-SOFT: .eabi_attribute 25, 1
+; CORTEX-A9-SOFT-NOT: .eabi_attribute 27
+; CORTEX-A9-SOFT-NOT: .eabi_attribute 28
+; CORTEX-A9-SOFT: .eabi_attribute 36, 1
+; CORTEX-A9-SOFT-NOT: .eabi_attribute 42
+; CORTEX-A9-SOFT: .eabi_attribute 68, 1
+
+; CORTEX-A9-HARD: .cpu cortex-a9
+; CORTEX-A9-HARD: .eabi_attribute 6, 10
+; CORTEX-A9-HARD: .eabi_attribute 7, 65
+; CORTEX-A9-HARD: .eabi_attribute 8, 1
+; CORTEX-A9-HARD: .eabi_attribute 9, 2
+; CORTEX-A9-HARD: .fpu neon
+; CORTEX-A9-HARD: .eabi_attribute 20, 1
+; CORTEX-A9-HARD: .eabi_attribute 21, 1
+; CORTEX-A9-HARD: .eabi_attribute 23, 3
+; CORTEX-A9-HARD: .eabi_attribute 24, 1
+; CORTEX-A9-HARD: .eabi_attribute 25, 1
+; CORTEX-A9-HARD-NOT: .eabi_attribute 27
+; CORTEX-A9-HARD: .eabi_attribute 28, 1
+; CORTEX-A9-HARD: .eabi_attribute 36, 1
+; CORTEX-A9-HARD-NOT: .eabi_attribute 42
+; CORTEX-A9-HARD: .eabi_attribute 68, 1
+
+; CORTEX-A9-MP: .cpu cortex-a9-mp
+; CORTEX-A9-MP: .eabi_attribute 6, 10
+; CORTEX-A9-MP: .eabi_attribute 7, 65
+; CORTEX-A9-MP: .eabi_attribute 8, 1
+; CORTEX-A9-MP: .eabi_attribute 9, 2
+; CORTEX-A9-MP: .fpu neon
+; CORTEX-A9-MP: .eabi_attribute 20, 1
+; CORTEX-A9-MP: .eabi_attribute 21, 1
+; CORTEX-A9-MP: .eabi_attribute 23, 3
+; CORTEX-A9-MP: .eabi_attribute 24, 1
+; CORTEX-A9-MP: .eabi_attribute 25, 1
+; CORTEX-A9-MP-NOT: .eabi_attribute 27
+; CORTEX-A9-MP-NOT: .eabi_attribute 28
+; CORTEX-A9-MP: .eabi_attribute 36, 1
+; CORTEX-A9-MP: .eabi_attribute 42, 1
+; CORTEX-A9-MP: .eabi_attribute 68, 1
+
+; CORTEX-A15: .cpu cortex-a15
+; CORTEX-A15: .eabi_attribute 6, 10
+; CORTEX-A15: .eabi_attribute 7, 65
+; CORTEX-A15: .eabi_attribute 8, 1
+; CORTEX-A15: .eabi_attribute 9, 2
+; CORTEX-A15: .fpu neon-vfpv4
+; CORTEX-A15: .eabi_attribute 20, 1
+; CORTEX-A15: .eabi_attribute 21, 1
+; CORTEX-A15: .eabi_attribute 23, 3
+; CORTEX-A15: .eabi_attribute 24, 1
+; CORTEX-A15: .eabi_attribute 25, 1
+; CORTEX-A15-NOT: .eabi_attribute 27
+; CORTEX-A15-NOT: .eabi_attribute 28
+; CORTEX-A15: .eabi_attribute 36, 1
+; CORTEX-A15: .eabi_attribute 42, 1
+; CORTEX-A15: .eabi_attribute 44, 2
+; CORTEX-A15: .eabi_attribute 68, 3
+
+; CORTEX-M0: .cpu cortex-m0
+; CORTEX-M0: .eabi_attribute 6, 12
+; CORTEX-M0: .eabi_attribute 7, 77
+; CORTEX-M0: .eabi_attribute 8, 0
+; CORTEX-M0: .eabi_attribute 9, 1
+; CORTEX-M0: .eabi_attribute 24, 1
+; CORTEX-M0: .eabi_attribute 25, 1
+; CORTEX-M0-NOT: .eabi_attribute 27
+; CORTEX-M0-NOT: .eabi_attribute 28
+; CORTEX-M0-NOT: .eabi_attribute 36
+; CORTEX-M0-NOT: .eabi_attribute 42
+; CORTEX-M0-NOT: .eabi_attribute 68
+
+; CORTEX-M4-SOFT: .cpu cortex-m4
+; CORTEX-M4-SOFT: .eabi_attribute 6, 13
+; CORTEX-M4-SOFT: .eabi_attribute 7, 77
+; CORTEX-M4-SOFT: .eabi_attribute 8, 0
+; CORTEX-M4-SOFT: .eabi_attribute 9, 2
+; CORTEX-M4-SOFT: .fpu vfpv4-d16
+; CORTEX-M4-SOFT: .eabi_attribute 20, 1
+; CORTEX-M4-SOFT: .eabi_attribute 21, 1
+; CORTEX-M4-SOFT: .eabi_attribute 23, 3
+; CORTEX-M4-SOFT: .eabi_attribute 24, 1
+; CORTEX-M4-SOFT: .eabi_attribute 25, 1
+; CORTEX-M4-SOFT: .eabi_attribute 27, 1
+; CORTEX-M4-SOFT-NOT: .eabi_attribute 28
+; CORTEX-M4-SOFT: .eabi_attribute 36, 1
+; CORTEX-M4-SOFT-NOT: .eabi_attribute 42
+; CORTEX-M4-SOFT: .eabi_attribute 44, 0
+; CORTEX-M4-SOFT-NOT: .eabi_attribute 68
+
+; CORTEX-M4-HARD: .cpu cortex-m4
+; CORTEX-M4-HARD: .eabi_attribute 6, 13
+; CORTEX-M4-HARD: .eabi_attribute 7, 77
+; CORTEX-M4-HARD: .eabi_attribute 8, 0
+; CORTEX-M4-HARD: .eabi_attribute 9, 2
+; CORTEX-M4-HARD: .fpu vfpv4-d16
+; CORTEX-M4-HARD: .eabi_attribute 20, 1
+; CORTEX-M4-HARD: .eabi_attribute 21, 1
+; CORTEX-M4-HARD: .eabi_attribute 23, 3
+; CORTEX-M4-HARD: .eabi_attribute 24, 1
+; CORTEX-M4-HARD: .eabi_attribute 25, 1
+; CORTEX-M4-HARD: .eabi_attribute 27, 1
+; CORTEX-M4-HARD: .eabi_attribute 28, 1
+; CORTEX-M4-HARD: .eabi_attribute 36, 1
+; CORTEX-M4-HARD-NOT: .eabi_attribute 42
+; CORTEX-M4-HARD: .eabi_attribute 44, 0
+; CORTEX-M4-HARD-NOT: .eabi_attribute 68
+
+; CORTEX-R5: .cpu cortex-r5
+; CORTEX-R5: .eabi_attribute 6, 10
+; CORTEX-R5: .eabi_attribute 7, 82
+; CORTEX-R5: .eabi_attribute 8, 1
+; CORTEX-R5: .eabi_attribute 9, 2
+; CORTEX-R5: .fpu vfpv3-d16
+; CORTEX-R5: .eabi_attribute 20, 1
+; CORTEX-R5: .eabi_attribute 21, 1
+; CORTEX-R5: .eabi_attribute 23, 3
+; CORTEX-R5: .eabi_attribute 24, 1
+; CORTEX-R5: .eabi_attribute 25, 1
+; CORTEX-R5: .eabi_attribute 27, 1
+; CORTEX-R5-NOT: .eabi_attribute 28
+; CORTEX-R5-NOT: .eabi_attribute 36
+; CORTEX-R5-NOT: .eabi_attribute 42
+; CORTEX-R5: .eabi_attribute 44, 2
+; CORTEX-R5-NOT: .eabi_attribute 68
+
+; CORTEX-A53: .cpu cortex-a53
+; CORTEX-A53: .eabi_attribute 6, 14
+; CORTEX-A53: .eabi_attribute 7, 65
+; CORTEX-A53: .eabi_attribute 8, 1
+; CORTEX-A53: .eabi_attribute 9, 2
+; CORTEX-A53: .fpu crypto-neon-fp-armv8
+; CORTEX-A53: .eabi_attribute 12, 3
+; CORTEX-A53: .eabi_attribute 24, 1
+; CORTEX-A53: .eabi_attribute 25, 1
+; CORTEX-A53-NOT: .eabi_attribute 27
+; CORTEX-A53-NOT: .eabi_attribute 28
+; CORTEX-A53: .eabi_attribute 36, 1
+; CORTEX-A53: .eabi_attribute 42, 1
+; CORTEX-A53: .eabi_attribute 44, 2
+; CORTEX-A53: .eabi_attribute 68, 3
+
+; CORTEX-A57: .cpu cortex-a57
+; CORTEX-A57: .eabi_attribute 6, 14
+; CORTEX-A57: .eabi_attribute 7, 65
+; CORTEX-A57: .eabi_attribute 8, 1
+; CORTEX-A57: .eabi_attribute 9, 2
+; CORTEX-A57: .fpu crypto-neon-fp-armv8
+; CORTEX-A57: .eabi_attribute 12, 3
+; CORTEX-A57: .eabi_attribute 24, 1
+; CORTEX-A57: .eabi_attribute 25, 1
+; CORTEX-A57-NOT: .eabi_attribute 27
+; CORTEX-A57-NOT: .eabi_attribute 28
+; CORTEX-A57: .eabi_attribute 36, 1
+; CORTEX-A57: .eabi_attribute 42, 1
+; CORTEX-A57: .eabi_attribute 44, 2
+; CORTEX-A57: .eabi_attribute 68, 3
+
define i32 @f(i64 %z) {
ret i32 0
}
diff --git a/test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll b/test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll
deleted file mode 100644
index d19adcc..0000000
--- a/test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; RUN: llc %s -mtriple=arm-linux-gnueabi -filetype=obj -o - | \
-; RUN: llvm-readobj -s -sd | FileCheck -check-prefix=BASIC %s
-; RUN: llc %s -mtriple=armv7-linux-gnueabi -march=arm -mcpu=cortex-a8 \
-; RUN: -mattr=-neon,-vfp3,+vfp2 \
-; RUN: -arm-reserve-r9 -filetype=obj -o - | \
-; RUN: llvm-readobj -s -sd | FileCheck -check-prefix=CORTEXA8 %s
-
-
-; This tests that the extpected ARM attributes are emitted.
-;
-; BASIC: Section {
-; BASIC: Name: .ARM.attributes
-; BASIC-NEXT: Type: SHT_ARM_ATTRIBUTES
-; BASIC-NEXT: Flags [ (0x0)
-; BASIC-NEXT: ]
-; BASIC-NEXT: Address: 0x0
-; BASIC-NEXT: Offset: 0x3C
-; BASIC-NEXT: Size: 28
-; BASIC-NEXT: Link: 0
-; BASIC-NEXT: Info: 0
-; BASIC-NEXT: AddressAlignment: 1
-; BASIC-NEXT: EntrySize: 0
-; BASIC-NEXT: SectionData (
-; BASIC-NEXT: 0000: 411B0000 00616561 62690001 11000000
-; BASIC-NEXT: 0010: 06011401 15011703 18011901
-; BASIC-NEXT: )
-
-; CORTEXA8: Name: .ARM.attributes
-; CORTEXA8-NEXT: Type: SHT_ARM_ATTRIBUTES
-; CORTEXA8-NEXT: Flags [ (0x0)
-; CORTEXA8-NEXT: ]
-; CORTEXA8-NEXT: Address: 0x0
-; CORTEXA8-NEXT: Offset: 0x3C
-; CORTEXA8-NEXT: Size: 47
-; CORTEXA8-NEXT: Link: 0
-; CORTEXA8-NEXT: Info: 0
-; CORTEXA8-NEXT: AddressAlignment: 1
-; CORTEXA8-NEXT: EntrySize: 0
-; CORTEXA8-NEXT: SectionData (
-; CORTEXA8-NEXT: 0000: 412E0000 00616561 62690001 24000000
-; CORTEXA8-NEXT: 0010: 05434F52 5445582D 41380006 0A074108
-; CORTEXA8-NEXT: 0020: 0109020A 02140115 01170318 011901
-; CORTEXA8-NEXT: )
-
-define i32 @f(i64 %z) {
- ret i32 0
-}
diff --git a/test/CodeGen/ARM/2010-11-30-reloc-movt.ll b/test/CodeGen/ARM/2010-11-30-reloc-movt.ll
deleted file mode 100644
index 6bea7b8..0000000
--- a/test/CodeGen/ARM/2010-11-30-reloc-movt.ll
+++ /dev/null
@@ -1,27 +0,0 @@
-; RUN: llc %s -mtriple=armv7-linux-gnueabi -filetype=obj -o - | \
-; RUN: llvm-readobj -s -sr -sd | FileCheck -check-prefix=OBJ %s
-
-target triple = "armv7-none-linux-gnueabi"
-
-@a = external global i8
-
-define arm_aapcs_vfpcc i32 @barf() nounwind {
-entry:
- %0 = tail call arm_aapcs_vfpcc i32 @foo(i8* @a) nounwind
- ret i32 %0
-; OBJ: Section {
-; OBJ: Name: .text
-; OBJ: SectionData (
-; OBJ-NEXT: 0000: 00482DE9 000000E3 000040E3 FEFFFFEB
-; OBJ-NEXT: 0010: 0088BDE8
-; OBJ-NEXT: )
-; OBJ: Relocations [
-; OBJ-NEXT: 0x4 R_ARM_MOVW_ABS_NC a
-; OBJ-NEXT: 0x8 R_ARM_MOVT_ABS
-; OBJ-NEXT: 0xC R_ARM_CALL foo
-; OBJ-NEXT: ]
-
-}
-
-declare arm_aapcs_vfpcc i32 @foo(i8*)
-
diff --git a/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll b/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll
index 626d121..f57411b 100644
--- a/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll
+++ b/test/CodeGen/ARM/2011-01-19-MergedGlobalDbg.ll
@@ -76,11 +76,12 @@ entry:
}
!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!49}
!0 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"get1", metadata !"get1", metadata !"get1", i32 4, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i8 (i8)* @get1, null, null, metadata !42, i32 4} ; [ DW_TAG_subprogram ]
!1 = metadata !{i32 786473, metadata !47} ; [ DW_TAG_file_type ]
!2 = metadata !{i32 786449, metadata !47, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 2369.8)", i1 true, metadata !"", i32 0, metadata !48, metadata !48, metadata !40, metadata !41, metadata !41, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !47, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !47, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!4 = metadata !{metadata !5, metadata !5}
!5 = metadata !{i32 786468, metadata !47, metadata !1, metadata !"_Bool", i32 0, i64 8, i64 8, i64 0, i32 0, i32 2} ; [ DW_TAG_base_type ]
!6 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"get2", metadata !"get2", metadata !"get2", i32 7, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i8 (i8)* @get2, null, null, metadata !43, i32 7} ; [ DW_TAG_subprogram ]
@@ -126,3 +127,4 @@ entry:
!46 = metadata !{metadata !27, metadata !28}
!47 = metadata !{metadata !"foo.c", metadata !"/tmp/"}
!48 = metadata !{i32 0}
+!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll b/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll
index f689d49..bc72e12 100644
--- a/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll
+++ b/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll
@@ -21,8 +21,8 @@ for.body: ; preds = %_Z14printIsNotZeroi
%x = getelementptr %struct.Outer* @oStruct, i32 0, i32 1, i32 %i.022, i32 0
%y = getelementptr %struct.Outer* @oStruct, i32 0, i32 1, i32 %i.022, i32 1
%inc = add i32 %i.022, 1
- %tmp8 = load i32* %x, align 4, !tbaa !0
- %tmp11 = load i32* %y, align 4, !tbaa !0
+ %tmp8 = load i32* %x, align 4
+ %tmp11 = load i32* %y, align 4
%mul = mul nsw i32 %tmp11, %tmp8
%tobool.i14 = icmp eq i32 %mul, 0
br i1 %tobool.i14, label %_Z14printIsNotZeroi.exit17, label %if.then.i16
@@ -35,15 +35,10 @@ _Z14printIsNotZeroi.exit17: ; preds = %_Z14printIsNotZeroi
_Z14printIsNotZeroi.exit17.for.body_crit_edge: ; preds = %_Z14printIsNotZeroi.exit17
%b.phi.trans.insert = getelementptr %struct.Outer* @oStruct, i32 0, i32 1, i32 %inc, i32 3
- %tmp3.pre = load i8* %b.phi.trans.insert, align 1, !tbaa !3
+ %tmp3.pre = load i8* %b.phi.trans.insert, align 1
%phitmp27 = icmp eq i8 undef, 0
br label %for.body
for.end: ; preds = %_Z14printIsNotZeroi.exit17
ret void
}
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !"bool", metadata !1}
diff --git a/test/CodeGen/ARM/2011-04-11-MachineLICMBug.ll b/test/CodeGen/ARM/2011-04-11-MachineLICMBug.ll
index 348ec9f..e30c9c6 100644
--- a/test/CodeGen/ARM/2011-04-11-MachineLICMBug.ll
+++ b/test/CodeGen/ARM/2011-04-11-MachineLICMBug.ll
@@ -15,15 +15,14 @@ for.cond:
for.body:
; CHECK: %for.
-; CHECK: movs r{{[0-9]+}}, #{{[01]}}
+; CHECK: mov{{.*}} r{{[0-9]+}}, #{{[01]}}
+; CHECK: mov{{.*}} r{{[0-9]+}}, #{{[01]}}
+; CHECK-NOT: mov r{{[0-9]+}}, #{{[01]}}
%arrayidx = getelementptr i32* %A, i32 %0
%tmp4 = load i32* %arrayidx, align 4
%cmp6 = icmp eq i32 %tmp4, %value
br i1 %cmp6, label %return, label %for.inc
-; CHECK: %for.
-; CHECK: movs r{{[0-9]+}}, #{{[01]}}
-
for.inc:
%inc = add i32 %0, 1
br label %for.cond
diff --git a/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll b/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll
index 33826f8..bb78707 100644
--- a/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll
+++ b/test/CodeGen/ARM/2011-08-02-MergedGlobalDbg.ll
@@ -73,17 +73,18 @@ define i32 @get5(i32 %a) nounwind optsize ssp {
declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!49}
!0 = metadata !{i32 786449, metadata !47, i32 12, metadata !"clang", i1 true, metadata !"", i32 0, metadata !48, metadata !48, metadata !40, metadata !41, metadata !41, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get1", metadata !"get1", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32)* @get1, null, null, metadata !42, i32 5} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get1", metadata !"get1", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @get1, null, null, metadata !42, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [get1]
!2 = metadata !{i32 786473, metadata !47} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!4 = metadata !{metadata !5}
!5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get2", metadata !"get2", metadata !"", i32 8, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32)* @get2, null, null, metadata !43, i32 8} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get3", metadata !"get3", metadata !"", i32 11, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32)* @get3, null, null, metadata !44, i32 11} ; [ DW_TAG_subprogram ]
-!8 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get4", metadata !"get4", metadata !"", i32 14, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32)* @get4, null, null, metadata !45, i32 14} ; [ DW_TAG_subprogram ]
-!9 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get5", metadata !"get5", metadata !"", i32 17, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32)* @get5, null, null, metadata !46, i32 17} ; [ DW_TAG_subprogram ]
+!6 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get2", metadata !"get2", metadata !"", i32 8, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @get2, null, null, metadata !43, i32 8} ; [ DW_TAG_subprogram ] [line 8] [def] [get2]
+!7 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get3", metadata !"get3", metadata !"", i32 11, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @get3, null, null, metadata !44, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [get3]
+!8 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get4", metadata !"get4", metadata !"", i32 14, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @get4, null, null, metadata !45, i32 14} ; [ DW_TAG_subprogram ] [line 14] [def] [get4]
+!9 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"get5", metadata !"get5", metadata !"", i32 17, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32)* @get5, null, null, metadata !46, i32 17} ; [ DW_TAG_subprogram ] [line 17] [def] [get5]
!10 = metadata !{i32 786689, metadata !1, metadata !"a", metadata !2, i32 16777221, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
!11 = metadata !{i32 786688, metadata !12, metadata !"b", metadata !2, i32 5, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
!12 = metadata !{i32 786443, metadata !47, metadata !1, i32 5, i32 19, i32 0} ; [ DW_TAG_lexical_block ]
@@ -123,3 +124,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!46 = metadata !{metadata !27, metadata !28}
!47 = metadata !{metadata !"ss3.c", metadata !"/private/tmp"}
!48 = metadata !{i32 0}
+!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/2011-08-25-ldmia_ret.ll b/test/CodeGen/ARM/2011-08-25-ldmia_ret.ll
index 91de08a..9163166 100644
--- a/test/CodeGen/ARM/2011-08-25-ldmia_ret.ll
+++ b/test/CodeGen/ARM/2011-08-25-ldmia_ret.ll
@@ -42,7 +42,7 @@ if.then: ; preds = %land.lhs.true
; If-convert the return
; CHECK: it ne
; Fold the CSR+return into a pop
-; CHECK: pop {r4, r5, r6, r7, pc}
+; CHECK: pop {r4, r5, r7, pc}
sw.bb18:
%call20 = tail call i32 @bar(i32 %in2) nounwind
switch i32 %call20, label %sw.default56 [
diff --git a/test/CodeGen/ARM/2011-10-26-memset-inline.ll b/test/CodeGen/ARM/2011-10-26-memset-inline.ll
index ff049c8..03614ed 100644
--- a/test/CodeGen/ARM/2011-10-26-memset-inline.ll
+++ b/test/CodeGen/ARM/2011-10-26-memset-inline.ll
@@ -10,8 +10,8 @@ target triple = "thumbv7-apple-ios5.0.0"
; CHECK-GENERIT-NEXT: strb
; CHECK-GENERIT-NEXT: strb
; CHECK-GENERIT-NEXT: strb
-; CHECK-UNALIGNED: strb
-; CHECK-UNALIGNED-NEXT: str
+; CHECK-UNALIGNED: strb
+; CHECK-UNALIGNED: str
define void @foo(i8* nocapture %c) nounwind optsize {
entry:
call void @llvm.memset.p0i8.i64(i8* %c, i8 -1, i64 5, i32 1, i1 false)
diff --git a/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll b/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll
index f563eee..850c511 100644
--- a/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll
+++ b/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll
@@ -1,8 +1,8 @@
; RUN: llc -march=arm -mcpu=cortex-a8 < %s | FileCheck %s
; Trigger multiple NEON stores.
-; CHECK: vst1.64
-; CHECK-NEXT: vst1.64
+; CHECK: vst1.64
+; CHECK: vst1.64
define void @f_0_40(i8* nocapture %c) nounwind optsize {
entry:
call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 40, i32 16, i1 false)
diff --git a/test/CodeGen/ARM/2012-08-30-select.ll b/test/CodeGen/ARM/2012-08-30-select.ll
index 2fd8df4..e78bbde 100644
--- a/test/CodeGen/ARM/2012-08-30-select.ll
+++ b/test/CodeGen/ARM/2012-08-30-select.ll
@@ -5,14 +5,11 @@
;CHECK: it ne
;CHECK-NEXT: vmovne.i32
;CHECK: bx
-define <16 x i8> @select_s_v_v(i32 %avail, i8* %bar) {
+define <16 x i8> @select_s_v_v(<16 x i8> %vec, i32 %avail) {
entry:
- %vld1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %bar, i32 1)
%and = and i32 %avail, 1
%tobool = icmp eq i32 %and, 0
- %vld1. = select i1 %tobool, <16 x i8> %vld1, <16 x i8> zeroinitializer
- ret <16 x i8> %vld1.
+ %ret = select i1 %tobool, <16 x i8> %vec, <16 x i8> zeroinitializer
+ ret <16 x i8> %ret
}
-declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* , i32 )
-
diff --git a/test/CodeGen/ARM/2013-02-27-expand-vfma.ll b/test/CodeGen/ARM/2013-02-27-expand-vfma.ll
index 135b144..f812118 100644
--- a/test/CodeGen/ARM/2013-02-27-expand-vfma.ll
+++ b/test/CodeGen/ARM/2013-02-27-expand-vfma.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=armv7-apple-darwin | FileCheck %s
-; RUN: llc < %s -mtriple=armv7s-apple-darwin | FileCheck %s -check-prefix=VFP4
+; RUN: llc < %s -mtriple=armv7s-apple-darwin | FileCheck %s -check-prefix=CHECK-VFP4
define <4 x float> @muladd(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind {
; CHECK-LABEL: muladd:
diff --git a/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll b/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll
index 2eeebac..c4f5f54 100644
--- a/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll
+++ b/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll
@@ -1,4 +1,6 @@
; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv8 | FileCheck -check-prefix=CHECK-V8 %s
+; RUN: llc < %s -mtriple=thumbv7 -arm-restrict-it | FileCheck -check-prefix=CHECK-V8 %s
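+; Under the v8 / restricted-IT rules the long predicated sequences are no
+; longer formed, so the CHECK-V8 lines below expect conditional branches
+; instead of IT blocks.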
; rdar://13782395
define i32 @t1(i32 %a, i32 %b, i8** %retaddr) {
@@ -81,7 +83,7 @@ KBBlockZero.exit: ; preds = %bb2.i
; <rdar://problem/14379453>
; Hard-coded registers comes from the ABI.
-; CHECK: wrapDistance:
+; CHECK-LABEL: wrapDistance:
; CHECK: cmp r1, #59
; CHECK-NEXT: itt le
; CHECK-NEXT: suble r0, r2, #1
@@ -100,6 +102,27 @@ KBBlockZero.exit: ; preds = %bb2.i
; CHECK: [[LABEL]]:
; CHECK-NEXT: subs r0, r1, r0
; CHECK-NEXT: bx lr
+
+; CHECK-V8-LABEL: wrapDistance:
+; CHECK-V8: cmp r1, #59
+; CHECK-V8-NEXT: bgt
+; CHECK-V8-NEXT: %if.then
+; CHECK-V8-NEXT: subs r0, r2, #1
+; CHECK-V8-NEXT: bx lr
+; CHECK-V8-NEXT: %if.else
+; CHECK-V8-NEXT: subs [[REG:r[0-9]+]], #120
+; CHECK-V8-NEXT: cmp [[REG]], r1
+; CHECK-V8-NEXT: bge
+; CHECK-V8-NEXT: %if.else
+; CHECK-V8-NEXT: cmp r0, #119
+; CHECK-V8-NEXT: bgt
+; CHECK-V8-NEXT: %if.then4
+; CHECK-V8-NEXT: adds r0, r1, #1
+; CHECK-V8-NEXT: bx lr
+; CHECK-V8-NEXT: %if.end5
+; CHECK-V8-NEXT: subs r0, r1, r0
+; CHECK-V8-NEXT: bx lr
+
define i32 @wrapDistance(i32 %tx, i32 %sx, i32 %w) {
entry:
%cmp = icmp slt i32 %sx, 60
diff --git a/test/CodeGen/ARM/2013-05-07-ByteLoadSameAddress.ll b/test/CodeGen/ARM/2013-05-07-ByteLoadSameAddress.ll
new file mode 100644
index 0000000..defb946
--- /dev/null
+++ b/test/CodeGen/ARM/2013-05-07-ByteLoadSameAddress.ll
@@ -0,0 +1,64 @@
+; RUN: llc < %s -march=thumb -mattr=+v7,+thumb2 | FileCheck %s
+
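+; Check that neighbouring byte loads from the same base address are emitted as
+; ldrb with immediate offsets (#-1 / #1) rather than recomputing the address.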
+define i8 @f1(i8* %call1, i8* %call3, i32 %h, i32 %w, i32 %Width) {
+; CHECK: f1:
+entry:
+ %mul17 = mul nsw i32 %Width, %h
+ %add = add nsw i32 %mul17, %w
+ %sub19 = sub i32 %add, %Width
+ %sub20 = add i32 %sub19, -1
+ %arrayidx21 = getelementptr inbounds i8* %call1, i32 %sub20
+ %0 = load i8* %arrayidx21, align 1
+ %conv22 = zext i8 %0 to i32
+ %arrayidx25 = getelementptr inbounds i8* %call1, i32 %sub19
+ %1 = load i8* %arrayidx25, align 1
+ %conv26 = zext i8 %1 to i32
+ %mul23189 = add i32 %conv26, %conv22
+ %add30 = add i32 %sub19, 1
+ %arrayidx31 = getelementptr inbounds i8* %call1, i32 %add30
+ %2 = load i8* %arrayidx31, align 1
+ %conv32 = zext i8 %2 to i32
+; CHECK: ldrb r{{[0-9]*}}, [r{{[0-9]*}}, #-1]
+; CHECK-NEXT: ldrb r{{[0-9]*}}, [r{{[0-9]*}}, #1]
+ %add28190 = add i32 %mul23189, %conv32
+ %sub35 = add i32 %add, -1
+ %arrayidx36 = getelementptr inbounds i8* %call1, i32 %sub35
+ %3 = load i8* %arrayidx36, align 1
+ %conv37 = zext i8 %3 to i32
+ %add34191 = add i32 %add28190, %conv37
+ %arrayidx40 = getelementptr inbounds i8* %call1, i32 %add
+ %4 = load i8* %arrayidx40, align 1
+ %conv41 = zext i8 %4 to i32
+ %mul42 = mul nsw i32 %conv41, 255
+ %add44 = add i32 %add, 1
+ %arrayidx45 = getelementptr inbounds i8* %call1, i32 %add44
+ %5 = load i8* %arrayidx45, align 1
+ %conv46 = zext i8 %5 to i32
+; CHECK: ldrb r{{[0-9]*}}, [r{{[0-9]*}}, #-1]
+; CHECK-NEXT: ldrb r{{[0-9]*}}, [r{{[0-9]*}}, #1]
+ %add49 = add i32 %add, %Width
+ %sub50 = add i32 %add49, -1
+ %arrayidx51 = getelementptr inbounds i8* %call1, i32 %sub50
+ %6 = load i8* %arrayidx51, align 1
+ %conv52 = zext i8 %6 to i32
+ %arrayidx56 = getelementptr inbounds i8* %call1, i32 %add49
+ %7 = load i8* %arrayidx56, align 1
+ %conv57 = zext i8 %7 to i32
+ %add61 = add i32 %add49, 1
+ %arrayidx62 = getelementptr inbounds i8* %call1, i32 %add61
+ %8 = load i8* %arrayidx62, align 1
+ %conv63 = zext i8 %8 to i32
+; CHECK: ldrb r{{[0-9]*}}, [r{{[0-9]*}}, #-1]
+; CHECK-NEXT: ldrb{{[.w]*}} r{{[0-9]*}}, [r{{[0-9]*}}, #1]
+ %tmp = add i32 %add34191, %conv46
+ %tmp193 = add i32 %tmp, %conv52
+ %tmp194 = add i32 %tmp193, %conv57
+ %tmp195 = add i32 %tmp194, %conv63
+ %tmp196 = mul i32 %tmp195, -28
+ %add65 = add i32 %tmp196, %mul42
+ %9 = lshr i32 %add65, 8
+ %conv68 = trunc i32 %9 to i8
+ %arrayidx69 = getelementptr inbounds i8* %call3, i32 %add
+ store i8 %conv68, i8* %arrayidx69, align 1
+ ret i8 %conv68
+}
diff --git a/test/CodeGen/ARM/2013-10-11-select-stalls.ll b/test/CodeGen/ARM/2013-10-11-select-stalls.ll
new file mode 100644
index 0000000..33c0587
--- /dev/null
+++ b/test/CodeGen/ARM/2013-10-11-select-stalls.ll
@@ -0,0 +1,16 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -stats 2>&1 | not grep "Number of pipeline stalls"
+; Evaluate the two vld1.8 instructions in separate MBBs,
+; instead of stalling on one and conditionally overwriting its result.
+
+define <16 x i8> @multiselect(i32 %avail, i8* %foo, i8* %bar) {
+entry:
+ %vld1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %foo, i32 1)
+ %vld2 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %bar, i32 1)
+ %and = and i32 %avail, 1
+ %tobool = icmp eq i32 %and, 0
+ %retv = select i1 %tobool, <16 x i8> %vld1, <16 x i8> %vld2
+ ret <16 x i8> %retv
+}
+
+declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* , i32 )
diff --git a/test/CodeGen/ARM/2013-11-08-inline-asm-neon-array.ll b/test/CodeGen/ARM/2013-11-08-inline-asm-neon-array.ll
new file mode 100644
index 0000000..5a86477
--- /dev/null
+++ b/test/CodeGen/ARM/2013-11-08-inline-asm-neon-array.ll
@@ -0,0 +1,16 @@
+;RUN: not llc -mtriple=arm-linux-gnueabihf < %s 2>&1 | FileCheck %s
+
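+; A four-register NEON array operand cannot be allocated to the
+; single-register 'w' constraint, so llc is expected to fail with the
+; diagnostic checked below.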
+; ModuleID = 'bug.c'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
+target triple = "armv7--"
+
+%struct.uint8x8x4_t = type { [4 x <8 x i8>] }
+
+define void @foo() #0 {
+ %vsrc = alloca %struct.uint8x8x4_t, align 8
+ %ptr = alloca i8;
+ %1 = call i8* asm sideeffect "vld4.u8 ${0:h}, [$1], $2", "=*w,=r,r,1"(%struct.uint8x8x4_t* %vsrc, i32 0, i8* %ptr)
+ ret void
+}
+
+; CHECK: error: couldn't allocate output register for constraint 'w'
diff --git a/test/CodeGen/ARM/a15-SD-dep.ll b/test/CodeGen/ARM/a15-SD-dep.ll
index df921e0..019ff61 100644
--- a/test/CodeGen/ARM/a15-SD-dep.ll
+++ b/test/CodeGen/ARM/a15-SD-dep.ll
@@ -1,5 +1,5 @@
-; RUN: llc -O1 -mcpu=cortex-a15 -mtriple=armv7-linux-gnueabi -disable-a15-sd-optimization -verify-machineinstrs < %s | FileCheck -check-prefix=DISABLED %s
-; RUN: llc -O1 -mcpu=cortex-a15 -mtriple=armv7-linux-gnueabi -verify-machineinstrs < %s | FileCheck -check-prefix=ENABLED %s
+; RUN: llc -O1 -mcpu=cortex-a15 -mtriple=armv7-linux-gnueabi -disable-a15-sd-optimization -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-DISABLED %s
+; RUN: llc -O1 -mcpu=cortex-a15 -mtriple=armv7-linux-gnueabi -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-ENABLED %s
; CHECK-ENABLED-LABEL: t1:
; CHECK-DISABLED-LABEL: t1:
diff --git a/test/CodeGen/ARM/addrspacecast.ll b/test/CodeGen/ARM/addrspacecast.ll
new file mode 100644
index 0000000..2e98ba5
--- /dev/null
+++ b/test/CodeGen/ARM/addrspacecast.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -march=arm
+
+; Check that codegen for an addrspace cast succeeds without error.
+define <4 x i32 addrspace(1)*> @f (<4 x i32*> %x) {
+ %1 = addrspacecast <4 x i32*> %x to <4 x i32 addrspace(1)*>
+ ret <4 x i32 addrspace(1)*> %1
+}
diff --git a/test/CodeGen/ARM/aliases.ll b/test/CodeGen/ARM/aliases.ll
index d668334..f55ae10 100644
--- a/test/CodeGen/ARM/aliases.ll
+++ b/test/CodeGen/ARM/aliases.ll
@@ -1,15 +1,30 @@
-; RUN: llc < %s -mtriple=arm-linux-gnueabi -o %t
-; RUN: grep " = " %t | count 5
-; RUN: grep globl %t | count 4
-; RUN: grep weak %t | count 1
+; RUN: llc < %s -mtriple=arm-linux-gnueabi | FileCheck %s
-@bar = external global i32
+; CHECK: .globl test
+
+; CHECK: .globl foo1
+; CHECK: foo1 = bar
+
+; CHECK: .globl foo2
+; CHECK: foo2 = bar
+
+; CHECK: .weak bar_f
+; CHECK: bar_f = foo_f
+
+; CHECK: bar_i = bar
+
+; CHECK: .globl A
+; CHECK: A = bar
+
+@bar = global i32 42
@foo1 = alias i32* @bar
@foo2 = alias i32* @bar
%FunTy = type i32()
-declare i32 @foo_f()
+define i32 @foo_f() {
+ ret i32 0
+}
@bar_f = alias weak %FunTy* @foo_f
@bar_i = alias internal i32* @bar
diff --git a/test/CodeGen/ARM/arm-and-tst-peephole.ll b/test/CodeGen/ARM/arm-and-tst-peephole.ll
index 0762070..88d797e 100644
--- a/test/CodeGen/ARM/arm-and-tst-peephole.ll
+++ b/test/CodeGen/ARM/arm-and-tst-peephole.ll
@@ -1,6 +1,7 @@
; RUN: llc < %s -march=arm | FileCheck -check-prefix=ARM %s
; RUN: llc < %s -march=thumb | FileCheck -check-prefix=THUMB %s
; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck -check-prefix=T2 %s
+; RUN: llc < %s -mtriple=thumbv8 | FileCheck -check-prefix=V8 %s
; FIXME: The -march=thumb test doesn't change if -disable-peephole is specified.
@@ -39,6 +40,17 @@ tailrecurse: ; preds = %sw.bb, %entry
br i1 %tst, label %sw.bb, label %tailrecurse.switch
tailrecurse.switch: ; preds = %tailrecurse
+; V8-LABEL: %tailrecurse.switch
+; V8: cmp
+; V8-NEXT: beq
+; V8-NEXT: %tailrecurse.switch
+; V8: cmp
+; V8-NEXT: beq
+; V8-NEXT: %tailrecurse.switch
+; V8: cmp
+; V8-NEXT: beq
+; V8-NEXT: b
+; The trailing space in the last line checks that the branch is unconditional
switch i32 %and, label %sw.epilog [
i32 1, label %sw.bb
i32 3, label %sw.bb6
@@ -73,6 +85,7 @@ sw.epilog: ; preds = %tailrecurse.switch
; ARM: bar
; THUMB: bar
; T2: bar
+; V8-LABEL: bar:
define internal zeroext i8 @bar(%struct.S* %x, %struct.S* nocapture %y) nounwind readonly {
entry:
%0 = getelementptr inbounds %struct.S* %x, i32 0, i32 1, i32 0
@@ -81,22 +94,32 @@ entry:
; ARM: ands
; THUMB: ands
; T2: ands
+; V8: ands
+; V8-NEXT: beq
%3 = and i32 %2, 112
%4 = icmp eq i32 %3, 0
br i1 %4, label %return, label %bb
bb: ; preds = %entry
+; V8-NEXT: %bb
%5 = getelementptr inbounds %struct.S* %y, i32 0, i32 1, i32 0
%6 = load i8* %5, align 1
%7 = zext i8 %6 to i32
; ARM: andsne
; THUMB: ands
; T2: andsne
+; V8: ands
+; V8-NEXT: beq
%8 = and i32 %7, 112
%9 = icmp eq i32 %8, 0
br i1 %9, label %return, label %bb2
bb2: ; preds = %bb
+; V8-NEXT: %bb2
+; V8-NEXT: cmp
+; V8-NEXT: it ne
+; V8-NEXT: cmpne
+; V8-NEXT: bne
%10 = icmp eq i32 %3, 16
%11 = icmp eq i32 %8, 16
%or.cond = or i1 %10, %11
diff --git a/test/CodeGen/ARM/atomic-64bit.ll b/test/CodeGen/ARM/atomic-64bit.ll
index 8ec829c..0477d4f 100644
--- a/test/CodeGen/ARM/atomic-64bit.ll
+++ b/test/CodeGen/ARM/atomic-64bit.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=armv7-apple-ios | FileCheck %s
-; RUN: llc < %s -mtriple=thumbv7-none-linux-gnueabihf | FileCheck %s --check-prefix=CHECK-THUMB
+; RUN: llc < %s -mtriple=thumbv7-none-linux-gnueabihf -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-THUMB
define i64 @test1(i64* %ptr, i64 %val) {
; CHECK-LABEL: test1:
@@ -175,28 +175,14 @@ define i64 @test7(i64* %ptr, i64 %val1, i64 %val2) {
ret i64 %r
}
-; Compiles down to cmpxchg
-; FIXME: Should compile to a single ldrexd
+; Compiles down to a single ldrexd
define i64 @test8(i64* %ptr) {
; CHECK-LABEL: test8:
; CHECK: ldrexd [[REG1:(r[0-9]?[02468])]], [[REG2:(r[0-9]?[13579])]]
-; CHECK: cmp [[REG1]]
-; CHECK: cmpeq [[REG2]]
-; CHECK: bne
-; CHECK: strexd {{[a-z0-9]+}}, {{r[0-9]?[02468]}}, {{r[0-9]?[13579]}}
-; CHECK: cmp
-; CHECK: bne
; CHECK: dmb {{ish$}}
; CHECK-THUMB-LABEL: test8:
; CHECK-THUMB: ldrexd [[REG1:[a-z0-9]+]], [[REG2:[a-z0-9]+]]
-; CHECK-THUMB: cmp [[REG1]]
-; CHECK-THUMB: it eq
-; CHECK-THUMB: cmpeq [[REG2]]
-; CHECK-THUMB: bne
-; CHECK-THUMB: strexd {{[a-z0-9]+}}, {{[a-z0-9]+}}, {{[a-z0-9]+}}
-; CHECK-THUMB: cmp
-; CHECK-THUMB: bne
; CHECK-THUMB: dmb {{ish$}}
%r = load atomic i64* %ptr seq_cst, align 8
diff --git a/test/CodeGen/ARM/atomic-load-store.ll b/test/CodeGen/ARM/atomic-load-store.ll
index 476b3dd..53c7184 100644
--- a/test/CodeGen/ARM/atomic-load-store.ll
+++ b/test/CodeGen/ARM/atomic-load-store.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s -check-prefix=ARM
; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s -check-prefix=ARM
-; RUN: llc < %s -mtriple=thumbv7-apple-ios | FileCheck %s -check-prefix=THUMBTWO
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s -check-prefix=THUMBTWO
; RUN: llc < %s -mtriple=thumbv6-apple-ios | FileCheck %s -check-prefix=THUMBONE
; RUN: llc < %s -mtriple=armv4-apple-ios | FileCheck %s -check-prefix=ARMV4
diff --git a/test/CodeGen/ARM/atomic-op.ll b/test/CodeGen/ARM/atomic-op.ll
index 6e6b363..9a79c9f 100644
--- a/test/CodeGen/ARM/atomic-op.ll
+++ b/test/CodeGen/ARM/atomic-op.ll
@@ -1,5 +1,7 @@
; RUN: llc < %s -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv6-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-T1
+; RUN: llc < %s -mtriple=thumbv6-apple-ios -verify-machineinstrs -mcpu=cortex-m0 | FileCheck %s --check-prefix=CHECK-T1
define void @func(i32 %argc, i8** %argv) nounwind {
entry:
@@ -24,78 +26,93 @@ entry:
; CHECK: ldrex
; CHECK: add
; CHECK: strex
+ ; CHECK-T1: blx ___sync_fetch_and_add_4
%0 = atomicrmw add i32* %val1, i32 %tmp monotonic
store i32 %0, i32* %old
; CHECK: ldrex
; CHECK: sub
; CHECK: strex
+ ; CHECK-T1: blx ___sync_fetch_and_sub_4
%1 = atomicrmw sub i32* %val2, i32 30 monotonic
store i32 %1, i32* %old
; CHECK: ldrex
; CHECK: add
; CHECK: strex
+ ; CHECK-T1: blx ___sync_fetch_and_add_4
%2 = atomicrmw add i32* %val2, i32 1 monotonic
store i32 %2, i32* %old
; CHECK: ldrex
; CHECK: sub
; CHECK: strex
+ ; CHECK-T1: blx ___sync_fetch_and_sub_4
%3 = atomicrmw sub i32* %val2, i32 1 monotonic
store i32 %3, i32* %old
; CHECK: ldrex
; CHECK: and
; CHECK: strex
+ ; CHECK-T1: blx ___sync_fetch_and_and_4
%4 = atomicrmw and i32* %andt, i32 4080 monotonic
store i32 %4, i32* %old
; CHECK: ldrex
; CHECK: or
; CHECK: strex
+ ; CHECK-T1: blx ___sync_fetch_and_or_4
%5 = atomicrmw or i32* %ort, i32 4080 monotonic
store i32 %5, i32* %old
; CHECK: ldrex
; CHECK: eor
; CHECK: strex
+ ; CHECK-T1: blx ___sync_fetch_and_xor_4
%6 = atomicrmw xor i32* %xort, i32 4080 monotonic
store i32 %6, i32* %old
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
+ ; CHECK-T1: blx ___sync_fetch_and_min_4
%7 = atomicrmw min i32* %val2, i32 16 monotonic
store i32 %7, i32* %old
%neg = sub i32 0, 1
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
+ ; CHECK-T1: blx ___sync_fetch_and_min_4
%8 = atomicrmw min i32* %val2, i32 %neg monotonic
store i32 %8, i32* %old
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
+ ; CHECK-T1: blx ___sync_fetch_and_max_4
%9 = atomicrmw max i32* %val2, i32 1 monotonic
store i32 %9, i32* %old
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
+ ; CHECK-T1: blx ___sync_fetch_and_max_4
%10 = atomicrmw max i32* %val2, i32 0 monotonic
store i32 %10, i32* %old
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
+ ; CHECK-T1: blx ___sync_fetch_and_umin_4
%11 = atomicrmw umin i32* %val2, i32 16 monotonic
store i32 %11, i32* %old
%uneg = sub i32 0, 1
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
+ ; CHECK-T1: blx ___sync_fetch_and_umin_4
%12 = atomicrmw umin i32* %val2, i32 %uneg monotonic
store i32 %12, i32* %old
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
+ ; CHECK-T1: blx ___sync_fetch_and_umax_4
%13 = atomicrmw umax i32* %val2, i32 1 monotonic
store i32 %13, i32* %old
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
+ ; CHECK-T1: blx ___sync_fetch_and_umax_4
%14 = atomicrmw umax i32* %val2, i32 0 monotonic
store i32 %14, i32* %old
@@ -110,22 +127,26 @@ entry:
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
+ ; CHECK-T1: blx ___sync_fetch_and_umin_2
%0 = atomicrmw umin i16* %val, i16 16 monotonic
store i16 %0, i16* %old
%uneg = sub i16 0, 1
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
+ ; CHECK-T1: blx ___sync_fetch_and_umin_2
%1 = atomicrmw umin i16* %val, i16 %uneg monotonic
store i16 %1, i16* %old
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
+ ; CHECK-T1: blx ___sync_fetch_and_umax_2
%2 = atomicrmw umax i16* %val, i16 1 monotonic
store i16 %2, i16* %old
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
+ ; CHECK-T1: blx ___sync_fetch_and_umax_2
%3 = atomicrmw umax i16* %val, i16 0 monotonic
store i16 %3, i16* %old
ret void
@@ -139,22 +160,26 @@ entry:
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
+ ; CHECK-T1: blx ___sync_fetch_and_umin_1
%0 = atomicrmw umin i8* %val, i8 16 monotonic
store i8 %0, i8* %old
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
+ ; CHECK-T1: blx ___sync_fetch_and_umin_1
%uneg = sub i8 0, 1
%1 = atomicrmw umin i8* %val, i8 %uneg monotonic
store i8 %1, i8* %old
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
+ ; CHECK-T1: blx ___sync_fetch_and_umax_1
%2 = atomicrmw umax i8* %val, i8 1 monotonic
store i8 %2, i8* %old
; CHECK: ldrex
; CHECK: cmp
; CHECK: strex
+ ; CHECK-T1: blx ___sync_fetch_and_umax_1
%3 = atomicrmw umax i8* %val, i8 0 monotonic
store i8 %3, i8* %old
ret void
diff --git a/test/CodeGen/ARM/atomic-ops-v8.ll b/test/CodeGen/ARM/atomic-ops-v8.ll
new file mode 100644
index 0000000..3f93929
--- /dev/null
+++ b/test/CodeGen/ARM/atomic-ops-v8.ll
@@ -0,0 +1,1344 @@
+; RUN: llc -mtriple=armv8-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=thumbv8-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+
+@var8 = global i8 0
+@var16 = global i16 0
+@var32 = global i32 0
+@var64 = global i64 0
+
+define i8 @test_atomic_load_add_i8(i8 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_add_i8:
+ %old = atomicrmw add i8* @var8, i8 %offset seq_cst
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: add{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: stlexb [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i8 %old
+}
+
+define i16 @test_atomic_load_add_i16(i16 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_add_i16:
+ %old = atomicrmw add i16* @var16, i16 %offset acquire
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK: movt r[[ADDR]], :upper16:var16
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: add{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i16 %old
+}
+
+define i32 @test_atomic_load_add_i32(i32 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_add_i32:
+ %old = atomicrmw add i32* @var32, i32 %offset release
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
+; CHECK: movt r[[ADDR]], :upper16:var32
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: add{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i32 %old
+}
+
+define i64 @test_atomic_load_add_i64(i64 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_add_i64:
+ %old = atomicrmw add i64* @var64, i64 %offset monotonic
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+ ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: adds [[NEW1:r[0-9]+]], r[[OLD1]], r0
+; CHECK-NEXT: adc{{(\.w)?}} [[NEW2:r[0-9]+]], r[[OLD2]], r1
+; CHECK-NEXT: strexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD1]]
+; CHECK-NEXT: mov r1, r[[OLD2]]
+ ret i64 %old
+}
+
+define i8 @test_atomic_load_sub_i8(i8 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_sub_i8:
+ %old = atomicrmw sub i8* @var8, i8 %offset monotonic
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: sub{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: strexb [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i8 %old
+}
+
+define i16 @test_atomic_load_sub_i16(i16 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_sub_i16:
+ %old = atomicrmw sub i16* @var16, i16 %offset release
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK: movt r[[ADDR]], :upper16:var16
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: sub{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: stlexh [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i16 %old
+}
+
+define i32 @test_atomic_load_sub_i32(i32 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_sub_i32:
+ %old = atomicrmw sub i32* @var32, i32 %offset acquire
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
+; CHECK: movt r[[ADDR]], :upper16:var32
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: sub{{s?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: strex [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i32 %old
+}
+
+define i64 @test_atomic_load_sub_i64(i64 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_sub_i64:
+ %old = atomicrmw sub i64* @var64, i64 %offset seq_cst
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+ ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: subs [[NEW1:r[0-9]+]], r[[OLD1]], r0
+; CHECK-NEXT: sbc{{(\.w)?}} [[NEW2:r[0-9]+]], r[[OLD2]], r1
+; CHECK-NEXT: stlexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD1]]
+; CHECK-NEXT: mov r1, r[[OLD2]]
+ ret i64 %old
+}
+
+define i8 @test_atomic_load_and_i8(i8 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_and_i8:
+ %old = atomicrmw and i8* @var8, i8 %offset release
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: and{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: stlexb [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i8 %old
+}
+
+define i16 @test_atomic_load_and_i16(i16 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_and_i16:
+ %old = atomicrmw and i16* @var16, i16 %offset monotonic
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK: movt r[[ADDR]], :upper16:var16
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: and{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i16 %old
+}
+
+define i32 @test_atomic_load_and_i32(i32 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_and_i32:
+ %old = atomicrmw and i32* @var32, i32 %offset seq_cst
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
+; CHECK: movt r[[ADDR]], :upper16:var32
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: and{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i32 %old
+}
+
+define i64 @test_atomic_load_and_i64(i64 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_and_i64:
+ %old = atomicrmw and i64* @var64, i64 %offset acquire
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+ ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: and{{(\.w)?}} [[NEW1:r[0-9]+]], r[[OLD1]], r0
+; CHECK-NEXT: and{{(\.w)?}} [[NEW2:r[0-9]+]], r[[OLD2]], r1
+; CHECK-NEXT: strexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD1]]
+; CHECK-NEXT: mov r1, r[[OLD2]]
+ ret i64 %old
+}
+
+define i8 @test_atomic_load_or_i8(i8 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_or_i8:
+ %old = atomicrmw or i8* @var8, i8 %offset seq_cst
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: orr{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: stlexb [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i8 %old
+}
+
+define i16 @test_atomic_load_or_i16(i16 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_or_i16:
+ %old = atomicrmw or i16* @var16, i16 %offset monotonic
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK: movt r[[ADDR]], :upper16:var16
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: orr{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i16 %old
+}
+
+define i32 @test_atomic_load_or_i32(i32 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_or_i32:
+ %old = atomicrmw or i32* @var32, i32 %offset acquire
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
+; CHECK: movt r[[ADDR]], :upper16:var32
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: orr{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: strex [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i32 %old
+}
+
+define i64 @test_atomic_load_or_i64(i64 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_or_i64:
+ %old = atomicrmw or i64* @var64, i64 %offset release
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+ ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: orr{{(\.w)?}} [[NEW1:r[0-9]+]], r[[OLD1]], r0
+; CHECK-NEXT: orr{{(\.w)?}} [[NEW2:r[0-9]+]], r[[OLD2]], r1
+; CHECK-NEXT: stlexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD1]]
+; CHECK-NEXT: mov r1, r[[OLD2]]
+ ret i64 %old
+}
+
+define i8 @test_atomic_load_xor_i8(i8 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_xor_i8:
+ %old = atomicrmw xor i8* @var8, i8 %offset acquire
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: eor{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: strexb [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i8 %old
+}
+
+define i16 @test_atomic_load_xor_i16(i16 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_xor_i16:
+ %old = atomicrmw xor i16* @var16, i16 %offset release
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK: movt r[[ADDR]], :upper16:var16
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: eor{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: stlexh [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i16 %old
+}
+
+define i32 @test_atomic_load_xor_i32(i32 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_xor_i32:
+ %old = atomicrmw xor i32* @var32, i32 %offset seq_cst
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
+; CHECK: movt r[[ADDR]], :upper16:var32
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: eor{{(\.w)?}} [[NEW:r[0-9]+]], r[[OLD]], r0
+; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], [[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i32 %old
+}
+
+define i64 @test_atomic_load_xor_i64(i64 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_xor_i64:
+ %old = atomicrmw xor i64* @var64, i64 %offset monotonic
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+ ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: eor{{(\.w)?}} [[NEW1:r[0-9]+]], r[[OLD1]], r0
+; CHECK-NEXT: eor{{(\.w)?}} [[NEW2:r[0-9]+]], r[[OLD2]], r1
+; CHECK-NEXT: strexd [[STATUS:r[0-9]+]], [[NEW1]], [[NEW2]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD1]]
+; CHECK-NEXT: mov r1, r[[OLD2]]
+ ret i64 %old
+}
+
+define i8 @test_atomic_load_xchg_i8(i8 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_xchg_i8:
+ %old = atomicrmw xchg i8* @var8, i8 %offset monotonic
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: strexb [[STATUS:r[0-9]+]], r0, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i8 %old
+}
+
+define i16 @test_atomic_load_xchg_i16(i16 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_xchg_i16:
+ %old = atomicrmw xchg i16* @var16, i16 %offset seq_cst
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK: movt r[[ADDR]], :upper16:var16
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: stlexh [[STATUS:r[0-9]+]], r0, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i16 %old
+}
+
+define i32 @test_atomic_load_xchg_i32(i32 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_xchg_i32:
+ %old = atomicrmw xchg i32* @var32, i32 %offset release
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
+; CHECK: movt r[[ADDR]], :upper16:var32
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], r0, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i32 %old
+}
+
+define i64 @test_atomic_load_xchg_i64(i64 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_xchg_i64:
+ %old = atomicrmw xchg i64* @var64, i64 %offset acquire
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+ ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: strexd [[STATUS:r[0-9]+]], r0, r1, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD1]]
+; CHECK-NEXT: mov r1, r[[OLD2]]
+ ret i64 %old
+}
+
+define i8 @test_atomic_load_min_i8(i8 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_min_i8:
+ %old = atomicrmw min i8* @var8, i8 %offset acquire
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK-NEXT: sxtb r[[OLDX:[0-9]+]], r[[OLD]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: cmp r[[OLDX]], r0
+; Thumb mode: it ge
+; CHECK: movge r[[OLDX]], r0
+; CHECK-NEXT: strexb [[STATUS:r[0-9]+]], r[[OLDX]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i8 %old
+}
+
+define i16 @test_atomic_load_min_i16(i16 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_min_i16:
+ %old = atomicrmw min i16* @var16, i16 %offset release
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK: movt r[[ADDR]], :upper16:var16
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK-NEXT: sxth r[[OLDX:[0-9]+]], r[[OLD]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: cmp r[[OLDX]], r0
+; Thumb mode: it ge
+; CHECK: movge r[[OLDX]], r0
+; CHECK-NEXT: stlexh [[STATUS:r[0-9]+]], r[[OLDX]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i16 %old
+}
+
+define i32 @test_atomic_load_min_i32(i32 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_min_i32:
+ %old = atomicrmw min i32* @var32, i32 %offset monotonic
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
+; CHECK: movt r[[ADDR]], :upper16:var32
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
+; CHECK-NEXT: cmp r[[OLD]], r0
+; Thumb mode: it lt
+; CHECK: movlt r[[NEW]], r[[OLD]]
+; CHECK-NEXT: strex [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i32 %old
+}
+
+define i64 @test_atomic_load_min_i64(i64 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_min_i64:
+ %old = atomicrmw min i64* @var64, i64 %offset seq_cst
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+ ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: subs [[NEW:r[0-9]+]], r[[OLD1]], r0
+; CHECK-NEXT: sbcs{{(\.w)?}} [[NEW]], r[[OLD2]], r1
+; CHECK-NEXT: blt .LBB{{[0-9]+}}_3
+; CHECK-NEXT: BB#2:
+; CHECK-NEXT: stlexd [[STATUS:r[0-9]+]], r0, r1, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD1]]
+; CHECK-NEXT: mov r1, r[[OLD2]]
+ ret i64 %old
+}
+
+define i8 @test_atomic_load_max_i8(i8 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_max_i8:
+ %old = atomicrmw max i8* @var8, i8 %offset seq_cst
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK-NEXT: sxtb r[[OLDX:[0-9]+]], r[[OLD]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: cmp r[[OLDX]], r0
+; Thumb mode: it le
+; CHECK: movle r[[OLDX]], r0
+; CHECK-NEXT: stlexb [[STATUS:r[0-9]+]], r[[OLDX]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i8 %old
+}
+
+define i16 @test_atomic_load_max_i16(i16 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_max_i16:
+ %old = atomicrmw max i16* @var16, i16 %offset acquire
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK: movt r[[ADDR]], :upper16:var16
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+; CHECK-NEXT: sxth r[[OLDX:[0-9]+]], r[[OLD]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: cmp r[[OLDX]], r0
+; Thumb mode: it le
+; CHECK: movle r[[OLDX]], r0
+; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], r[[OLDX]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i16 %old
+}
+
+define i32 @test_atomic_load_max_i32(i32 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_max_i32:
+ %old = atomicrmw max i32* @var32, i32 %offset release
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
+; CHECK: movt r[[ADDR]], :upper16:var32
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
+; CHECK-NEXT: cmp r[[OLD]], r0
+; Thumb mode: it gt
+; CHECK: movgt r[[NEW]], r[[OLD]]
+; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i32 %old
+}
+
+define i64 @test_atomic_load_max_i64(i64 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_max_i64:
+ %old = atomicrmw max i64* @var64, i64 %offset monotonic
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+ ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: subs [[NEW:r[0-9]+]], r[[OLD1]], r0
+; CHECK-NEXT: sbcs{{(\.w)?}} [[NEW]], r[[OLD2]], r1
+; CHECK-NEXT: bge .LBB{{[0-9]+}}_3
+; CHECK-NEXT: BB#2:
+; CHECK-NEXT: strexd [[STATUS:r[0-9]+]], r0, r1, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD1]]
+; CHECK-NEXT: mov r1, r[[OLD2]]
+ ret i64 %old
+}
+
+define i8 @test_atomic_load_umin_i8(i8 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_umin_i8:
+ %old = atomicrmw umin i8* @var8, i8 %offset monotonic
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
+; CHECK-NEXT: cmp r[[OLD]], r0
+; Thumb mode: it lo
+; CHECK: movlo r[[NEW]], r[[OLD]]
+; CHECK-NEXT: strexb [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i8 %old
+}
+
+define i16 @test_atomic_load_umin_i16(i16 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_umin_i16:
+ %old = atomicrmw umin i16* @var16, i16 %offset acquire
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK: movt r[[ADDR]], :upper16:var16
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
+; CHECK-NEXT: cmp r[[OLD]], r0
+; Thumb mode: it lo
+; CHECK: movlo r[[NEW]], r[[OLD]]
+; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i16 %old
+}
+
+define i32 @test_atomic_load_umin_i32(i32 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_umin_i32:
+ %old = atomicrmw umin i32* @var32, i32 %offset seq_cst
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
+; CHECK: movt r[[ADDR]], :upper16:var32
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
+; CHECK-NEXT: cmp r[[OLD]], r0
+; Thumb mode: it lo
+; CHECK: movlo r[[NEW]], r[[OLD]]
+; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i32 %old
+}
+
+define i64 @test_atomic_load_umin_i64(i64 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_umin_i64:
+ %old = atomicrmw umin i64* @var64, i64 %offset acq_rel
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+ ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: subs [[NEW:r[0-9]+]], r[[OLD1]], r0
+; CHECK-NEXT: sbcs{{(\.w)?}} [[NEW]], r[[OLD2]], r1
+; CHECK-NEXT: blo .LBB{{[0-9]+}}_3
+; CHECK-NEXT: BB#2:
+; CHECK-NEXT: stlexd [[STATUS:r[0-9]+]], r0, r1, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD1]]
+; CHECK-NEXT: mov r1, r[[OLD2]]
+ ret i64 %old
+}
+
+define i8 @test_atomic_load_umax_i8(i8 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_umax_i8:
+ %old = atomicrmw umax i8* @var8, i8 %offset acq_rel
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
+; CHECK-NEXT: cmp r[[OLD]], r0
+; Thumb mode: it hi
+; CHECK: movhi r[[NEW]], r[[OLD]]
+; CHECK-NEXT: stlexb [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i8 %old
+}
+
+define i16 @test_atomic_load_umax_i16(i16 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_umax_i16:
+ %old = atomicrmw umax i16* @var16, i16 %offset monotonic
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK: movt r[[ADDR]], :upper16:var16
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
+; CHECK-NEXT: cmp r[[OLD]], r0
+; Thumb mode: it hi
+; CHECK: movhi r[[NEW]], r[[OLD]]
+; CHECK-NEXT: strexh [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i16 %old
+}
+
+define i32 @test_atomic_load_umax_i32(i32 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_umax_i32:
+ %old = atomicrmw umax i32* @var32, i32 %offset seq_cst
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
+; CHECK: movt r[[ADDR]], :upper16:var32
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaex r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: mov r[[NEW:[0-9]+]], r0
+; CHECK-NEXT: cmp r[[OLD]], r0
+; Thumb mode: it hi
+; CHECK: movhi r[[NEW]], r[[OLD]]
+; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], r[[NEW]], [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i32 %old
+}
+
+define i64 @test_atomic_load_umax_i64(i64 %offset) nounwind {
+; CHECK-LABEL: test_atomic_load_umax_i64:
+ %old = atomicrmw umax i64* @var64, i64 %offset release
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexd r[[OLD1:[0-9]+]], r[[OLD2:[0-9]+]], [r[[ADDR]]]
+ ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: subs [[NEW:r[0-9]+]], r[[OLD1]], r0
+; CHECK-NEXT: sbcs{{(\.w)?}} [[NEW]], r[[OLD2]], r1
+; CHECK-NEXT: bhs .LBB{{[0-9]+}}_3
+; CHECK-NEXT: BB#2:
+; CHECK-NEXT: stlexd [[STATUS:r[0-9]+]], r0, r1, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD1]]
+; CHECK-NEXT: mov r1, r[[OLD2]]
+ ret i64 %old
+}
+
+define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind {
+; CHECK-LABEL: test_atomic_cmpxchg_i8:
+ %old = cmpxchg i8* @var8, i8 %wanted, i8 %new acquire
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexb r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: cmp r[[OLD]], r0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
+; CHECK-NEXT: BB#2:
+ ; As above, r1 is a reasonable guess.
+; CHECK-NEXT: strexb [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i8 %old
+}
+
+define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
+; CHECK-LABEL: test_atomic_cmpxchg_i16:
+ %old = cmpxchg i16* @var16, i16 %wanted, i16 %new seq_cst
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK: movt r[[ADDR]], :upper16:var16
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldaexh r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: cmp r[[OLD]], r0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
+; CHECK-NEXT: BB#2:
+ ; As above, r1 is a reasonable guess.
+; CHECK-NEXT: stlexh [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i16 %old
+}
+
+define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
+; CHECK-LABEL: test_atomic_cmpxchg_i32:
+ %old = cmpxchg i32* @var32, i32 %wanted, i32 %new release
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var32
+; CHECK: movt r[[ADDR]], :upper16:var32
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrex r[[OLD:[0-9]+]], [r[[ADDR]]]
+ ; r0 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: cmp r[[OLD]], r0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
+; CHECK-NEXT: BB#2:
+ ; As above, r1 is a reasonable guess.
+; CHECK-NEXT: stlex [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, r[[OLD]]
+ ret i32 %old
+}
+
+define i64 @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind {
+; CHECK-LABEL: test_atomic_cmpxchg_i64:
+ %old = cmpxchg i64* @var64, i64 %wanted, i64 %new monotonic
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+; CHECK-NEXT: ldrexd [[OLD1:r[0-9]+|lr]], [[OLD2:r[0-9]+|lr]], [r[[ADDR]]]
+ ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK-NEXT: cmp [[OLD1]], r0
+; Thumb mode: it eq
+; CHECK: cmpeq [[OLD2]], r1
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
+; CHECK-NEXT: BB#2:
+ ; As above, r2, r3 is a reasonable guess.
+; CHECK-NEXT: strexd [[STATUS:r[0-9]+]], r2, r3, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+; CHECK: mov r0, [[OLD1]]
+; CHECK-NEXT: mov r1, [[OLD2]]
+ ret i64 %old
+}
+
+define i8 @test_atomic_load_monotonic_i8() nounwind {
+; CHECK-LABEL: test_atomic_load_monotonic_i8:
+ %val = load atomic i8* @var8 monotonic, align 1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+; CHECK: ldrb r0, [r[[ADDR]]]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+ ret i8 %val
+}
+
+define i8 @test_atomic_load_monotonic_regoff_i8(i64 %base, i64 %off) nounwind {
+; CHECK-LABEL: test_atomic_load_monotonic_regoff_i8:
+ %addr_int = add i64 %base, %off
+ %addr = inttoptr i64 %addr_int to i8*
+
+ %val = load atomic i8* %addr monotonic, align 1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: ldrb r0, [r0, r2]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+ ret i8 %val
+}
+
+define i8 @test_atomic_load_acquire_i8() nounwind {
+; CHECK-LABEL: test_atomic_load_acquire_i8:
+ %val = load atomic i8* @var8 acquire, align 1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movt r[[ADDR]], :upper16:var8
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: ldab r0, [r[[ADDR]]]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+ ret i8 %val
+}
+
+define i8 @test_atomic_load_seq_cst_i8() nounwind {
+; CHECK-LABEL: test_atomic_load_seq_cst_i8:
+ %val = load atomic i8* @var8 seq_cst, align 1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movt r[[ADDR]], :upper16:var8
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: ldab r0, [r[[ADDR]]]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+ ret i8 %val
+}
+
+define i16 @test_atomic_load_monotonic_i16() nounwind {
+; CHECK-LABEL: test_atomic_load_monotonic_i16:
+ %val = load atomic i16* @var16 monotonic, align 2
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movt r[[ADDR]], :upper16:var16
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: ldrh r0, [r[[ADDR]]]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+ ret i16 %val
+}
+
+define i32 @test_atomic_load_monotonic_regoff_i32(i64 %base, i64 %off) nounwind {
+; CHECK-LABEL: test_atomic_load_monotonic_regoff_i32:
+ %addr_int = add i64 %base, %off
+ %addr = inttoptr i64 %addr_int to i32*
+
+ %val = load atomic i32* %addr monotonic, align 4
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: ldr r0, [r0, r2]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+ ret i32 %val
+}
+
+define i64 @test_atomic_load_seq_cst_i64() nounwind {
+; CHECK-LABEL: test_atomic_load_seq_cst_i64:
+ %val = load atomic i64* @var64 seq_cst, align 8
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movt r[[ADDR]], :upper16:var64
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: ldaexd r0, r1, [r[[ADDR]]]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+ ret i64 %val
+}
+
+define void @test_atomic_store_monotonic_i8(i8 %val) nounwind {
+; CHECK-LABEL: test_atomic_store_monotonic_i8:
+ store atomic i8 %val, i8* @var8 monotonic, align 1
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK: movt r[[ADDR]], :upper16:var8
+; CHECK: strb r0, [r[[ADDR]]]
+
+ ret void
+}
+
+define void @test_atomic_store_monotonic_regoff_i8(i64 %base, i64 %off, i8 %val) nounwind {
+; CHECK-LABEL: test_atomic_store_monotonic_regoff_i8:
+
+ %addr_int = add i64 %base, %off
+ %addr = inttoptr i64 %addr_int to i8*
+
+ store atomic i8 %val, i8* %addr monotonic, align 1
+; CHECK: ldrb{{(\.w)?}} [[VAL:r[0-9]+]], [sp]
+; CHECK: strb [[VAL]], [r0, r2]
+
+ ret void
+}
+
+define void @test_atomic_store_release_i8(i8 %val) nounwind {
+; CHECK-LABEL: test_atomic_store_release_i8:
+ store atomic i8 %val, i8* @var8 release, align 1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movt r[[ADDR]], :upper16:var8
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: stlb r0, [r[[ADDR]]]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+ ret void
+}
+
+define void @test_atomic_store_seq_cst_i8(i8 %val) nounwind {
+; CHECK-LABEL: test_atomic_store_seq_cst_i8:
+ store atomic i8 %val, i8* @var8 seq_cst, align 1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var8
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movt r[[ADDR]], :upper16:var8
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: stlb r0, [r[[ADDR]]]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+ ret void
+}
+
+define void @test_atomic_store_monotonic_i16(i16 %val) nounwind {
+; CHECK-LABEL: test_atomic_store_monotonic_i16:
+ store atomic i16 %val, i16* @var16 monotonic, align 2
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var16
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movt r[[ADDR]], :upper16:var16
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: strh r0, [r[[ADDR]]]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+ ret void
+}
+
+define void @test_atomic_store_monotonic_regoff_i32(i64 %base, i64 %off, i32 %val) nounwind {
+; CHECK-LABEL: test_atomic_store_monotonic_regoff_i32:
+
+ %addr_int = add i64 %base, %off
+ %addr = inttoptr i64 %addr_int to i32*
+
+ store atomic i32 %val, i32* %addr monotonic, align 4
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: ldr [[VAL:r[0-9]+]], [sp]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: str [[VAL]], [r0, r2]
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+ ret void
+}
+
+define void @test_atomic_store_release_i64(i64 %val) nounwind {
+; CHECK-LABEL: test_atomic_store_release_i64:
+ store atomic i64 %val, i64* @var64 release, align 8
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+; CHECK: movw r[[ADDR:[0-9]+]], :lower16:var64
+; CHECK: movt r[[ADDR]], :upper16:var64
+
+; CHECK: .LBB{{[0-9]+}}_1:
+ ; r0, r1 below is a reasonable guess but could change: it certainly comes into the
+ ; function there.
+; CHECK: stlexd [[STATUS:r[0-9]+]], r0, r1, [r[[ADDR]]]
+; CHECK-NEXT: cmp [[STATUS]], #0
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
+; CHECK-NOT: dmb
+; CHECK-NOT: mcr
+
+ ret void
+}
+
+define i32 @not.barriers(i32* %var, i1 %cond) {
+; CHECK-LABEL: not.barriers:
+ br i1 %cond, label %atomic_ver, label %simple_ver
+simple_ver:
+ %oldval = load i32* %var
+ %newval = add nsw i32 %oldval, -1
+ store i32 %newval, i32* %var
+ br label %somewhere
+atomic_ver:
+ fence seq_cst
+ %val = atomicrmw add i32* %var, i32 -1 monotonic
+ fence seq_cst
+ br label %somewhere
+; CHECK: dmb
+; CHECK: ldrex
+; CHECK: dmb
+ ; The key point here is that the second dmb isn't immediately followed by the
+ ; simple_ver basic block, which LLVM attempted to do when DMB had been marked
+ ; with isBarrier. For now, look for something that looks like "somewhere".
+; CHECK-NEXT: mov
+somewhere:
+ %combined = phi i32 [ %val, %atomic_ver ], [ %newval, %simple_ver]
+ ret i32 %combined
+}
diff --git a/test/CodeGen/ARM/build-attributes-encoding.s b/test/CodeGen/ARM/build-attributes-encoding.s
new file mode 100644
index 0000000..5ad51b2
--- /dev/null
+++ b/test/CodeGen/ARM/build-attributes-encoding.s
@@ -0,0 +1,85 @@
+// This tests that ARM attributes are properly encoded.
+
+// RUN: llvm-mc < %s -triple=arm-linux-gnueabi -filetype=obj -o - \
+// RUN: | llvm-readobj -s -sd | FileCheck %s
+
+// Tag_CPU_name (=5)
+.cpu Cortex-A8
+
+// Tag_CPU_arch (=6)
+.eabi_attribute 6, 10
+
+// Tag_arch_profile (=7)
+.eabi_attribute 7, 'A'
+
+// Tag_ARM_ISA_use (=8)
+.eabi_attribute 8, 1
+
+// Tag_THUMB_ISA_use (=9)
+.eabi_attribute 9, 2
+
+// Tag_FP_arch (=10)
+.fpu vfpv3
+
+// Tag_Advanced_SIMD_arch (=12)
+.eabi_attribute 12, 2
+
+// Tag_ABI_FP_denormal (=20)
+.eabi_attribute 20, 1
+
+// Tag_ABI_FP_exceptions (=21)
+.eabi_attribute 21, 1
+
+// Tag_ABI_FP_number_model (=23)
+.eabi_attribute 23, 1
+
+// Tag_ABI_align_needed (=24)
+.eabi_attribute 24, 1
+
+// Tag_ABI_align_preserved (=25)
+.eabi_attribute 25, 1
+
+// Tag_ABI_HardFP_use (=27)
+.eabi_attribute 27, 0
+
+// Tag_ABI_VFP_args (=28)
+.eabi_attribute 28, 1
+
+// Tag_FP_HP_extension (=36)
+.eabi_attribute 36, 1
+
+// Tag_MPextension_use (=42)
+.eabi_attribute 42, 1
+
+// Tag_DIV_use (=44)
+.eabi_attribute 44, 2
+
+// Tag_Virtualization_use (=68)
+.eabi_attribute 68, 3
+
+// Check that values > 128 are encoded properly
+.eabi_attribute 110, 160
+
+// Check that tags > 128 are encoded properly
+.eabi_attribute 129, 1
+.eabi_attribute 250, 1
+
+// CHECK: Section {
+// CHECK: Name: .ARM.attributes
+// CHECK-NEXT: Type: SHT_ARM_ATTRIBUTES
+// CHECK-NEXT: Flags [ (0x0)
+// CHECK-NEXT: ]
+// CHECK-NEXT: Address: 0x0
+// CHECK-NEXT: Offset: 0x34
+// CHECK-NEXT: Size: 70
+// CHECK-NEXT: Link: 0
+// CHECK-NEXT: Info: 0
+// CHECK-NEXT: AddressAlignment: 1
+// CHECK-NEXT: EntrySize: 0
+// CHECK-NEXT: SectionData (
+// CHECK-NEXT: 0000: 41450000 00616561 62690001 3B000000
+// CHECK-NEXT: 0010: 05434F52 5445582D 41380006 0A074108
+// CHECK-NEXT: 0020: 0109020A 030C0214 01150117 01180119
+// CHECK-NEXT: 0030: 011B001C 0124012A 012C0244 036EA001
+// CHECK-NEXT: 0040: 810101FA 0101
+// CHECK-NEXT: )
diff --git a/test/CodeGen/ARM/byval_load_align.ll b/test/CodeGen/ARM/byval_load_align.ll
new file mode 100644
index 0000000..2c0910c
--- /dev/null
+++ b/test/CodeGen/ARM/byval_load_align.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -mtriple thumbv7-apple-ios -O1 | FileCheck %s
+
+; rdar://15144402
+; Make sure we don't assume 4-byte alignment when loading from a byval argument
+; with alignment of 2.
+; CHECK: ldr r1, [r[[REG:[0-9]+]]]
+; CHECK: ldr r2, [r[[REG]], #4]
+; CHECK: ldr r3, [r[[REG]], #8]
+; CHECK-NOT: ldm
+; CHECK: .align 1 @ @sID
+
+%struct.ModuleID = type { [32 x i8], [32 x i8], i16 }
+
+@sID = internal constant %struct.ModuleID { [32 x i8] c"TEST\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00", [32 x i8] c"1.0\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00", i16 23 }, align 2
+
+; Function Attrs: nounwind ssp
+define void @Client() #0 {
+entry:
+ tail call void @Logger(i8 signext 97, %struct.ModuleID* byval @sID) #2
+ ret void
+}
+
+declare void @Logger(i8 signext, %struct.ModuleID* byval) #1
+
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/ARM/coalesce-dbgvalue.ll b/test/CodeGen/ARM/coalesce-dbgvalue.ll
index d4be6ee..86106a0 100644
--- a/test/CodeGen/ARM/coalesce-dbgvalue.ll
+++ b/test/CodeGen/ARM/coalesce-dbgvalue.ll
@@ -17,7 +17,7 @@ target triple = "thumbv7-apple-ios3.0.0"
; Function Attrs: nounwind ssp
define i32 @pr16110() #0 {
for.cond1.preheader:
- store i32 0, i32* @c, align 4, !dbg !21, !tbaa !23
+ store i32 0, i32* @c, align 4, !dbg !21
br label %for.cond1.outer, !dbg !26
for.cond1: ; preds = %for.end9, %for.cond1.outer
@@ -26,9 +26,9 @@ for.cond1: ; preds = %for.end9, %for.cond
br i1 %cmp, label %for.body2, label %for.end9, !dbg !26
for.body2: ; preds = %for.cond1
- store i32 %storemerge11, i32* @b, align 4, !dbg !26, !tbaa !23
+ store i32 %storemerge11, i32* @b, align 4, !dbg !26
tail call void @llvm.dbg.value(metadata !27, i64 0, metadata !11), !dbg !28
- %0 = load i64* @a, align 8, !dbg !29, !tbaa !30
+ %0 = load i64* @a, align 8, !dbg !29
%xor = xor i64 %0, %e.1.ph, !dbg !29
%conv3 = trunc i64 %xor to i32, !dbg !29
tail call void @llvm.dbg.value(metadata !{i32 %conv3}, i64 0, metadata !10), !dbg !29
@@ -44,7 +44,7 @@ land.end: ; preds = %land.rhs, %for.body
%1 = phi i1 [ false, %for.body2 ], [ %tobool5, %land.rhs ]
%land.ext = zext i1 %1 to i32
%call6 = tail call i32 bitcast (i32 (...)* @fn2 to i32 (i32, i32*)*)(i32 %land.ext, i32* null) #3
- %2 = load i32* @b, align 4, !dbg !26, !tbaa !23
+ %2 = load i32* @b, align 4, !dbg !26
%inc8 = add nsw i32 %2, 1, !dbg !26
%phitmp = and i64 %xor, 4294967295, !dbg !26
br label %for.cond1.outer, !dbg !26
@@ -52,7 +52,7 @@ land.end: ; preds = %land.rhs, %for.body
for.cond1.outer: ; preds = %land.end, %for.cond1.preheader
%storemerge11.ph = phi i32 [ %inc8, %land.end ], [ 0, %for.cond1.preheader ]
%e.1.ph = phi i64 [ %phitmp, %land.end ], [ 0, %for.cond1.preheader ]
- %3 = load i32* @d, align 4, !dbg !31, !tbaa !23
+ %3 = load i32* @d, align 4, !dbg !31
%tobool10 = icmp eq i32 %3, 0, !dbg !31
br label %for.cond1
@@ -60,7 +60,7 @@ for.end9: ; preds = %for.cond1
br i1 %tobool10, label %if.end, label %for.cond1, !dbg !31
if.end: ; preds = %for.end9
- store i32 %storemerge11, i32* @b, align 4, !dbg !26, !tbaa !23
+ store i32 %storemerge11, i32* @b, align 4, !dbg !26
ret i32 0, !dbg !32
}
@@ -71,12 +71,13 @@ declare i32 @fn3(...) #1
; Function Attrs: nounwind readnone
declare void @llvm.dbg.value(metadata, i64, metadata) #2
-attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }
!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!33}
!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 182024) (llvm/trunk 182023)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !15, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/d/b/pr16110.c] [DW_LANG_C99]
!1 = metadata !{metadata !"pr16110.c", metadata !"/d/b"}
@@ -84,7 +85,7 @@ attributes #3 = { nounwind }
!3 = metadata !{metadata !4}
!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"pr16110", metadata !"pr16110", metadata !"", i32 7, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 ()* @pr16110, null, null, metadata !9, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [pr16110]
!5 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/d/b/pr16110.c]
-!6 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!7 = metadata !{metadata !8}
!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
!9 = metadata !{metadata !10, metadata !11}
@@ -101,13 +102,10 @@ attributes #3 = { nounwind }
!20 = metadata !{i32 786484, i32 0, null, metadata !"d", metadata !"d", metadata !"", metadata !5, i32 4, metadata !8, i32 0, i32 1, i32* @d, null} ; [ DW_TAG_variable ] [d] [line 4] [def]
!21 = metadata !{i32 10, i32 0, metadata !22, null}
!22 = metadata !{i32 786443, metadata !1, metadata !4, i32 10, i32 0, i32 0} ; [ DW_TAG_lexical_block ] [/d/b/pr16110.c]
-!23 = metadata !{metadata !"int", metadata !24}
-!24 = metadata !{metadata !"omnipotent char", metadata !25}
-!25 = metadata !{metadata !"Simple C/C++ TBAA"}
!26 = metadata !{i32 12, i32 0, metadata !13, null}
!27 = metadata !{i32* null}
!28 = metadata !{i32 13, i32 0, metadata !12, null}
!29 = metadata !{i32 14, i32 0, metadata !12, null}
-!30 = metadata !{metadata !"long long", metadata !24}
!31 = metadata !{i32 16, i32 0, metadata !4, null}
!32 = metadata !{i32 18, i32 0, metadata !4, null}
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/constantfp.ll b/test/CodeGen/ARM/constantfp.ll
new file mode 100644
index 0000000..974bdd7
--- /dev/null
+++ b/test/CodeGen/ARM/constantfp.ll
@@ -0,0 +1,68 @@
+; RUN: llc -mtriple=armv7 -mattr=+neon -mcpu=swift %s -o - | FileCheck %s
+; RUN: llc -mtriple=armv7 -mattr=+neon -mcpu=cortex-a8 %s -o - | FileCheck --check-prefix=CHECK-NONEONFP %s
+; RUN: llc -mtriple=armv7 -mattr=-neon -mcpu=cortex-a8 %s -o - | FileCheck --check-prefix=CHECK-NONEON %s
+
+define arm_aapcs_vfpcc float @test_vmov_f32() {
+; CHECK-LABEL: test_vmov_f32:
+; CHECK: vmov.f32 d0, #1.0
+
+; CHECK-NONEONFP: vmov.f32 s0, #1.0
+ ret float 1.0
+}
+
+define arm_aapcs_vfpcc float @test_vmov_imm() {
+; CHECK-LABEL: test_vmov_imm:
+; CHECK: vmov.i32 d0, #0
+
+; CHECK-NONEON-LABEL: test_vmov_imm:
+; CHECK-NONEON: vldr s0, {{.?LCPI[0-9]+_[0-9]+}}
+ ret float 0.0
+}
+
+define arm_aapcs_vfpcc float @test_vmvn_imm() {
+; CHECK-LABEL: test_vmvn_imm:
+; CHECK: vmvn.i32 d0, #0xb0000000
+
+; CHECK-NONEON-LABEL: test_vmvn_imm:
+; CHECK-NONEON: vldr s0, {{.?LCPI[0-9]+_[0-9]+}}
+ ret float 8589934080.0
+}
+
+define arm_aapcs_vfpcc double @test_vmov_f64() {
+; CHECK-LABEL: test_vmov_f64:
+; CHECK: vmov.f64 d0, #1.0
+
+; CHECK-NONEON-LABEL: test_vmov_f64:
+; CHECK-NONEON: vmov.f64 d0, #1.0
+
+ ret double 1.0
+}
+
+define arm_aapcs_vfpcc double @test_vmov_double_imm() {
+; CHECK-LABEL: test_vmov_double_imm:
+; CHECK: vmov.i32 d0, #0
+
+; CHECK-NONEON-LABEL: test_vmov_double_imm:
+; CHECK-NONEON: vldr d0, {{.?LCPI[0-9]+_[0-9]+}}
+ ret double 0.0
+}
+
+define arm_aapcs_vfpcc double @test_vmvn_double_imm() {
+; CHECK-LABEL: test_vmvn_double_imm:
+; CHECK: vmvn.i32 d0, #0xb0000000
+
+; CHECK-NONEON-LABEL: test_vmvn_double_imm:
+; CHECK-NONEON: vldr d0, {{.?LCPI[0-9]+_[0-9]+}}
+ ret double 0x4fffffff4fffffff
+}
+
+; Make sure we don't ignore the high half of 64-bit values when deciding whether
+; a vmov/vmvn is possible.
+define arm_aapcs_vfpcc double @test_notvmvn_double_imm() {
+; CHECK-LABEL: test_notvmvn_double_imm:
+; CHECK: vldr d0, {{.?LCPI[0-9]+_[0-9]+}}
+
+; CHECK-NONEON-LABEL: test_notvmvn_double_imm:
+; CHECK-NONEON: vldr d0, {{.?LCPI[0-9]+_[0-9]+}}
+ ret double 0x4fffffffffffffff
+}
diff --git a/test/CodeGen/ARM/dagcombine-concatvector.ll b/test/CodeGen/ARM/dagcombine-concatvector.ll
index d8c6c64..2927ea2 100644
--- a/test/CodeGen/ARM/dagcombine-concatvector.ll
+++ b/test/CodeGen/ARM/dagcombine-concatvector.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv7s-apple-ios3.0.0 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7s-apple-ios3.0.0 -mcpu=generic | FileCheck %s
; PR15525
; CHECK-LABEL: test1:
diff --git a/test/CodeGen/ARM/darwin-eabi.ll b/test/CodeGen/ARM/darwin-eabi.ll
new file mode 100644
index 0000000..f2cde71
--- /dev/null
+++ b/test/CodeGen/ARM/darwin-eabi.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=thumbv7m-apple-darwin -mcpu=cortex-m3 < %s | FileCheck %s --check-prefix=CHECK-M3
+; RUN: llc -mtriple=thumbv7em-apple-darwin -mcpu=cortex-m4 < %s | FileCheck %s --check-prefix=CHECK-M4
+; RUN: llc -mtriple=thumbv7-apple-darwin -mcpu=cortex-m3 < %s | FileCheck %s --check-prefix=CHECK-M3
+; RUN: llc -mtriple=thumbv7-apple-darwin -mcpu=cortex-m4 < %s | FileCheck %s --check-prefix=CHECK-M4
+
+define float @float_op(float %lhs, float %rhs) {
+ %sum = fadd float %lhs, %rhs
+ ret float %sum
+; CHECK-M3-LABEL: float_op:
+; CHECK-M3: blx ___addsf3
+
+; CHECK-M4-LABEL: float_op:
+; CHECK-M4: vadd.f32
+}
+
+define double @double_op(double %lhs, double %rhs) {
+ %sum = fadd double %lhs, %rhs
+ ret double %sum
+; CHECK-M3-LABEL: double_op:
+; CHECK-M3: blx ___adddf3
+
+; CHECK-M4-LABEL: double_op:
+; CHECK-M4: blx ___adddf3
+}
diff --git a/test/CodeGen/ARM/debug-info-arg.ll b/test/CodeGen/ARM/debug-info-arg.ll
index 89ccb20..e8bf3ba 100644
--- a/test/CodeGen/ARM/debug-info-arg.ll
+++ b/test/CodeGen/ARM/debug-info-arg.ll
@@ -30,15 +30,16 @@ declare void @foobar(i64, i64)
declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!33}
!0 = metadata !{i32 786449, metadata !32, i32 12, metadata !"Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)", i1 true, metadata !"", i32 0, metadata !4, metadata !4, metadata !30, null, null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !2, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 11, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, void (%struct.tag_s*, %struct.tag_s*, i64, i64, %struct.tag_s*, %struct.tag_s*)* @foo, null, null, metadata !31, i32 11} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 786478, metadata !2, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 11, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (%struct.tag_s*, %struct.tag_s*, i64, i64, %struct.tag_s*, %struct.tag_s*)* @foo, null, null, metadata !31, i32 11} ; [ DW_TAG_subprogram ] [line 11] [def] [foo]
!2 = metadata !{i32 786473, metadata !32} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !32, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !32, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!4 = metadata !{null}
!5 = metadata !{i32 786689, metadata !1, metadata !"this", metadata !2, i32 16777227, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
!6 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !7} ; [ DW_TAG_pointer_type ]
-!7 = metadata !{i32 786451, metadata !32, metadata !0, metadata !"tag_s", i32 5, i64 96, i64 32, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!7 = metadata !{i32 786451, metadata !32, metadata !0, metadata !"tag_s", i32 5, i64 96, i64 32, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [tag_s] [line 5, size 96, align 32, offset 0] [def] [from ]
!8 = metadata !{metadata !9, metadata !11, metadata !12}
!9 = metadata !{i32 786445, metadata !32, metadata !7, metadata !"x", i32 6, i64 32, i64 32, i64 0, i32 0, metadata !10} ; [ DW_TAG_member ]
!10 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
@@ -64,3 +65,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!30 = metadata !{metadata !1}
!31 = metadata !{metadata !5, metadata !13, metadata !14, metadata !17, metadata !18, metadata !19}
!32 = metadata !{metadata !"one.c", metadata !"/Volumes/Athwagate/R10048772"}
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/debug-info-blocks.ll b/test/CodeGen/ARM/debug-info-blocks.ll
index bd55786..6cbe4b4 100644
--- a/test/CodeGen/ARM/debug-info-blocks.ll
+++ b/test/CodeGen/ARM/debug-info-blocks.ll
@@ -93,37 +93,38 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
}
!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!162}
!0 = metadata !{i32 786449, metadata !153, i32 16, metadata !"Apple clang version 2.1", i1 false, metadata !"", i32 2, metadata !147, metadata !26, metadata !148, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786433, metadata !160, metadata !0, metadata !"", i32 248, i64 32, i64 32, i32 0, i32 0, i32 0, metadata !3, i32 0, i32 0} ; [ DW_TAG_enumeration_type ]
+!1 = metadata !{i32 786436, metadata !160, metadata !0, metadata !"", i32 248, i64 32, i64 32, i32 0, i32 0, null, metadata !3, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 248, size 32, align 32, offset 0] [def] [from ]
!2 = metadata !{i32 786473, metadata !160} ; [ DW_TAG_file_type ]
!3 = metadata !{metadata !4}
!4 = metadata !{i32 786472, metadata !"Ver1", i64 0} ; [ DW_TAG_enumerator ]
-!5 = metadata !{i32 786433, metadata !160, metadata !0, metadata !"Mode", i32 79, i64 32, i64 32, i32 0, i32 0, i32 0, metadata !7, i32 0, i32 0} ; [ DW_TAG_enumeration_type ]
+!5 = metadata !{i32 786436, metadata !160, metadata !0, metadata !"Mode", i32 79, i64 32, i64 32, i32 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [Mode] [line 79, size 32, align 32, offset 0] [def] [from ]
!6 = metadata !{i32 786473, metadata !161} ; [ DW_TAG_file_type ]
!7 = metadata !{metadata !8}
!8 = metadata !{i32 786472, metadata !"One", i64 0} ; [ DW_TAG_enumerator ]
-!9 = metadata !{i32 786433, metadata !149, metadata !0, metadata !"", i32 15, i64 32, i64 32, i32 0, i32 0, i32 0, metadata !11, i32 0, i32 0} ; [ DW_TAG_enumeration_type ]
+!9 = metadata !{i32 786436, metadata !149, metadata !0, metadata !"", i32 15, i64 32, i64 32, i32 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 15, size 32, align 32, offset 0] [def] [from ]
!10 = metadata !{i32 786473, metadata !149} ; [ DW_TAG_file_type ]
!11 = metadata !{metadata !12, metadata !13}
!12 = metadata !{i32 786472, metadata !"Unknown", i64 0} ; [ DW_TAG_enumerator ]
!13 = metadata !{i32 786472, metadata !"Known", i64 1} ; [ DW_TAG_enumerator ]
-!14 = metadata !{i32 786433, metadata !150, metadata !0, metadata !"", i32 20, i64 32, i64 32, i32 0, i32 0, i32 0, metadata !16, i32 0, i32 0} ; [ DW_TAG_enumeration_type ]
+!14 = metadata !{i32 786436, metadata !150, metadata !0, metadata !"", i32 20, i64 32, i64 32, i32 0, i32 0, null, metadata !16, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
!15 = metadata !{i32 786473, metadata !150} ; [ DW_TAG_file_type ]
!16 = metadata !{metadata !17, metadata !18}
!17 = metadata !{i32 786472, metadata !"Single", i64 0} ; [ DW_TAG_enumerator ]
!18 = metadata !{i32 786472, metadata !"Double", i64 1} ; [ DW_TAG_enumerator ]
-!19 = metadata !{i32 786433, metadata !151, metadata !0, metadata !"", i32 14, i64 32, i64 32, i32 0, i32 0, i32 0, metadata !21, i32 0, i32 0} ; [ DW_TAG_enumeration_type ]
+!19 = metadata !{i32 786436, metadata !151, metadata !0, metadata !"", i32 14, i64 32, i64 32, i32 0, i32 0, null, metadata !21, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 14, size 32, align 32, offset 0] [def] [from ]
!20 = metadata !{i32 786473, metadata !151} ; [ DW_TAG_file_type ]
!21 = metadata !{metadata !22}
!22 = metadata !{i32 786472, metadata !"Eleven", i64 0} ; [ DW_TAG_enumerator ]
-!23 = metadata !{i32 786478, metadata !152, metadata !24, metadata !"foobar_func_block_invoke_0", metadata !"foobar_func_block_invoke_0", metadata !"", i32 609, metadata !25, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void (i8*, %0*, [4 x i32], [4 x i32])* @foobar_func_block_invoke_0, null, null, null, i32 609} ; [ DW_TAG_subprogram ]
+!23 = metadata !{i32 786478, metadata !152, metadata !24, metadata !"foobar_func_block_invoke_0", metadata !"foobar_func_block_invoke_0", metadata !"", i32 609, metadata !25, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i8*, %0*, [4 x i32], [4 x i32])* @foobar_func_block_invoke_0, null, null, null, i32 609} ; [ DW_TAG_subprogram ] [line 609] [local] [def] [foobar_func_block_invoke_0]
!24 = metadata !{i32 786473, metadata !152} ; [ DW_TAG_file_type ]
-!25 = metadata !{i32 786453, metadata !152, metadata !24, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !26, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!25 = metadata !{i32 786453, metadata !152, metadata !24, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !26, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!26 = metadata !{null}
!27 = metadata !{i32 786689, metadata !23, metadata !".block_descriptor", metadata !24, i32 16777825, metadata !28, i32 64, null} ; [ DW_TAG_arg_variable ]
!28 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 0, i64 0, i32 0, metadata !29} ; [ DW_TAG_pointer_type ]
-!29 = metadata !{i32 786451, metadata !152, metadata !24, metadata !"__block_literal_14", i32 609, i64 256, i64 32, i32 0, i32 0, i32 0, metadata !30, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!29 = metadata !{i32 786451, metadata !152, metadata !24, metadata !"__block_literal_14", i32 609, i64 256, i64 32, i32 0, i32 0, null, metadata !30, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_literal_14] [line 609, size 256, align 32, offset 0] [def] [from ]
!30 = metadata !{metadata !31, metadata !33, metadata !35, metadata !36, metadata !37, metadata !48, metadata !89, metadata !124}
!31 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__isa", i32 609, i64 32, i64 32, i64 0, i32 0, metadata !32} ; [ DW_TAG_member ]
!32 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
@@ -133,7 +134,7 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
!36 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__FuncPtr", i32 609, i64 32, i64 32, i64 96, i32 0, metadata !32} ; [ DW_TAG_member ]
!37 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__descriptor", i32 609, i64 32, i64 32, i64 128, i32 0, metadata !38} ; [ DW_TAG_member ]
!38 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !39} ; [ DW_TAG_pointer_type ]
-!39 = metadata !{i32 786451, metadata !153, metadata !0, metadata !"__block_descriptor_withcopydispose", i32 307, i64 128, i64 32, i32 0, i32 0, i32 0, metadata !41, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!39 = metadata !{i32 786451, metadata !153, metadata !0, metadata !"__block_descriptor_withcopydispose", i32 307, i64 128, i64 32, i32 0, i32 0, null, metadata !41, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [__block_descriptor_withcopydispose] [line 307, size 128, align 32, offset 0] [def] [from ]
!40 = metadata !{i32 786473, metadata !153} ; [ DW_TAG_file_type ]
!41 = metadata !{metadata !42, metadata !44, metadata !45, metadata !47}
!42 = metadata !{i32 786445, metadata !153, metadata !40, metadata !"reserved", i32 307, i64 32, i64 32, i64 0, i32 0, metadata !43} ; [ DW_TAG_member ]
@@ -144,7 +145,7 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
!47 = metadata !{i32 786445, metadata !153, metadata !40, metadata !"DestroyFuncPtr", i32 307, i64 32, i64 32, i64 96, i32 0, metadata !46} ; [ DW_TAG_member ]
!48 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"mydata", i32 609, i64 32, i64 32, i64 160, i32 0, metadata !49} ; [ DW_TAG_member ]
!49 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 0, i64 0, i32 0, metadata !50} ; [ DW_TAG_pointer_type ]
-!50 = metadata !{i32 786451, metadata !152, metadata !24, metadata !"", i32 0, i64 224, i64 0, i32 0, i32 16, i32 0, metadata !51, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!50 = metadata !{i32 786451, metadata !152, metadata !24, metadata !"", i32 0, i64 224, i64 0, i32 0, i32 16, null, metadata !51, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [line 0, size 224, align 0, offset 0] [def] [from ]
!51 = metadata !{metadata !52, metadata !53, metadata !54, metadata !55, metadata !56, metadata !57, metadata !58}
!52 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__isa", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !32} ; [ DW_TAG_member ]
!53 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__forwarding", i32 0, i64 32, i64 32, i64 32, i32 0, metadata !32} ; [ DW_TAG_member ]
@@ -154,17 +155,17 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
!57 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"__destroy_helper", i32 0, i64 32, i64 32, i64 160, i32 0, metadata !32} ; [ DW_TAG_member ]
!58 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"mydata", i32 0, i64 32, i64 32, i64 192, i32 0, metadata !59} ; [ DW_TAG_member ]
!59 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !60} ; [ DW_TAG_pointer_type ]
-!60 = metadata !{i32 786451, metadata !154, metadata !24, metadata !"UIMydata", i32 26, i64 128, i64 32, i32 0, i32 0, i32 0, metadata !62, i32 16, i32 0} ; [ DW_TAG_structure_type ]
+!60 = metadata !{i32 786451, metadata !154, metadata !24, metadata !"UIMydata", i32 26, i64 128, i64 32, i32 0, i32 0, null, metadata !62, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [UIMydata] [line 26, size 128, align 32, offset 0] [def] [from ]
!61 = metadata !{i32 786473, metadata !154} ; [ DW_TAG_file_type ]
!62 = metadata !{metadata !63, metadata !71, metadata !75, metadata !79}
!63 = metadata !{i32 786460, metadata !60, null, metadata !61, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !64} ; [ DW_TAG_inheritance ]
-!64 = metadata !{i32 786451, metadata !155, metadata !40, metadata !"NSO", i32 66, i64 32, i64 32, i32 0, i32 0, i32 0, metadata !66, i32 16, i32 0} ; [ DW_TAG_structure_type ]
+!64 = metadata !{i32 786451, metadata !155, metadata !40, metadata !"NSO", i32 66, i64 32, i64 32, i32 0, i32 0, null, metadata !66, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [NSO] [line 66, size 32, align 32, offset 0] [def] [from ]
!65 = metadata !{i32 786473, metadata !155} ; [ DW_TAG_file_type ]
!66 = metadata !{metadata !67}
!67 = metadata !{i32 786445, metadata !155, metadata !65, metadata !"isa", i32 67, i64 32, i64 32, i64 0, i32 2, metadata !68, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
!68 = metadata !{i32 786454, metadata !153, metadata !0, metadata !"Class", i32 197, i64 0, i64 0, i64 0, i32 0, metadata !69} ; [ DW_TAG_typedef ]
!69 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !70} ; [ DW_TAG_pointer_type ]
-!70 = metadata !{i32 786451, metadata !153, metadata !0, metadata !"objc_class", i32 0, i64 0, i64 0, i32 0, i32 4, i32 0, null, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!70 = metadata !{i32 786451, metadata !153, metadata !0, metadata !"objc_class", i32 0, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [objc_class] [line 0, size 0, align 0, offset 0] [decl] [from ]
!71 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"_mydataRef", i32 28, i64 32, i64 32, i64 32, i32 0, metadata !72, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
!72 = metadata !{i32 786454, metadata !152, metadata !0, metadata !"CFTypeRef", i32 313, i64 0, i64 0, i64 0, i32 0, metadata !73} ; [ DW_TAG_typedef ]
!73 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !74} ; [ DW_TAG_pointer_type ]
@@ -174,7 +175,7 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
!77 = metadata !{i32 786473, metadata !156} ; [ DW_TAG_file_type ]
!78 = metadata !{i32 786468, null, metadata !0, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
!79 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"_mydataFlags", i32 37, i64 8, i64 8, i64 96, i32 0, metadata !80, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
-!80 = metadata !{i32 786451, metadata !154, metadata !0, metadata !"", i32 30, i64 8, i64 8, i32 0, i32 0, i32 0, metadata !81, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!80 = metadata !{i32 786451, metadata !154, metadata !0, metadata !"", i32 30, i64 8, i64 8, i32 0, i32 0, null, metadata !81, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [line 30, size 8, align 8, offset 0] [def] [from ]
!81 = metadata !{metadata !82, metadata !84, metadata !85, metadata !86, metadata !87, metadata !88}
!82 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"named", i32 31, i64 1, i64 32, i64 0, i32 0, metadata !83} ; [ DW_TAG_member ]
!83 = metadata !{i32 786468, null, metadata !0, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
@@ -185,10 +186,10 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
!88 = metadata !{i32 786445, metadata !154, metadata !61, metadata !"isCIMydata", i32 36, i64 1, i64 32, i64 7, i32 0, metadata !83} ; [ DW_TAG_member ]
!89 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"self", i32 609, i64 32, i64 32, i64 192, i32 0, metadata !90} ; [ DW_TAG_member ]
!90 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !91} ; [ DW_TAG_pointer_type ]
-!91 = metadata !{i32 786451, metadata !152, metadata !40, metadata !"MyWork", i32 36, i64 384, i64 32, i32 0, i32 0, i32 0, metadata !92, i32 16, i32 0} ; [ DW_TAG_structure_type ]
+!91 = metadata !{i32 786451, metadata !152, metadata !40, metadata !"MyWork", i32 36, i64 384, i64 32, i32 0, i32 0, null, metadata !92, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [MyWork] [line 36, size 384, align 32, offset 0] [def] [from ]
!92 = metadata !{metadata !93, metadata !98, metadata !101, metadata !107, metadata !123}
!93 = metadata !{i32 786460, metadata !152, metadata !91, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !94} ; [ DW_TAG_inheritance ]
-!94 = metadata !{i32 786451, metadata !157, metadata !40, metadata !"twork", i32 43, i64 32, i64 32, i32 0, i32 0, i32 0, metadata !96, i32 16, i32 0} ; [ DW_TAG_structure_type ]
+!94 = metadata !{i32 786451, metadata !157, metadata !40, metadata !"twork", i32 43, i64 32, i64 32, i32 0, i32 0, null, metadata !96, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [twork] [line 43, size 32, align 32, offset 0] [def] [from ]
!95 = metadata !{i32 786473, metadata !157} ; [ DW_TAG_file_type ]
!96 = metadata !{metadata !97}
!97 = metadata !{i32 786460, metadata !94, null, metadata !95, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !64} ; [ DW_TAG_inheritance ]
@@ -197,23 +198,23 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
!100 = metadata !{i32 786468, null, metadata !0, metadata !"long long unsigned int", i32 0, i64 64, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
!101 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"_library", i32 39, i64 32, i64 32, i64 96, i32 1, metadata !102, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
!102 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !103} ; [ DW_TAG_pointer_type ]
-!103 = metadata !{i32 786451, metadata !158, metadata !40, metadata !"MyLibrary2", i32 22, i64 32, i64 32, i32 0, i32 0, i32 0, metadata !105, i32 16, i32 0} ; [ DW_TAG_structure_type ]
+!103 = metadata !{i32 786451, metadata !158, metadata !40, metadata !"MyLibrary2", i32 22, i64 32, i64 32, i32 0, i32 0, null, metadata !105, i32 16, null, null, null} ; [ DW_TAG_structure_type ] [MyLibrary2] [line 22, size 32, align 32, offset 0] [def] [from ]
!104 = metadata !{i32 786473, metadata !158} ; [ DW_TAG_file_type ]
!105 = metadata !{metadata !106}
!106 = metadata !{i32 786460, metadata !103, null, metadata !104, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !64} ; [ DW_TAG_inheritance ]
!107 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"_bounds", i32 40, i64 128, i64 32, i64 128, i32 1, metadata !108, metadata !"", metadata !"", metadata !"", i32 0} ; [ DW_TAG_member ]
!108 = metadata !{i32 786454, metadata !153, metadata !0, metadata !"CR", i32 33, i64 0, i64 0, i64 0, i32 0, metadata !109} ; [ DW_TAG_typedef ]
-!109 = metadata !{i32 786451, metadata !156, metadata !0, metadata !"CR", i32 29, i64 128, i64 32, i32 0, i32 0, i32 0, metadata !110, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!109 = metadata !{i32 786451, metadata !156, metadata !0, metadata !"CR", i32 29, i64 128, i64 32, i32 0, i32 0, null, metadata !110, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [CR] [line 29, size 128, align 32, offset 0] [def] [from ]
!110 = metadata !{metadata !111, metadata !117}
!111 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"origin", i32 30, i64 64, i64 32, i64 0, i32 0, metadata !112} ; [ DW_TAG_member ]
!112 = metadata !{i32 786454, metadata !156, metadata !0, metadata !"CP", i32 17, i64 0, i64 0, i64 0, i32 0, metadata !113} ; [ DW_TAG_typedef ]
-!113 = metadata !{i32 786451, metadata !156, metadata !0, metadata !"CP", i32 13, i64 64, i64 32, i32 0, i32 0, i32 0, metadata !114, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!113 = metadata !{i32 786451, metadata !156, metadata !0, metadata !"CP", i32 13, i64 64, i64 32, i32 0, i32 0, null, metadata !114, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [CP] [line 13, size 64, align 32, offset 0] [def] [from ]
!114 = metadata !{metadata !115, metadata !116}
!115 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"x", i32 14, i64 32, i64 32, i64 0, i32 0, metadata !76} ; [ DW_TAG_member ]
!116 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"y", i32 15, i64 32, i64 32, i64 32, i32 0, metadata !76} ; [ DW_TAG_member ]
!117 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"size", i32 31, i64 64, i64 32, i64 64, i32 0, metadata !118} ; [ DW_TAG_member ]
!118 = metadata !{i32 786454, metadata !156, metadata !0, metadata !"Size", i32 25, i64 0, i64 0, i64 0, i32 0, metadata !119} ; [ DW_TAG_typedef ]
-!119 = metadata !{i32 786451, metadata !156, metadata !0, metadata !"Size", i32 21, i64 64, i64 32, i32 0, i32 0, i32 0, metadata !120, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!119 = metadata !{i32 786451, metadata !156, metadata !0, metadata !"Size", i32 21, i64 64, i64 32, i32 0, i32 0, null, metadata !120, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Size] [line 21, size 64, align 32, offset 0] [def] [from ]
!120 = metadata !{metadata !121, metadata !122}
!121 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"width", i32 22, i64 32, i64 32, i64 0, i32 0, metadata !76} ; [ DW_TAG_member ]
!122 = metadata !{i32 786445, metadata !156, metadata !77, metadata !"height", i32 23, i64 32, i64 32, i64 32, i32 0, metadata !76} ; [ DW_TAG_member ]
@@ -221,7 +222,7 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
!124 = metadata !{i32 786445, metadata !152, metadata !24, metadata !"semi", i32 609, i64 32, i64 32, i64 224, i32 0, metadata !125} ; [ DW_TAG_member ]
!125 = metadata !{i32 786454, metadata !152, metadata !0, metadata !"d_t", i32 35, i64 0, i64 0, i64 0, i32 0, metadata !126} ; [ DW_TAG_typedef ]
!126 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !127} ; [ DW_TAG_pointer_type ]
-!127 = metadata !{i32 786451, metadata !159, metadata !0, metadata !"my_struct", i32 49, i64 0, i64 0, i32 0, i32 4, i32 0, null, i32 0, i32 0} ; [ DW_TAG_structure_type ]
+!127 = metadata !{i32 786451, metadata !159, metadata !0, metadata !"my_struct", i32 49, i64 0, i64 0, i32 0, i32 4, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [my_struct] [line 49, size 0, align 0, offset 0] [decl] [from ]
!128 = metadata !{i32 786473, metadata !159} ; [ DW_TAG_file_type ]
!129 = metadata !{i32 609, i32 144, metadata !23, null}
!130 = metadata !{i32 786689, metadata !23, metadata !"loadedMydata", metadata !24, i32 33555041, metadata !59, i32 0, null} ; [ DW_TAG_arg_variable ]
@@ -256,3 +257,4 @@ define hidden void @foobar_func_block_invoke_0(i8* %.block_descriptor, %0* %load
!159 = metadata !{metadata !"header15.h", metadata !"/Volumes/Sandbox/llvm"}
!160 = metadata !{metadata !"header.h", metadata !"/Volumes/Sandbox/llvm"}
!161 = metadata !{metadata !"header2.h", metadata !"/Volumes/Sandbox/llvm"}
+!162 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/debug-info-branch-folding.ll b/test/CodeGen/ARM/debug-info-branch-folding.ll
index 052fd22..8505f53 100644
--- a/test/CodeGen/ARM/debug-info-branch-folding.ll
+++ b/test/CodeGen/ARM/debug-info-branch-folding.ll
@@ -38,23 +38,25 @@ declare i32 @printf(i8* nocapture, ...) nounwind
declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+!llvm.module.flags = !{!56}
+
!0 = metadata !{i32 786478, metadata !54, null, metadata !"test0001", metadata !"test0001", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, <4 x float> (float)* @test0001, null, null, metadata !51, i32 0} ; [ DW_TAG_subprogram ]
!1 = metadata !{i32 786473, metadata !54} ; [ DW_TAG_file_type ]
!2 = metadata !{i32 786449, metadata !54, i32 12, metadata !"clang version 3.0 (trunk 129915)", i1 true, metadata !"", i32 0, metadata !17, metadata !17, metadata !50, null, null, null} ; [ DW_TAG_compile_unit ]
!3 = metadata !{i32 786453, metadata !54, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
!4 = metadata !{metadata !5}
!5 = metadata !{i32 786454, metadata !54, metadata !2, metadata !"v4f32", i32 14, i64 0, i64 0, i64 0, i32 0, metadata !6} ; [ DW_TAG_typedef ]
-!6 = metadata !{i32 786433, metadata !54, metadata !2, metadata !"", i32 0, i64 128, i64 128, i32 0, i32 0, metadata !7, metadata !8, i32 0, i32 0} ; [ DW_TAG_vector_type ]
+!6 = metadata !{i32 786433, metadata !54, metadata !2, metadata !"", i32 0, i64 128, i64 128, i32 0, i32 0, metadata !7, metadata !8, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [from float]
!7 = metadata !{i32 786468, null, metadata !2, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
!8 = metadata !{metadata !9}
!9 = metadata !{i32 786465, i64 0, i64 4} ; [ DW_TAG_subrange_type ]
-!10 = metadata !{i32 786478, metadata !54, null, metadata !"main", metadata !"main", metadata !"", i32 59, metadata !11, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32, i8**, i1)* @main, null, null, metadata !52, i32 0} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 786453, metadata !54, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!10 = metadata !{i32 786478, metadata !54, null, metadata !"main", metadata !"main", metadata !"", i32 59, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**, i1)* @main, null, null, metadata !52, i32 0} ; [ DW_TAG_subprogram ] [line 59] [def] [scope 0] [main]
+!11 = metadata !{i32 786453, metadata !54, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!12 = metadata !{metadata !13}
!13 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786478, metadata !55, null, metadata !"printFV", metadata !"printFV", metadata !"", i32 41, metadata !16, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, null, null, null, metadata !53, i32 0} ; [ DW_TAG_subprogram ]
+!14 = metadata !{i32 786478, metadata !55, null, metadata !"printFV", metadata !"printFV", metadata !"", i32 41, metadata !16, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !53, i32 0} ; [ DW_TAG_subprogram ] [line 41] [local] [def] [scope 0] [printFV]
!15 = metadata !{i32 786473, metadata !55} ; [ DW_TAG_file_type ]
-!16 = metadata !{i32 786453, metadata !55, metadata !15, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !17, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!16 = metadata !{i32 786453, metadata !55, metadata !15, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !17, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!17 = metadata !{null}
!18 = metadata !{i32 786689, metadata !0, metadata !"a", metadata !1, i32 16777219, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
!19 = metadata !{i32 786689, metadata !10, metadata !"argc", metadata !1, i32 16777275, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
@@ -94,3 +96,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!53 = metadata !{metadata !30}
!54 = metadata !{metadata !"build2.c", metadata !"/private/tmp"}
!55 = metadata !{metadata !"/Volumes/Lalgate/work/llvm/projects/llvm-test/SingleSource/UnitTests/Vector/helpers.h", metadata !"/private/tmp"}
+!56 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/debug-info-d16-reg.ll b/test/CodeGen/ARM/debug-info-d16-reg.ll
index 11631ae..30a3e2d 100644
--- a/test/CodeGen/ARM/debug-info-d16-reg.ll
+++ b/test/CodeGen/ARM/debug-info-d16-reg.ll
@@ -57,11 +57,12 @@ entry:
declare i32 @puts(i8* nocapture) nounwind
!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!48}
!0 = metadata !{i32 786478, metadata !46, metadata !1, metadata !"printer", metadata !"printer", metadata !"printer", i32 12, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i8*, double, i8)* @printer, null, null, metadata !43, i32 12} ; [ DW_TAG_subprogram ]
!1 = metadata !{i32 786473, metadata !46} ; [ DW_TAG_file_type ]
!2 = metadata !{i32 786449, metadata !46, i32 1, metadata !"(LLVM build 00)", i1 true, metadata !"", i32 0, metadata !47, metadata !47, metadata !42, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !46, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !46, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!4 = metadata !{metadata !5, metadata !6, metadata !7, metadata !8}
!5 = metadata !{i32 786468, metadata !46, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
!6 = metadata !{i32 786447, metadata !46, metadata !1, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
@@ -69,7 +70,7 @@ declare i32 @puts(i8* nocapture) nounwind
!8 = metadata !{i32 786468, metadata !46, metadata !1, metadata !"unsigned char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ]
!9 = metadata !{i32 786478, metadata !46, metadata !1, metadata !"inlineprinter", metadata !"inlineprinter", metadata !"inlineprinter", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i8*, double, i8)* @inlineprinter, null, null, metadata !44, i32 5} ; [ DW_TAG_subprogram ]
!10 = metadata !{i32 786478, metadata !46, metadata !1, metadata !"main", metadata !"main", metadata !"main", i32 18, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !45, i32 18} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 786453, metadata !46, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!11 = metadata !{i32 786453, metadata !46, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!12 = metadata !{metadata !5, metadata !5, metadata !13}
!13 = metadata !{i32 786447, metadata !46, metadata !1, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ]
!14 = metadata !{i32 786447, metadata !46, metadata !1, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !15} ; [ DW_TAG_pointer_type ]
@@ -106,3 +107,4 @@ declare i32 @puts(i8* nocapture) nounwind
!45 = metadata !{metadata !22, metadata !23, metadata !24}
!46 = metadata !{metadata !"a.c", metadata !"/tmp/"}
!47 = metadata !{i32 0}
+!48 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/debug-info-qreg.ll b/test/CodeGen/ARM/debug-info-qreg.ll
index af61f6c..ee515fd5 100644
--- a/test/CodeGen/ARM/debug-info-qreg.ll
+++ b/test/CodeGen/ARM/debug-info-qreg.ll
@@ -36,24 +36,25 @@ declare i32 @printf(i8* nocapture, ...) nounwind
declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!56}
-!0 = metadata !{i32 786478, metadata !54, metadata !1, metadata !"test0001", metadata !"test0001", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, <4 x float> (float)* @test0001, null, null, metadata !51, i32 3} ; [ DW_TAG_subprogram ]
+!0 = metadata !{i32 786478, metadata !54, metadata !1, metadata !"test0001", metadata !"test0001", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, <4 x float> (float)* @test0001, null, null, metadata !51, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [test0001]
!1 = metadata !{i32 786473, metadata !54} ; [ DW_TAG_file_type ]
!2 = metadata !{i32 786449, metadata !54, i32 12, metadata !"clang version 3.0 (trunk 129915)", i1 true, metadata !"", i32 0, metadata !17, metadata !17, metadata !50, null, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !54, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !54, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!4 = metadata !{metadata !5}
!5 = metadata !{i32 786454, metadata !54, metadata !2, metadata !"v4f32", i32 14, i64 0, i64 0, i64 0, i32 0, metadata !6} ; [ DW_TAG_typedef ]
-!6 = metadata !{i32 786433, metadata !2, metadata !"", metadata !2, i32 0, i64 128, i64 128, i32 0, i32 0, metadata !7, metadata !8, i32 0, i32 0} ; [ DW_TAG_vector_type ]
+!6 = metadata !{i32 786433, metadata !2, null, metadata !2, i32 0, i64 128, i64 128, i32 0, i32 0, metadata !7, metadata !8, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 128, align 128, offset 0] [from float]
!7 = metadata !{i32 786468, null, metadata !2, metadata !"float", i32 0, i64 32, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
!8 = metadata !{metadata !9}
!9 = metadata !{i32 786465, i64 0, i64 4} ; [ DW_TAG_subrange_type ]
-!10 = metadata !{i32 786478, metadata !54, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 59, metadata !11, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !52, i32 59} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 786453, metadata !54, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!10 = metadata !{i32 786478, metadata !54, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 59, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !52, i32 59} ; [ DW_TAG_subprogram ] [line 59] [def] [main]
+!11 = metadata !{i32 786453, metadata !54, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!12 = metadata !{metadata !13}
!13 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786478, metadata !55, metadata !15, metadata !"printFV", metadata !"printFV", metadata !"", i32 41, metadata !16, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, null, null, null, metadata !53, i32 41} ; [ DW_TAG_subprogram ]
+!14 = metadata !{i32 786478, metadata !55, metadata !15, metadata !"printFV", metadata !"printFV", metadata !"", i32 41, metadata !16, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !53, i32 41} ; [ DW_TAG_subprogram ] [line 41] [local] [def] [printFV]
!15 = metadata !{i32 786473, metadata !55} ; [ DW_TAG_file_type ]
-!16 = metadata !{i32 786453, metadata !55, metadata !15, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !17, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!16 = metadata !{i32 786453, metadata !55, metadata !15, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !17, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!17 = metadata !{null}
!18 = metadata !{i32 786689, metadata !0, metadata !"a", metadata !1, i32 16777219, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
!19 = metadata !{i32 786689, metadata !10, metadata !"argc", metadata !1, i32 16777275, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
@@ -70,7 +71,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!30 = metadata !{i32 786689, metadata !14, metadata !"F", metadata !15, i32 16777257, metadata !31, i32 0, null} ; [ DW_TAG_arg_variable ]
!31 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !32} ; [ DW_TAG_pointer_type ]
!32 = metadata !{i32 786454, metadata !55, metadata !2, metadata !"FV", i32 25, i64 0, i64 0, i64 0, i32 0, metadata !33} ; [ DW_TAG_typedef ]
-!33 = metadata !{i32 786455, metadata !55, metadata !2, metadata !"", i32 22, i64 128, i64 128, i64 0, i32 0, i32 0, metadata !34, i32 0, i32 0} ; [ DW_TAG_union_type ]
+!33 = metadata !{i32 786455, metadata !55, metadata !2, metadata !"", i32 22, i64 128, i64 128, i64 0, i32 0, i32 0, metadata !34, i32 0, null} ; [ DW_TAG_union_type ]
!34 = metadata !{metadata !35, metadata !37}
!35 = metadata !{i32 786445, metadata !55, metadata !15, metadata !"V", i32 23, i64 128, i64 128, i64 0, i32 0, metadata !36} ; [ DW_TAG_member ]
!36 = metadata !{i32 786454, metadata !55, metadata !2, metadata !"v4sf", i32 3, i64 0, i64 0, i64 0, i32 0, metadata !6} ; [ DW_TAG_typedef ]
@@ -93,3 +94,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!53 = metadata !{metadata !30}
!54 = metadata !{metadata !"build2.c", metadata !"/private/tmp"}
!55 = metadata !{metadata !"/Volumes/Lalgate/work/llvm/projects/llvm-test/SingleSource/UnitTests/Vector/helpers.h", metadata !"/private/tmp"}
+!56 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/debug-info-s16-reg.ll b/test/CodeGen/ARM/debug-info-s16-reg.ll
index 83e7dac..e92d977 100644
--- a/test/CodeGen/ARM/debug-info-s16-reg.ll
+++ b/test/CodeGen/ARM/debug-info-s16-reg.ll
@@ -62,15 +62,16 @@ declare i32 @puts(i8* nocapture) nounwind optsize
declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!53}
-!0 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"inlineprinter", metadata !"inlineprinter", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i8*, float, i8)* @inlineprinter, null, null, metadata !48, i32 5} ; [ DW_TAG_subprogram ]
+!0 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"inlineprinter", metadata !"inlineprinter", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i8*, float, i8)* @inlineprinter, null, null, metadata !48, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [inlineprinter]
!1 = metadata !{i32 786473, metadata !51} ; [ DW_TAG_file_type ]
!2 = metadata !{i32 786449, metadata !51, i32 12, metadata !"clang version 3.0 (trunk 129915)", i1 true, metadata !"", i32 0, metadata !52, metadata !52, metadata !47, null, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !51, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !51, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!4 = metadata !{metadata !5}
!5 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"printer", metadata !"printer", metadata !"", i32 12, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i8*, float, i8)* @printer, null, null, metadata !49, i32 12} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 18, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !50, i32 18} ; [ DW_TAG_subprogram ]
+!6 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"printer", metadata !"printer", metadata !"", i32 12, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i8*, float, i8)* @printer, null, null, metadata !49, i32 12} ; [ DW_TAG_subprogram ] [line 12] [def] [printer]
+!7 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 18, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !50, i32 18} ; [ DW_TAG_subprogram ] [line 18] [def] [main]
!8 = metadata !{i32 786689, metadata !0, metadata !"ptr", metadata !1, i32 16777220, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
!9 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
!10 = metadata !{i32 786689, metadata !0, metadata !"val", metadata !1, i32 33554436, metadata !11, i32 0, null} ; [ DW_TAG_arg_variable ]
@@ -116,3 +117,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!50 = metadata !{metadata !17, metadata !18, metadata !22}
!51 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
!52 = metadata !{i32 0}
+!53 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/debug-info-sreg2.ll b/test/CodeGen/ARM/debug-info-sreg2.ll
index cc2e831..854fcab 100644
--- a/test/CodeGen/ARM/debug-info-sreg2.ll
+++ b/test/CodeGen/ARM/debug-info-sreg2.ll
@@ -40,11 +40,12 @@ declare float @_Z2f3f(float) optsize
declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!20}
!0 = metadata !{i32 786449, metadata !18, i32 4, metadata !"clang version 3.0 (trunk 130845)", i1 true, metadata !"", i32 0, metadata !19, metadata !19, metadata !16, null, null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !18, metadata !2, metadata !"foo", metadata !"foo", metadata !"_Z3foov", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, void ()* @_Z3foov, null, null, metadata !17, i32 5} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 786478, metadata !18, metadata !2, metadata !"foo", metadata !"foo", metadata !"_Z3foov", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @_Z3foov, null, null, metadata !17, i32 5} ; [ DW_TAG_subprogram ] [line 5] [def] [foo]
!2 = metadata !{i32 786473, metadata !18} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !18, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !18, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!4 = metadata !{null}
!5 = metadata !{i32 786688, metadata !6, metadata !"k", metadata !2, i32 6, metadata !7, i32 0, null} ; [ DW_TAG_auto_variable ]
!6 = metadata !{i32 786443, metadata !18, metadata !1, i32 5, i32 12, i32 0} ; [ DW_TAG_lexical_block ]
@@ -61,3 +62,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!17 = metadata !{metadata !5, metadata !8}
!18 = metadata !{metadata !"k.cc", metadata !"/private/tmp"}
!19 = metadata !{i32 0}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/ARM/divmod.ll b/test/CodeGen/ARM/divmod.ll
index 06d6172..7be0c79 100644
--- a/test/CodeGen/ARM/divmod.ll
+++ b/test/CodeGen/ARM/divmod.ll
@@ -60,7 +60,7 @@ bb:
%3 = load i32* @tabsize, align 4
%4 = srem i32 %cols, %3
%5 = sdiv i32 %cols, %3
- %6 = tail call i32 @llvm.objectsize.i32(i8* null, i1 false)
+ %6 = tail call i32 @llvm.objectsize.i32.p0i8(i8* null, i1 false)
%7 = tail call i8* @__memset_chk(i8* null, i32 9, i32 %5, i32 %6) nounwind
br label %bb1
@@ -71,7 +71,7 @@ bb1:
ret void
}
-declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readnone
+declare i32 @llvm.objectsize.i32.p0i8(i8*, i1) nounwind readnone
declare i8* @__memset_chk(i8*, i32, i32, i32) nounwind
; rdar://11714607
diff --git a/test/CodeGen/ARM/fast-isel-align.ll b/test/CodeGen/ARM/fast-isel-align.ll
index 4e28a10..9c9a188 100644
--- a/test/CodeGen/ARM/fast-isel-align.ll
+++ b/test/CodeGen/ARM/fast-isel-align.ll
@@ -1,22 +1,22 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
-; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
-; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
-
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-linux-gnueabi | FileCheck %s --check-prefix=THUMB
-; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
-; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=thumbv7-linux-gnueabi | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
-
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-nacl | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-nacl | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
-
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-unknown | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-unknown-unknown | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
-; RUN: llc < %s -O0 -arm-no-strict-align -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-unknown | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -arm-no-strict-align -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-unknown-unknown | FileCheck %s --check-prefix=THUMB
-; RUN: llc < %s -O0 -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-unknown | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
-; RUN: llc < %s -O0 -relocation-model=dynamic-no-pic -mtriple=thumbv7-unknown-unknown | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
+; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
+
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
+; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=thumbv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
+
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-nacl -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-nacl -verify-machineinstrs | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
+
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
+; RUN: llc < %s -O0 -arm-no-strict-align -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -arm-no-strict-align -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -relocation-model=dynamic-no-pic -mtriple=armv7-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
+; RUN: llc < %s -O0 -relocation-model=dynamic-no-pic -mtriple=thumbv7-unknown-unknown -verify-machineinstrs | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
; Check unaligned stores
%struct.anon = type <{ float }>
diff --git a/test/CodeGen/ARM/fast-isel-binary.ll b/test/CodeGen/ARM/fast-isel-binary.ll
index 3159627..e1a2a4f 100644
--- a/test/CodeGen/ARM/fast-isel-binary.ll
+++ b/test/CodeGen/ARM/fast-isel-binary.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -fast-isel-abort -verify-machineinstrs -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -verify-machineinstrs -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -verify-machineinstrs -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
; Test add with non-legal types
diff --git a/test/CodeGen/ARM/fast-isel-br-phi.ll b/test/CodeGen/ARM/fast-isel-br-phi.ll
index a0aba69..3b9d465 100644
--- a/test/CodeGen/ARM/fast-isel-br-phi.ll
+++ b/test/CodeGen/ARM/fast-isel-br-phi.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios
; This test ensures HandlePHINodesInSuccessorBlocks() is able to promote basic
; non-legal integer types (i.e., i1, i8, i16).
diff --git a/test/CodeGen/ARM/fast-isel-call.ll b/test/CodeGen/ARM/fast-isel-call.ll
index d10a381..917a15d 100644
--- a/test/CodeGen/ARM/fast-isel-call.ll
+++ b/test/CodeGen/ARM/fast-isel-call.ll
@@ -1,12 +1,14 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=ARM-LONG
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -arm-long-calls | FileCheck %s --check-prefix=ARM-LONG
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=THUMB-LONG
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -mattr=-vfp2 | FileCheck %s --check-prefix=ARM-NOVFP
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -mattr=-vfp2 | FileCheck %s --check-prefix=ARM-NOVFP
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -mattr=-vfp2 | FileCheck %s --check-prefix=THUMB-NOVFP
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=ARM-LONG
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -arm-long-calls | FileCheck %s --check-prefix=ARM-LONG
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=THUMB-LONG
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -mattr=-vfp2 | FileCheck %s --check-prefix=ARM-NOVFP
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -mattr=-vfp2 | FileCheck %s --check-prefix=ARM-NOVFP
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -mattr=-vfp2 | FileCheck %s --check-prefix=THUMB-NOVFP
+
+; XFAIL: vg_leak
; Note that some of these tests assume that relocations are either
; movw/movt or constant pool loads. Different platforms will select
diff --git a/test/CodeGen/ARM/fast-isel-cmp-imm.ll b/test/CodeGen/ARM/fast-isel-cmp-imm.ll
index 45ef4ed..55baf48 100644
--- a/test/CodeGen/ARM/fast-isel-cmp-imm.ll
+++ b/test/CodeGen/ARM/fast-isel-cmp-imm.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=THUMB
define void @t1a(float %a) uwtable ssp {
entry:
diff --git a/test/CodeGen/ARM/fast-isel-conversion.ll b/test/CodeGen/ARM/fast-isel-conversion.ll
index e40891a..5983493 100644
--- a/test/CodeGen/ARM/fast-isel-conversion.ll
+++ b/test/CodeGen/ARM/fast-isel-conversion.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -verify-machineinstrs -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -verify-machineinstrs -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -verify-machineinstrs -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
; Test sitofp
diff --git a/test/CodeGen/ARM/fast-isel-ext.ll b/test/CodeGen/ARM/fast-isel-ext.ll
index 15d0d3c..de0dd19 100644
--- a/test/CodeGen/ARM/fast-isel-ext.ll
+++ b/test/CodeGen/ARM/fast-isel-ext.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=v7
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=v7
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv4t-apple-ios | FileCheck %s --check-prefix=prev6
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv4t-linux-gnueabi | FileCheck %s --check-prefix=prev6
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv5-apple-ios | FileCheck %s --check-prefix=prev6
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv5-linux-gnueabi | FileCheck %s --check-prefix=prev6
-; RUN: llc < %s -O0 -fast-isel-abort -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=v7
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=v7
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=v7
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv4t-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=prev6
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv4t-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=prev6
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv5-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=prev6
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=armv5-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=prev6
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=v7
; Can't test pre-ARMv6 Thumb because ARM FastISel currently only supports
; Thumb2. The ARMFastISel::ARMEmitIntExt code should work for Thumb by always
diff --git a/test/CodeGen/ARM/fast-isel-icmp.ll b/test/CodeGen/ARM/fast-isel-icmp.ll
index 3dc1109..85f449e 100644
--- a/test/CodeGen/ARM/fast-isel-icmp.ll
+++ b/test/CodeGen/ARM/fast-isel-icmp.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=THUMB
define i32 @icmp_i16_signed(i16 %a, i16 %b) nounwind {
entry:
diff --git a/test/CodeGen/ARM/fast-isel-intrinsic.ll b/test/CodeGen/ARM/fast-isel-intrinsic.ll
index 572ac3a..b08b72b 100644
--- a/test/CodeGen/ARM/fast-isel-intrinsic.ll
+++ b/test/CodeGen/ARM/fast-isel-intrinsic.ll
@@ -1,9 +1,11 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=ARM-LONG
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -arm-long-calls | FileCheck %s --check-prefix=ARM-LONG
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -arm-long-calls | FileCheck %s --check-prefix=THUMB-LONG
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -arm-long-calls -verify-machineinstrs | FileCheck %s --check-prefix=ARM-LONG
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -arm-long-calls -verify-machineinstrs | FileCheck %s --check-prefix=ARM-LONG
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -arm-long-calls -verify-machineinstrs | FileCheck %s --check-prefix=THUMB-LONG
+
+; XFAIL: vg_leak
; Note that some of these tests assume that relocations are either
; movw/movt or constant pool loads. Different platforms will select
diff --git a/test/CodeGen/ARM/fast-isel-ldr-str-thumb-neg-index.ll b/test/CodeGen/ARM/fast-isel-ldr-str-thumb-neg-index.ll
index 2a88678..d9c9cc4 100644
--- a/test/CodeGen/ARM/fast-isel-ldr-str-thumb-neg-index.ll
+++ b/test/CodeGen/ARM/fast-isel-ldr-str-thumb-neg-index.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=THUMB
define i32 @t1(i32* nocapture %ptr) nounwind readonly {
entry:
; THUMB: t1
%add.ptr = getelementptr inbounds i32* %ptr, i32 -1
- %0 = load i32* %add.ptr, align 4, !tbaa !0
+ %0 = load i32* %add.ptr, align 4
; THUMB: ldr r{{[0-9]}}, [r0, #-4]
ret i32 %0
}
@@ -13,7 +13,7 @@ define i32 @t2(i32* nocapture %ptr) nounwind readonly {
entry:
; THUMB: t2
%add.ptr = getelementptr inbounds i32* %ptr, i32 -63
- %0 = load i32* %add.ptr, align 4, !tbaa !0
+ %0 = load i32* %add.ptr, align 4
; THUMB: ldr r{{[0-9]}}, [r0, #-252]
ret i32 %0
}
@@ -22,7 +22,7 @@ define i32 @t3(i32* nocapture %ptr) nounwind readonly {
entry:
; THUMB: t3
%add.ptr = getelementptr inbounds i32* %ptr, i32 -64
- %0 = load i32* %add.ptr, align 4, !tbaa !0
+ %0 = load i32* %add.ptr, align 4
; THUMB: ldr r{{[0-9]}}, [r0]
ret i32 %0
}
@@ -31,7 +31,7 @@ define zeroext i16 @t4(i16* nocapture %ptr) nounwind readonly {
entry:
; THUMB: t4
%add.ptr = getelementptr inbounds i16* %ptr, i32 -1
- %0 = load i16* %add.ptr, align 2, !tbaa !3
+ %0 = load i16* %add.ptr, align 2
; THUMB: ldrh r{{[0-9]}}, [r0, #-2]
ret i16 %0
}
@@ -40,7 +40,7 @@ define zeroext i16 @t5(i16* nocapture %ptr) nounwind readonly {
entry:
; THUMB: t5
%add.ptr = getelementptr inbounds i16* %ptr, i32 -127
- %0 = load i16* %add.ptr, align 2, !tbaa !3
+ %0 = load i16* %add.ptr, align 2
; THUMB: ldrh r{{[0-9]}}, [r0, #-254]
ret i16 %0
}
@@ -49,7 +49,7 @@ define zeroext i16 @t6(i16* nocapture %ptr) nounwind readonly {
entry:
; THUMB: t6
%add.ptr = getelementptr inbounds i16* %ptr, i32 -128
- %0 = load i16* %add.ptr, align 2, !tbaa !3
+ %0 = load i16* %add.ptr, align 2
; THUMB: ldrh r{{[0-9]}}, [r0]
ret i16 %0
}
@@ -58,7 +58,7 @@ define zeroext i8 @t7(i8* nocapture %ptr) nounwind readonly {
entry:
; THUMB: t7
%add.ptr = getelementptr inbounds i8* %ptr, i32 -1
- %0 = load i8* %add.ptr, align 1, !tbaa !1
+ %0 = load i8* %add.ptr, align 1
; THUMB: ldrb r{{[0-9]}}, [r0, #-1]
ret i8 %0
}
@@ -67,7 +67,7 @@ define zeroext i8 @t8(i8* nocapture %ptr) nounwind readonly {
entry:
; THUMB: t8
%add.ptr = getelementptr inbounds i8* %ptr, i32 -255
- %0 = load i8* %add.ptr, align 1, !tbaa !1
+ %0 = load i8* %add.ptr, align 1
; THUMB: ldrb r{{[0-9]}}, [r0, #-255]
ret i8 %0
}
@@ -76,7 +76,7 @@ define zeroext i8 @t9(i8* nocapture %ptr) nounwind readonly {
entry:
; THUMB: t9
%add.ptr = getelementptr inbounds i8* %ptr, i32 -256
- %0 = load i8* %add.ptr, align 1, !tbaa !1
+ %0 = load i8* %add.ptr, align 1
; THUMB: ldrb r{{[0-9]}}, [r0]
ret i8 %0
}
@@ -85,7 +85,7 @@ define void @t10(i32* nocapture %ptr) nounwind {
entry:
; THUMB: t10
%add.ptr = getelementptr inbounds i32* %ptr, i32 -1
- store i32 0, i32* %add.ptr, align 4, !tbaa !0
+ store i32 0, i32* %add.ptr, align 4
; THUMB: str r{{[0-9]}}, [r0, #-4]
ret void
}
@@ -94,7 +94,7 @@ define void @t11(i32* nocapture %ptr) nounwind {
entry:
; THUMB: t11
%add.ptr = getelementptr inbounds i32* %ptr, i32 -63
- store i32 0, i32* %add.ptr, align 4, !tbaa !0
+ store i32 0, i32* %add.ptr, align 4
; THUMB: str r{{[0-9]}}, [r0, #-252]
ret void
}
@@ -103,7 +103,7 @@ define void @t12(i32* nocapture %ptr) nounwind {
entry:
; THUMB: t12
%add.ptr = getelementptr inbounds i32* %ptr, i32 -64
- store i32 0, i32* %add.ptr, align 4, !tbaa !0
+ store i32 0, i32* %add.ptr, align 4
; THUMB: str r{{[0-9]}}, [r0]
ret void
}
@@ -112,7 +112,7 @@ define void @t13(i16* nocapture %ptr) nounwind {
entry:
; THUMB: t13
%add.ptr = getelementptr inbounds i16* %ptr, i32 -1
- store i16 0, i16* %add.ptr, align 2, !tbaa !3
+ store i16 0, i16* %add.ptr, align 2
; THUMB: strh r{{[0-9]}}, [r0, #-2]
ret void
}
@@ -121,7 +121,7 @@ define void @t14(i16* nocapture %ptr) nounwind {
entry:
; THUMB: t14
%add.ptr = getelementptr inbounds i16* %ptr, i32 -127
- store i16 0, i16* %add.ptr, align 2, !tbaa !3
+ store i16 0, i16* %add.ptr, align 2
; THUMB: strh r{{[0-9]}}, [r0, #-254]
ret void
}
@@ -130,7 +130,7 @@ define void @t15(i16* nocapture %ptr) nounwind {
entry:
; THUMB: t15
%add.ptr = getelementptr inbounds i16* %ptr, i32 -128
- store i16 0, i16* %add.ptr, align 2, !tbaa !3
+ store i16 0, i16* %add.ptr, align 2
; THUMB: strh r{{[0-9]}}, [r0]
ret void
}
@@ -139,7 +139,7 @@ define void @t16(i8* nocapture %ptr) nounwind {
entry:
; THUMB: t16
%add.ptr = getelementptr inbounds i8* %ptr, i32 -1
- store i8 0, i8* %add.ptr, align 1, !tbaa !1
+ store i8 0, i8* %add.ptr, align 1
; THUMB: strb r{{[0-9]}}, [r0, #-1]
ret void
}
@@ -148,7 +148,7 @@ define void @t17(i8* nocapture %ptr) nounwind {
entry:
; THUMB: t17
%add.ptr = getelementptr inbounds i8* %ptr, i32 -255
- store i8 0, i8* %add.ptr, align 1, !tbaa !1
+ store i8 0, i8* %add.ptr, align 1
; THUMB: strb r{{[0-9]}}, [r0, #-255]
ret void
}
@@ -157,12 +157,7 @@ define void @t18(i8* nocapture %ptr) nounwind {
entry:
; THUMB: t18
%add.ptr = getelementptr inbounds i8* %ptr, i32 -256
- store i8 0, i8* %add.ptr, align 1, !tbaa !1
+ store i8 0, i8* %add.ptr, align 1
; THUMB: strb r{{[0-9]}}, [r0]
ret void
}
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !"short", metadata !1}
diff --git a/test/CodeGen/ARM/fast-isel-pic.ll b/test/CodeGen/ARM/fast-isel-pic.ll
index ad0f159..838c103 100644
--- a/test/CodeGen/ARM/fast-isel-pic.ll
+++ b/test/CodeGen/ARM/fast-isel-pic.ll
@@ -2,7 +2,7 @@
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=pic -mtriple=arm-apple-ios | FileCheck %s --check-prefix=ARM
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARMv7
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=pic -mtriple=thumbv7-none-linux-gnueabi | FileCheck %s --check-prefix=THUMB-ELF
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=pic -mtriple=armv7-none-linux-gnueabi | FileCheck %s --check-prefix=ARMv7-ELF
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=pic -mtriple=armv7-none-linux-gnueabi | FileCheck %s --check-prefix=ARMv7-ELF
@g = global i32 0, align 4
@@ -25,6 +25,8 @@ entry:
; ARMv7: add [[reg2]], pc, [[reg2]]
; ARMv7-ELF: LoadGV
; ARMv7-ELF: ldr r[[reg2:[0-9]+]],
+; ARMv7-ELF: .LPC
+; ARMv7-ELF-NEXT: add r[[reg2]], pc
; ARMv7-ELF: ldr r[[reg3:[0-9]+]],
; ARMv7-ELF: ldr r[[reg2]], [r[[reg3]], r[[reg2]]]
%tmp = load i32* @g
@@ -54,6 +56,8 @@ entry:
; ARMv7: ldr r[[reg5]], [r[[reg5]]]
; ARMv7-ELF: LoadIndirectSymbol
; ARMv7-ELF: ldr r[[reg5:[0-9]+]],
+; ARMv7-ELF: .LPC
+; ARMv7-ELF-NEXT: add r[[reg5]], pc
; ARMv7-ELF: ldr r[[reg6:[0-9]+]],
; ARMv7-ELF: ldr r[[reg5]], [r[[reg6]], r[[reg5]]]
%tmp = load i32* @i
diff --git a/test/CodeGen/ARM/fast-isel-ret.ll b/test/CodeGen/ARM/fast-isel-ret.ll
index ba5412c..8a68309 100644
--- a/test/CodeGen/ARM/fast-isel-ret.ll
+++ b/test/CodeGen/ARM/fast-isel-ret.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s
; Sign-extend of i1 currently not supported by fast-isel
;define signext i1 @ret0(i1 signext %a) nounwind uwtable ssp {
diff --git a/test/CodeGen/ARM/fast-isel-select.ll b/test/CodeGen/ARM/fast-isel-select.ll
index bb88814..40f8807 100644
--- a/test/CodeGen/ARM/fast-isel-select.ll
+++ b/test/CodeGen/ARM/fast-isel-select.ll
@@ -1,6 +1,7 @@
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv8-apple-ios | FileCheck %s --check-prefix=THUMB
define i32 @t1(i1 %c) nounwind readnone {
entry:
@@ -39,15 +40,16 @@ define i32 @t3(i1 %c, i32 %a, i32 %b) nounwind readnone {
entry:
; ARM: t3
; ARM: cmp r0, #0
-; ARM: movne r{{[1-9]}}, r{{[1-9]}}
-; ARM: mov r0, r{{[1-9]}}
+; ARM: movne r2, r1
+; ARM: add r0, r2, r1
; THUMB: t3
; THUMB: cmp r0, #0
; THUMB: it ne
-; THUMB: movne r{{[1-9]}}, r{{[1-9]}}
-; THUMB: mov r0, r{{[1-9]}}
+; THUMB: movne r2, r1
+; THUMB: add.w r0, r2, r1
%0 = select i1 %c, i32 %a, i32 %b
- ret i32 %0
+ %1 = add i32 %0, %a
+ ret i32 %1
}
define i32 @t4(i1 %c) nounwind readnone {
diff --git a/test/CodeGen/ARM/fast-isel-shifter.ll b/test/CodeGen/ARM/fast-isel-shifter.ll
index dbb1ce2..eb4b2b2 100644
--- a/test/CodeGen/ARM/fast-isel-shifter.ll
+++ b/test/CodeGen/ARM/fast-isel-shifter.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=ARM
define i32 @shl() nounwind ssp {
entry:
diff --git a/test/CodeGen/ARM/fast-isel-static.ll b/test/CodeGen/ARM/fast-isel-static.ll
index 7d86cb9..93c14a0 100644
--- a/test/CodeGen/ARM/fast-isel-static.ll
+++ b/test/CodeGen/ARM/fast-isel-static.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static -arm-long-calls | FileCheck -check-prefix=LONG %s
-; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static -arm-long-calls | FileCheck -check-prefix=LONG %s
-; RUN: llc < %s -mtriple=thumbv7-apple-darwin -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static | FileCheck -check-prefix=NORM %s
-; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static | FileCheck -check-prefix=NORM %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static -arm-long-calls | FileCheck -check-prefix=CHECK-LONG %s
+; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static -arm-long-calls | FileCheck -check-prefix=CHECK-LONG %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static | FileCheck -check-prefix=CHECK-NORM %s
+; RUN: llc < %s -mtriple=thumbv7-linux-gnueabi -O0 -verify-machineinstrs -fast-isel-abort -relocation-model=static | FileCheck -check-prefix=CHECK-NORM %s
define void @myadd(float* %sum, float* %addend) nounwind {
entry:
diff --git a/test/CodeGen/ARM/fast-isel.ll b/test/CodeGen/ARM/fast-isel.ll
index 0cebc90..5981cab 100644
--- a/test/CodeGen/ARM/fast-isel.ll
+++ b/test/CodeGen/ARM/fast-isel.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=ARM
-; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -verify-machineinstrs | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -verify-machineinstrs | FileCheck %s --check-prefix=THUMB
; Very basic fast-isel functionality.
define i32 @test0(i32 %a, i32 %b) nounwind {
diff --git a/test/CodeGen/ARM/fastisel-gep-promote-before-add.ll b/test/CodeGen/ARM/fastisel-gep-promote-before-add.ll
new file mode 100644
index 0000000..a32ab6d
--- /dev/null
+++ b/test/CodeGen/ARM/fastisel-gep-promote-before-add.ll
@@ -0,0 +1,18 @@
+; fastisel should not fold add with non-pointer bitwidth
+; sext(a) + sext(b) != sext(a + b)
+; RUN: llc -mtriple=armv7-apple-ios %s -O0 -o - | FileCheck %s
+
+define zeroext i8 @gep_promotion(i8* %ptr) nounwind uwtable ssp {
+entry:
+ %ptr.addr = alloca i8*, align 8
+ %add = add i8 64, 64 ; 0x40 + 0x40
+ %0 = load i8** %ptr.addr, align 8
+
+ ; CHECK-LABEL: _gep_promotion:
+ ; CHECK: ldrb {{r[0-9]+}}, {{\[r[0-9]+\]}}
+ %arrayidx = getelementptr inbounds i8* %0, i8 %add
+
+ %1 = load i8* %arrayidx, align 1
+ ret i8 %1
+}
+
diff --git a/test/CodeGen/ARM/fold-stack-adjust.ll b/test/CodeGen/ARM/fold-stack-adjust.ll
new file mode 100644
index 0000000..67fd129
--- /dev/null
+++ b/test/CodeGen/ARM/fold-stack-adjust.ll
@@ -0,0 +1,164 @@
+; RUN: llc -mtriple=thumbv7-apple-darwin-eabi < %s | FileCheck %s
+; RUN: llc -mtriple=thumbv6m-apple-darwin-eabi -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-T1
+; RUN: llc -mtriple=thumbv7-apple-darwin-ios -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-IOS
+
+
+declare void @bar(i8*)
+
+%bigVec = type [2 x double]
+
+@var = global %bigVec zeroinitializer
+
+define void @check_simple() minsize {
+; CHECK-LABEL: check_simple:
+; CHECK: push.w {r7, r8, r9, r10, r11, lr}
+; CHECK-NOT: sub sp, sp,
+; ...
+; CHECK-NOT: add sp, sp,
+; CHECK: pop.w {r0, r1, r2, r3, r11, pc}
+
+; CHECK-T1-LABEL: check_simple:
+; CHECK-T1: push {r3, r4, r5, r6, r7, lr}
+; CHECK-T1: add r7, sp, #16
+; CHECK-T1-NOT: sub sp, sp,
+; ...
+; CHECK-T1-NOT: add sp, sp,
+; CHECK-T1: pop {r0, r1, r2, r3, r7, pc}
+
+ ; iOS always has a frame pointer and messing with the push affects
+ ; how it's set in the prologue. Make sure we get that right.
+; CHECK-IOS-LABEL: check_simple:
+; CHECK-IOS: push {r3, r4, r5, r6, r7, lr}
+; CHECK-NOT: sub sp,
+; CHECK-IOS: add r7, sp, #16
+; CHECK-NOT: sub sp,
+; ...
+; CHECK-NOT: add sp,
+; CHECK-IOS: pop {r3, r4, r5, r6, r7, pc}
+
+ %var = alloca i8, i32 16
+ call void @bar(i8* %var)
+ ret void
+}
+
+define void @check_simple_too_big() minsize {
+; CHECK-LABEL: check_simple_too_big:
+; CHECK: push.w {r11, lr}
+; CHECK: sub sp,
+; ...
+; CHECK: add sp,
+; CHECK: pop.w {r11, pc}
+ %var = alloca i8, i32 64
+ call void @bar(i8* %var)
+ ret void
+}
+
+define void @check_vfp_fold() minsize {
+; CHECK-LABEL: check_vfp_fold:
+; CHECK: push {r[[GLOBREG:[0-9]+]], lr}
+; CHECK: vpush {d6, d7, d8, d9}
+; CHECK-NOT: sub sp,
+; ...
+; CHECK: vldmia r[[GLOBREG]], {d8, d9}
+; ...
+; CHECK-NOT: add sp,
+; CHECK: vpop {d6, d7, d8, d9}
+; CHECK: pop {r[[GLOBREG]], pc}
+
+ ; iOS uses aligned NEON stores here, which is convenient since we
+ ; want to make sure that works too.
+; CHECK-IOS-LABEL: check_vfp_fold:
+; CHECK-IOS: push {r0, r1, r2, r3, r4, r7, lr}
+; CHECK-IOS: sub.w r4, sp, #16
+; CHECK-IOS: bic r4, r4, #15
+; CHECK-IOS: mov sp, r4
+; CHECK-IOS: vst1.64 {d8, d9}, [r4:128]
+; ...
+; CHECK-IOS: add r4, sp, #16
+; CHECK-IOS: vld1.64 {d8, d9}, [r4:128]
+; CHECK-IOS: mov sp, r4
+; CHECK-IOS: pop {r4, r7, pc}
+
+ %var = alloca i8, i32 16
+
+ %tmp = load %bigVec* @var
+ call void @bar(i8* %var)
+ store %bigVec %tmp, %bigVec* @var
+
+ ret void
+}
+
+; This function should use just enough space that the "add sp, sp, ..." could be
+; folded in except that doing so would clobber the value being returned.
+define i64 @check_no_return_clobber() minsize {
+; CHECK-LABEL: check_no_return_clobber:
+; CHECK: push.w {r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NOT: sub sp,
+; ...
+; CHECK: add sp, #40
+; CHECK: pop.w {r11, pc}
+
+ ; Just to keep iOS FileCheck within previous function:
+; CHECK-IOS-LABEL: check_no_return_clobber:
+
+ %var = alloca i8, i32 40
+ call void @bar(i8* %var)
+ ret i64 0
+}
+
+define arm_aapcs_vfpcc double @check_vfp_no_return_clobber() minsize {
+; CHECK-LABEL: check_vfp_no_return_clobber:
+; CHECK: push {r[[GLOBREG:[0-9]+]], lr}
+; CHECK: vpush {d0, d1, d2, d3, d4, d5, d6, d7, d8, d9}
+; CHECK-NOT: sub sp,
+; ...
+; CHECK: add sp, #64
+; CHECK: vpop {d8, d9}
+; CHECK: pop {r[[GLOBREG]], pc}
+
+ %var = alloca i8, i32 64
+
+ %tmp = load %bigVec* @var
+ call void @bar(i8* %var)
+ store %bigVec %tmp, %bigVec* @var
+
+ ret double 1.0
+}
+
+@dbl = global double 0.0
+
+; PR18136: there was a bug determining where the first eligible pop in a
+; basic-block was when the entire block was epilogue code.
+define void @test_fold_point(i1 %tst) minsize {
+; CHECK-LABEL: test_fold_point:
+
+ ; Important to check for beginning of basic block, because if it gets
+ ; if-converted the test is probably no longer checking what it should.
+; CHECK: {{LBB[0-9]+_2}}:
+; CHECK-NEXT: vpop {d7, d8}
+; CHECK-NEXT: pop {r4, pc}
+
+ ; With a guaranteed frame-pointer, we want to make sure that its offset in the
+ ; push block is correct, even if a few registers have been tacked onto a later
+ ; vpush (PR18160).
+; CHECK-IOS-LABEL: test_fold_point:
+; CHECK-IOS: push {r4, r7, lr}
+; CHECK-IOS-NEXT: add r7, sp, #4
+; CHECK-IOS-NEXT: vpush {d7, d8}
+
+ ; We want some memory so there's a stack adjustment to fold...
+ %var = alloca i8, i32 8
+
+ ; We want a long-lived floating register so that a callee-saved dN is used and
+ ; there's both a vpop and a pop.
+ %live_val = load double* @dbl
+ br i1 %tst, label %true, label %end
+true:
+ call void @bar(i8* %var)
+ store double %live_val, double* @dbl
+ br label %end
+end:
+ ; We want the epilogue to be the only thing in a basic block so that we hit
+ ; the correct edge-case (first inst in block is correct one to adjust).
+ ret void
+}
\ No newline at end of file
diff --git a/test/CodeGen/ARM/ifconv-kills.ll b/test/CodeGen/ARM/ifconv-kills.ll
new file mode 100644
index 0000000..bf54ba2
--- /dev/null
+++ b/test/CodeGen/ARM/ifconv-kills.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -march arm -mcpu swift -verify-machineinstrs
+
+declare i32 @f(i32 %p0, i32 %p1)
+
+define i32 @foo(i32* %ptr) {
+entry:
+ %cmp = icmp ne i32* %ptr, null
+ br i1 %cmp, label %if.then, label %if.else
+
+; present something which can be easily if-converted
+if.then:
+ ; %R0 should be killed here
+ %valt = load i32* %ptr, align 4
+ br label %return
+
+if.else:
+ ; %R0 should be killed here, however after if-conversion the %R0 kill
+ ; has to be removed because if.then will follow after this and still
+ ; read it.
+ %addr = getelementptr inbounds i32* %ptr, i32 4
+ %vale = load i32* %addr, align 4
+ br label %return
+
+return:
+ %phival = phi i32 [ %valt, %if.then ], [ %vale, %if.else ]
+ ; suggest to bring %phival/%valt/%vale into %R1 (because otherwise there
+ ; will be no kills in if.then/if.else)
+ %retval = call i32 @f (i32 0, i32 %phival)
+ ret i32 %retval
+}
diff --git a/test/CodeGen/ARM/ifconv-regmask.ll b/test/CodeGen/ARM/ifconv-regmask.ll
new file mode 100644
index 0000000..d45f65f
--- /dev/null
+++ b/test/CodeGen/ARM/ifconv-regmask.ll
@@ -0,0 +1,35 @@
+; RUN: llc < %s -mtriple=thumbv7s-apple-ios6.0.0 -verify-machineinstrs
+
+%union.opcode = type { i32 }
+
+@opcode = external global %union.opcode, align 4
+
+; Function Attrs: nounwind ssp
+define i32 @sfu() {
+entry:
+ %bf.load = load i32* getelementptr inbounds (%union.opcode* @opcode, i32 0, i32 0), align 4
+ %bf.lshr = lshr i32 %bf.load, 26
+ %bf.clear = and i32 %bf.lshr, 7
+ switch i32 %bf.clear, label %return [
+ i32 0, label %sw.bb
+ i32 1, label %sw.bb1
+ ]
+
+sw.bb: ; preds = %entry
+ %call = tail call i32 @func0()
+ br label %return
+
+sw.bb1: ; preds = %entry
+ %call2 = tail call i32 @func1()
+ br label %return
+
+return: ; preds = %sw.bb1, %sw.bb, %entry
+ %retval.0 = phi i32 [ %call2, %sw.bb1 ], [ %call, %sw.bb ], [ -1, %entry ]
+ ret i32 %retval.0
+}
+
+; Function Attrs: nounwind ssp
+declare i32 @func0()
+
+; Function Attrs: nounwind ssp
+declare i32 @func1()
diff --git a/test/CodeGen/ARM/indirectbr.ll b/test/CodeGen/ARM/indirectbr.ll
index 99e84a6..1aeeb91 100644
--- a/test/CodeGen/ARM/indirectbr.ll
+++ b/test/CodeGen/ARM/indirectbr.ll
@@ -1,6 +1,7 @@
; RUN: llc < %s -relocation-model=pic -mtriple=armv6-apple-darwin | FileCheck %s -check-prefix=ARM
; RUN: llc < %s -relocation-model=pic -mtriple=thumbv6-apple-darwin | FileCheck %s -check-prefix=THUMB
; RUN: llc < %s -relocation-model=static -mtriple=thumbv7-apple-darwin | FileCheck %s -check-prefix=THUMB2
+; RUN: llc < %s -relocation-model=static -mtriple=thumbv8-apple-darwin | FileCheck %s -check-prefix=THUMB2
@nextaddr = global i8* null ; <i8**> [#uses=2]
@C.0.2070 = private constant [5 x i8*] [i8* blockaddress(@foo, %L1), i8* blockaddress(@foo, %L2), i8* blockaddress(@foo, %L3), i8* blockaddress(@foo, %L4), i8* blockaddress(@foo, %L5)] ; <[5 x i8*]*> [#uses=1]
@@ -48,14 +49,17 @@ L2: ; preds = %L3, %bb2
L1: ; preds = %L2, %bb2
%res.3 = phi i32 [ %phitmp, %L2 ], [ 2, %bb2 ] ; <i32> [#uses=1]
+; ARM-LABEL: %L1
; ARM: ldr [[R1:r[0-9]+]], LCPI
; ARM: add [[R1b:r[0-9]+]], pc, [[R1]]
; ARM: str [[R1b]]
+; THUMB-LABEL: %L1
; THUMB: ldr
; THUMB: add
; THUMB: ldr [[R2:r[0-9]+]], LCPI
; THUMB: add [[R2]], pc
; THUMB: str [[R2]]
+; THUMB2-LABEL: %L1
; THUMB2: ldr [[R2:r[0-9]+]], LCPI
; THUMB2-NEXT: str{{(.w)?}} [[R2]]
store i8* blockaddress(@foo, %L5), i8** @nextaddr, align 4
diff --git a/test/CodeGen/ARM/inlineasm-64bit.ll b/test/CodeGen/ARM/inlineasm-64bit.ll
index b23db10..683a0c4 100644
--- a/test/CodeGen/ARM/inlineasm-64bit.ll
+++ b/test/CodeGen/ARM/inlineasm-64bit.ll
@@ -85,3 +85,22 @@ define void @strd_test(i64* %p, i32 %lo, i32 %hi) nounwind {
tail call void asm sideeffect "strd $0, ${0:H}, [$1]", "r,r"(i64 %4, i64* %p) nounwind
ret void
}
+
+; Make sure we don't untie operands by mistake.
+define i64 @tied_64bit_test(i64 %in) nounwind {
+; CHECK-LABEL: tied_64bit_test:
+; CHECK: OUT([[OUTREG:r[0-9]+]]), IN([[OUTREG]])
+ %addr = alloca i64
+ call void asm "OUT($0), IN($1)", "=*rm,0"(i64* %addr, i64 %in)
+ ret i64 %in
+}
+
+; If we explicitly name a tied operand, then the code should lookup the operand
+; we were tied to for information about register class and so on.
+define i64 @tied_64bit_lookback_test(i64 %in) nounwind {
+; CHECK-LABEL: tied_64bit_lookback_test:
+; CHECK: OUTLO([[LO:r[0-9]+]]) OUTHI([[HI:r[0-9]+]]) INLO([[LO]]) INHI([[HI]])
+ %vars = call {i64, i32, i64} asm "OUTLO(${2:Q}) OUTHI(${2:R}) INLO(${3:Q}) INHI(${3:R})", "=r,=r,=r,2"(i64 %in)
+ %res = extractvalue {i64, i32, i64} %vars, 2
+ ret i64 %res
+}
diff --git a/test/CodeGen/ARM/interrupt-attr.ll b/test/CodeGen/ARM/interrupt-attr.ll
new file mode 100644
index 0000000..217fd69
--- /dev/null
+++ b/test/CodeGen/ARM/interrupt-attr.ll
@@ -0,0 +1,130 @@
+; RUN: llc -mtriple=arm-none-none-eabi -mcpu=cortex-a15 -o - %s | FileCheck --check-prefix=CHECK-A %s
+; RUN: llc -mtriple=thumb-none-none-eabi -mcpu=cortex-a15 -o - %s | FileCheck --check-prefix=CHECK-A-THUMB %s
+; RUN: llc -mtriple=thumb-apple-darwin -mcpu=cortex-m3 -o - %s | FileCheck --check-prefix=CHECK-M %s
+
+declare arm_aapcscc void @bar()
+
+@bigvar = global [16 x i32] zeroinitializer
+
+define arm_aapcscc void @irq_fn() alignstack(8) "interrupt"="IRQ" {
+ ; Must save all registers except banked sp and lr (we save lr anyway because
+ ; we actually need it at the end to execute the return ourselves).
+
+ ; Also need special function return setting pc and CPSR simultaneously.
+; CHECK-A-LABEL: irq_fn:
+; CHECK-A: push {r0, r1, r2, r3, r11, lr}
+; CHECK-A: add r11, sp, #16
+; CHECK-A: sub sp, sp, #{{[0-9]+}}
+; CHECK-A: bic sp, sp, #7
+; CHECK-A: bl bar
+; CHECK-A: sub sp, r11, #16
+; CHECK-A: pop {r0, r1, r2, r3, r11, lr}
+; CHECK-A: subs pc, lr, #4
+
+; CHECK-A-THUMB-LABEL: irq_fn:
+; CHECK-A-THUMB: push {r0, r1, r2, r3, r4, r7, lr}
+; CHECK-A-THUMB: mov r4, sp
+; CHECK-A-THUMB: add r7, sp, #20
+; CHECK-A-THUMB: bic r4, r4, #7
+; CHECK-A-THUMB: bl bar
+; CHECK-A-THUMB: sub.w r4, r7, #20
+; CHECK-A-THUMB: mov sp, r4
+; CHECK-A-THUMB: pop.w {r0, r1, r2, r3, r4, r7, lr}
+; CHECK-A-THUMB: subs pc, lr, #4
+
+ ; Normal AAPCS function (r0-r3 pushed onto stack by hardware, lr set to
+ ; appropriate sentinel so no special return needed).
+; CHECK-M: push {r4, r7, lr}
+; CHECK-M: add r7, sp, #4
+; CHECK-M: sub sp, #4
+; CHECK-M: mov r4, sp
+; CHECK-M: mov sp, r4
+; CHECK-M: blx _bar
+; CHECK-M: subs r4, r7, #4
+; CHECK-M: mov sp, r4
+; CHECK-M: pop {r4, r7, pc}
+
+ call arm_aapcscc void @bar()
+ ret void
+}
+
+define arm_aapcscc void @fiq_fn() alignstack(8) "interrupt"="FIQ" {
+; CHECK-A-LABEL: fiq_fn:
+; CHECK-A: push {r0, r1, r2, r3, r4, r5, r6, r7, r11, lr}
+ ; 32 to get past r0, r1, ..., r7
+; CHECK-A: add r11, sp, #32
+; CHECK-A: sub sp, sp, #{{[0-9]+}}
+; CHECK-A: bic sp, sp, #7
+; [...]
+ ; 32 must match above
+; CHECK-A: sub sp, r11, #32
+; CHECK-A: pop {r0, r1, r2, r3, r4, r5, r6, r7, r11, lr}
+; CHECK-A: subs pc, lr, #4
+
+ %val = load volatile [16 x i32]* @bigvar
+ store volatile [16 x i32] %val, [16 x i32]* @bigvar
+ ret void
+}
+
+define arm_aapcscc void @swi_fn() alignstack(8) "interrupt"="SWI" {
+; CHECK-A-LABEL: swi_fn:
+; CHECK-A: push {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-A: add r11, sp, #44
+; CHECK-A: sub sp, sp, #{{[0-9]+}}
+; CHECK-A: bic sp, sp, #7
+; [...]
+; CHECK-A: sub sp, r11, #44
+; CHECK-A: pop {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-A: subs pc, lr, #0
+
+ %val = load volatile [16 x i32]* @bigvar
+ store volatile [16 x i32] %val, [16 x i32]* @bigvar
+ ret void
+}
+
+define arm_aapcscc void @undef_fn() alignstack(8) "interrupt"="UNDEF" {
+; CHECK-A-LABEL: undef_fn:
+; CHECK-A: push {r0, r1, r2, r3, r11, lr}
+; CHECK-A: add r11, sp, #16
+; CHECK-A: sub sp, sp, #{{[0-9]+}}
+; CHECK-A: bic sp, sp, #7
+; [...]
+; CHECK-A: sub sp, r11, #16
+; CHECK-A: pop {r0, r1, r2, r3, r11, lr}
+; CHECK-A: subs pc, lr, #0
+
+ call void @bar()
+ ret void
+}
+
+define arm_aapcscc void @abort_fn() alignstack(8) "interrupt"="ABORT" {
+; CHECK-A-LABEL: abort_fn:
+; CHECK-A: push {r0, r1, r2, r3, r11, lr}
+; CHECK-A: add r11, sp, #16
+; CHECK-A: sub sp, sp, #{{[0-9]+}}
+; CHECK-A: bic sp, sp, #7
+; [...]
+; CHECK-A: sub sp, r11, #16
+; CHECK-A: pop {r0, r1, r2, r3, r11, lr}
+; CHECK-A: subs pc, lr, #4
+
+ call void @bar()
+ ret void
+}
+
+@var = global double 0.0
+
+; We don't save VFP regs, since it would be a massive overhead in the general
+; case.
+define arm_aapcscc void @floating_fn() alignstack(8) "interrupt"="IRQ" {
+; CHECK-A-LABEL: floating_fn:
+; CHECK-A-NOT: vpush
+; CHECK-A-NOT: vstr
+; CHECK-A-NOT: vstm
+; CHECK-A: vadd.f64 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+ %lhs = load volatile double* @var
+ %rhs = load volatile double* @var
+ %sum = fadd double %lhs, %rhs
+ store double %sum, double* @var
+ ret void
+}
diff --git a/test/CodeGen/ARM/intrinsics-crypto.ll b/test/CodeGen/ARM/intrinsics-crypto.ll
new file mode 100644
index 0000000..c038fe6
--- /dev/null
+++ b/test/CodeGen/ARM/intrinsics-crypto.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -mtriple=armv8 -mattr=+crypto | FileCheck %s
+
+define arm_aapcs_vfpcc <16 x i8> @test_aesde(<16 x i8>* %a, <16 x i8> *%b) {
+ %tmp = load <16 x i8>* %a
+ %tmp2 = load <16 x i8>* %b
+ %tmp3 = call <16 x i8> @llvm.arm.neon.aesd.v16i8(<16 x i8> %tmp, <16 x i8> %tmp2)
+ ; CHECK: aesd.8 q{{[0-9]+}}, q{{[0-9]+}}
+ %tmp4 = call <16 x i8> @llvm.arm.neon.aese.v16i8(<16 x i8> %tmp3, <16 x i8> %tmp2)
+ ; CHECK: aese.8 q{{[0-9]+}}, q{{[0-9]+}}
+ %tmp5 = call <16 x i8> @llvm.arm.neon.aesimc.v16i8(<16 x i8> %tmp4)
+ ; CHECK: aesimc.8 q{{[0-9]+}}, q{{[0-9]+}}
+ %tmp6 = call <16 x i8> @llvm.arm.neon.aesmc.v16i8(<16 x i8> %tmp5)
+ ; CHECK: aesmc.8 q{{[0-9]+}}, q{{[0-9]+}}
+ ret <16 x i8> %tmp6
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_sha(<4 x i32> *%a, <4 x i32> *%b, <4 x i32> *%c) {
+ %tmp = load <4 x i32>* %a
+ %tmp2 = load <4 x i32>* %b
+ %tmp3 = load <4 x i32>* %c
+ %res1 = call <4 x i32> @llvm.arm.neon.sha1h.v4i32(<4 x i32> %tmp)
+ ; CHECK: sha1h.32 q{{[0-9]+}}, q{{[0-9]+}}
+ %res2 = call <4 x i32> @llvm.arm.neon.sha1c.v4i32(<4 x i32> %tmp2, <4 x i32> %tmp3, <4 x i32> %res1)
+ ; CHECK: sha1c.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+ %res3 = call <4 x i32> @llvm.arm.neon.sha1m.v4i32(<4 x i32> %res2, <4 x i32> %tmp3, <4 x i32> %res1)
+ ; CHECK: sha1m.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+ %res4 = call <4 x i32> @llvm.arm.neon.sha1p.v4i32(<4 x i32> %res3, <4 x i32> %tmp3, <4 x i32> %res1)
+ ; CHECK: sha1p.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+ %res5 = call <4 x i32> @llvm.arm.neon.sha1su0.v4i32(<4 x i32> %res4, <4 x i32> %tmp3, <4 x i32> %res1)
+ ; CHECK: sha1su0.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+ %res6 = call <4 x i32> @llvm.arm.neon.sha1su1.v4i32(<4 x i32> %res5, <4 x i32> %res1)
+ ; CHECK: sha1su1.32 q{{[0-9]+}}, q{{[0-9]+}}
+ %res7 = call <4 x i32> @llvm.arm.neon.sha256h.v4i32(<4 x i32> %res6, <4 x i32> %tmp3, <4 x i32> %res1)
+ ; CHECK: sha256h.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+ %res8 = call <4 x i32> @llvm.arm.neon.sha256h2.v4i32(<4 x i32> %res7, <4 x i32> %tmp3, <4 x i32> %res1)
+ ; CHECK: sha256h2.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+ %res9 = call <4 x i32> @llvm.arm.neon.sha256su1.v4i32(<4 x i32> %res8, <4 x i32> %tmp3, <4 x i32> %res1)
+ ; CHECK: sha256su1.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+ %res10 = call <4 x i32> @llvm.arm.neon.sha256su0.v4i32(<4 x i32> %res9, <4 x i32> %tmp3)
+ ; CHECK: sha256su0.32 q{{[0-9]+}}, q{{[0-9]+}}
+ ret <4 x i32> %res10
+}
+
+declare <16 x i8> @llvm.arm.neon.aesd.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.aese.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.aesimc.v16i8(<16 x i8>)
+declare <16 x i8> @llvm.arm.neon.aesmc.v16i8(<16 x i8>)
+declare <4 x i32> @llvm.arm.neon.sha1h.v4i32(<4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha1c.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha1m.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha1p.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha1su0.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha256h.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha256h2.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha256su1.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha256su0.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.sha1su1.v4i32(<4 x i32>, <4 x i32>)
diff --git a/test/CodeGen/ARM/intrinsics-v8.ll b/test/CodeGen/ARM/intrinsics-v8.ll
new file mode 100644
index 0000000..247bfc1
--- /dev/null
+++ b/test/CodeGen/ARM/intrinsics-v8.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mtriple=armv8 -mattr=+db | FileCheck %s
+
+define void @test() {
+ ; CHECK: dmb sy
+ call void @llvm.arm.dmb(i32 15)
+ ; CHECK: dmb osh
+ call void @llvm.arm.dmb(i32 3)
+ ; CHECK: dsb sy
+ call void @llvm.arm.dsb(i32 15)
+ ; CHECK: dsb ishld
+ call void @llvm.arm.dsb(i32 9)
+ ; CHECK: sevl
+ tail call void @llvm.arm.sevl() nounwind
+ ret void
+}
+
+declare void @llvm.arm.dmb(i32)
+declare void @llvm.arm.dsb(i32)
+declare void @llvm.arm.sevl() nounwind
diff --git a/test/CodeGen/ARM/lit.local.cfg b/test/CodeGen/ARM/lit.local.cfg
index 4d75f58..8a3ba96 100644
--- a/test/CodeGen/ARM/lit.local.cfg
+++ b/test/CodeGen/ARM/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.test']
-
targets = set(config.root.targets_to_build.split())
if not 'ARM' in targets:
config.unsupported = True
diff --git a/test/CodeGen/ARM/long_shift.ll b/test/CodeGen/ARM/long_shift.ll
index a99a7ec..3e986d80 100644
--- a/test/CodeGen/ARM/long_shift.ll
+++ b/test/CodeGen/ARM/long_shift.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -march=arm | FileCheck %s
define i64 @f0(i64 %A, i64 %B) {
-; CHECK: f0
+; CHECK-LABEL: f0:
; CHECK: lsrs r3, r3, #1
; CHECK-NEXT: rrx r2, r2
; CHECK-NEXT: subs r0, r0, r2
@@ -13,7 +13,7 @@ define i64 @f0(i64 %A, i64 %B) {
}
define i32 @f1(i64 %x, i64 %y) {
-; CHECK: f1
+; CHECK-LABEL: f1:
; CHECK: lsl{{.*}}r2
%a = shl i64 %x, %y
%b = trunc i64 %a to i32
@@ -21,7 +21,7 @@ define i32 @f1(i64 %x, i64 %y) {
}
define i32 @f2(i64 %x, i64 %y) {
-; CHECK: f2
+; CHECK-LABEL: f2:
; CHECK: lsr{{.*}}r2
; CHECK-NEXT: rsb r3, r2, #32
; CHECK-NEXT: sub r2, r2, #32
@@ -34,7 +34,7 @@ define i32 @f2(i64 %x, i64 %y) {
}
define i32 @f3(i64 %x, i64 %y) {
-; CHECK: f3
+; CHECK-LABEL: f3:
; CHECK: lsr{{.*}}r2
; CHECK-NEXT: rsb r3, r2, #32
; CHECK-NEXT: sub r2, r2, #32
diff --git a/test/CodeGen/ARM/misched-copy-arm.ll b/test/CodeGen/ARM/misched-copy-arm.ll
index c274545..5da335f 100644
--- a/test/CodeGen/ARM/misched-copy-arm.ll
+++ b/test/CodeGen/ARM/misched-copy-arm.ll
@@ -65,7 +65,7 @@ if.end28: ; preds = %if.then24, %while.c
%dst.1 = phi %struct.rtx_def* [ undef, %if.then24 ], [ %dst.0, %while.cond ], [ %dst.0, %while.cond ]
%arrayidx30 = getelementptr inbounds %struct.rtx_def* %dst.1, i32 0, i32 1, i32 0
%rtx31 = bitcast %union.rtunion_def* %arrayidx30 to %struct.rtx_def**
- %0 = load %struct.rtx_def** %rtx31, align 4, !tbaa !0
+ %0 = load %struct.rtx_def** %rtx31, align 4
br label %while.cond
if.then46: ; preds = %while.cond
@@ -77,7 +77,3 @@ if.end47: ; preds = %while.cond
}
attributes #0 = { nounwind ssp }
-
-!0 = metadata !{metadata !"any pointer", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/ARM/neon-spfp.ll b/test/CodeGen/ARM/neon-spfp.ll
index 5385668..dd2e67f 100644
--- a/test/CodeGen/ARM/neon-spfp.ll
+++ b/test/CodeGen/ARM/neon-spfp.ll
@@ -1,20 +1,20 @@
-; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a5 | FileCheck %s -check-prefix=LINUXA5
-; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a8 | FileCheck %s -check-prefix=LINUXA8
-; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a9 | FileCheck %s -check-prefix=LINUXA9
-; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a15 | FileCheck %s -check-prefix=LINUXA15
-; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=swift | FileCheck %s -check-prefix=LINUXSWIFT
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a5 | FileCheck %s -check-prefix=CHECK-LINUXA5
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a8 | FileCheck %s -check-prefix=CHECK-LINUXA8
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a9 | FileCheck %s -check-prefix=CHECK-LINUXA9
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a15 | FileCheck %s -check-prefix=CHECK-LINUXA15
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=swift | FileCheck %s -check-prefix=CHECK-LINUXSWIFT
-; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a5 --enable-unsafe-fp-math | FileCheck %s -check-prefix=UNSAFEA5
-; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a8 --enable-unsafe-fp-math | FileCheck %s -check-prefix=UNSAFEA8
-; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a9 --enable-unsafe-fp-math | FileCheck %s -check-prefix=UNSAFEA9
-; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a15 --enable-unsafe-fp-math | FileCheck %s -check-prefix=UNSAFEA15
-; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=swift --enable-unsafe-fp-math | FileCheck %s -check-prefix=UNSAFESWIFT
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a5 --enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK-UNSAFEA5
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a8 --enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK-UNSAFEA8
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a9 --enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK-UNSAFEA9
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a15 --enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK-UNSAFEA15
+; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=swift --enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK-UNSAFESWIFT
-; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=cortex-a5 | FileCheck %s -check-prefix=DARWINA5
-; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=cortex-a8 | FileCheck %s -check-prefix=DARWINA8
-; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=cortex-a9 | FileCheck %s -check-prefix=DARWINA9
-; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=cortex-a15 | FileCheck %s -check-prefix=DARWINA15
-; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=swift | FileCheck %s -check-prefix=DARWINSWIFT
+; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=cortex-a5 | FileCheck %s -check-prefix=CHECK-DARWINA5
+; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=cortex-a8 | FileCheck %s -check-prefix=CHECK-DARWINA8
+; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=cortex-a9 | FileCheck %s -check-prefix=CHECK-DARWINA9
+; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=cortex-a15 | FileCheck %s -check-prefix=CHECK-DARWINA15
+; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=swift | FileCheck %s -check-prefix=CHECK-DARWINSWIFT
; This test makes sure we're not lowering VMUL.f32 D* (aka. NEON) for single-prec. FP ops, since
; NEON is not fully IEEE 754 compliant, unless unsafe-math is selected.
diff --git a/test/CodeGen/ARM/neon_spill.ll b/test/CodeGen/ARM/neon_spill.ll
index 277bd05..d286d16 100644
--- a/test/CodeGen/ARM/neon_spill.ll
+++ b/test/CodeGen/ARM/neon_spill.ll
@@ -24,7 +24,7 @@ declare arm_aapcs_vfpcc %2** @func4()
define arm_aapcs_vfpcc void @foo(%3* nocapture) nounwind align 2 {
call void @llvm.arm.neon.vst4.v4i32(i8* undef, <4 x i32> <i32 0, i32 1065353216, i32 1073741824, i32 1077936128>, <4 x i32> <i32 1082130432, i32 1084227584, i32 1086324736, i32 1088421888>, <4 x i32> <i32 1090519040, i32 1091567616, i32 1092616192, i32 1093664768>, <4 x i32> <i32 1094713344, i32 1095761920, i32 1096810496, i32 1097859072>, i32 16) nounwind
%2 = call arm_aapcs_vfpcc %0** @func2() nounwind
- %3 = load %0** %2, align 4, !tbaa !0
+ %3 = load %0** %2, align 4
store float 0.000000e+00, float* undef, align 4
%4 = call arm_aapcs_vfpcc %2* @func3(%2* undef, %2* undef, i32 2956) nounwind
call arm_aapcs_vfpcc void @func1(%0* %3, float* undef, float* undef, %2* undef)
@@ -35,11 +35,11 @@ define arm_aapcs_vfpcc void @foo(%3* nocapture) nounwind align 2 {
%6 = call arm_aapcs_vfpcc %2** @func4() nounwind
%7 = call arm_aapcs_vfpcc %2* @func3(%2* undef, %2* undef, i32 2971) nounwind
%8 = fadd float undef, -1.000000e+05
- store float %8, float* undef, align 16, !tbaa !3
+ store float %8, float* undef, align 16
%9 = call arm_aapcs_vfpcc i32 @rand() nounwind
%10 = fmul float undef, 2.000000e+05
%11 = fadd float %10, -1.000000e+05
- store float %11, float* undef, align 4, !tbaa !3
+ store float %11, float* undef, align 4
call void @llvm.arm.neon.vst4.v4i32(i8* undef, <4 x i32> <i32 0, i32 1065353216, i32 1073741824, i32 1077936128>, <4 x i32> <i32 1082130432, i32 1084227584, i32 1086324736, i32 1088421888>, <4 x i32> <i32 1090519040, i32 1091567616, i32 1092616192, i32 1093664768>, <4 x i32> <i32 1094713344, i32 1095761920, i32 1096810496, i32 1097859072>, i32 16) nounwind
ret void
}
@@ -47,8 +47,3 @@ define arm_aapcs_vfpcc void @foo(%3* nocapture) nounwind align 2 {
declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind
declare arm_aapcs_vfpcc i32 @rand()
-
-!0 = metadata !{metadata !"any pointer", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !"float", metadata !1}
diff --git a/test/CodeGen/ARM/no-fpu.ll b/test/CodeGen/ARM/no-fpu.ll
new file mode 100644
index 0000000..fff4bcc
--- /dev/null
+++ b/test/CodeGen/ARM/no-fpu.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -mtriple=armv7-none-gnueabi -mattr=-neon,-vfp2 | FileCheck --check-prefix=NONEON-NOVFP %s
+; RUN: llc < %s -mtriple=armv7-none-gnueabi -mattr=-neon | FileCheck --check-prefix=NONEON %s
+; RUN: llc < %s -mtriple=armv7-none-gnueabi -mattr=-vfp2 | FileCheck --check-prefix=NOVFP %s
+; RUN: llc < %s -mtriple=armv7-none-gnueabi -mattr=-neon,+vfp2 | FileCheck --check-prefix=NONEON-VFP %s
+
+; Check no NEON instructions are selected when feature is disabled.
+define void @neonop(i64* nocapture readonly %a, i64* nocapture %b) #0 {
+ %1 = bitcast i64* %a to <2 x i64>*
+ %wide.load = load <2 x i64>* %1, align 8
+ ; NONEON-NOVFP-NOT: vld1.64
+ ; NONEON-NOT: vld1.64
+ %add = add <2 x i64> %wide.load, %wide.load
+ ; NONEON-NOVFP-NOT: vadd.i64
+ ; NONEON-NOT: vadd.i64
+ %2 = bitcast i64* %b to <2 x i64>*
+ store <2 x i64> %add, <2 x i64>* %2, align 8
+ ; NONEON-NOVFP-NOT: vst1.64
+ ; NONEON-NOT: vst1.64
+ ret void
+}
+
+; Likewise with VFP instructions.
+define double @fpmult(double %a, double %b) {
+ %res = fmul double %a, %b
+ ; NONEON-NOVFP-NOT: vmov
+ ; NONEON-NOVFP-NOT: vmul.f64
+ ; NOVFP-NOT: vmov
+ ; NOVFP-NOT: vmul.f64
+ ; NONEON-VFP: vmov
+ ; NONEON-VFP: vmul.f64
+ ret double %res
+}
+
diff --git a/test/CodeGen/ARM/noreturn.ll b/test/CodeGen/ARM/noreturn.ll
new file mode 100644
index 0000000..4c876ce
--- /dev/null
+++ b/test/CodeGen/ARM/noreturn.ll
@@ -0,0 +1,50 @@
+; RUN: llc -O3 -o - %s | FileCheck %s
+; Test case from PR16882.
+target triple = "thumbv7s-apple-ios"
+
+define i32 @test1() {
+; CHECK-LABEL: @test1
+; CHECK-NOT: push
+entry:
+ tail call void @overflow() #0
+ unreachable
+}
+
+; Function Attrs: noreturn nounwind
+declare void @overflow() #0
+
+define i32 @test2(i32 %x, i32 %y) {
+; CHECK-LABEL: @test2
+; CHECK-NOT: push
+; CHECK-NOT: pop
+entry:
+ %conv = sext i32 %x to i64
+ %conv1 = sext i32 %y to i64
+ %mul = mul nsw i64 %conv1, %conv
+ %conv2 = trunc i64 %mul to i32
+ %conv3 = sext i32 %conv2 to i64
+ %cmp = icmp eq i64 %mul, %conv3
+ br i1 %cmp, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ tail call void @overflow() #0
+ unreachable
+
+if.end: ; preds = %entry
+ ret i32 %conv2
+}
+
+; Test case for PR17825: the noreturn callee may unwind, so the prologue (push) must still be emitted.
+define i32 @test3() {
+; CHECK-LABEL: @test3
+; CHECK: push
+entry:
+ tail call void @overflow_with_unwind() #1
+ unreachable
+}
+
+; Function Attrs: noreturn
+declare void @overflow_with_unwind() #1
+
+attributes #0 = { noreturn nounwind }
+attributes #1 = { noreturn }
diff --git a/test/CodeGen/ARM/optselect-regclass.ll b/test/CodeGen/ARM/optselect-regclass.ll
new file mode 100644
index 0000000..1aa4520
--- /dev/null
+++ b/test/CodeGen/ARM/optselect-regclass.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=arm -mcpu=swift -verify-machineinstrs
+%union.opcode.0.2.5.8.15.28 = type { i32 }
+
+@opcode = external global %union.opcode.0.2.5.8.15.28, align 4
+@operands = external hidden global [50 x i8], align 4
+@.str86 = external hidden unnamed_addr constant [13 x i8], align 1
+
+; Function Attrs: nounwind ssp
+define void @xfr() {
+entry:
+ %bf.load4 = load i32* getelementptr inbounds (%union.opcode.0.2.5.8.15.28* @opcode, i32 0, i32 0), align 4
+ %bf.clear10 = and i32 %bf.load4, 65535
+ %and11 = and i32 %bf.load4, 32768
+ %tobool12 = icmp ne i32 %and11, 0
+ %cond13 = select i1 %tobool12, i32 1073676288, i32 0
+ %or = or i32 %cond13, %bf.clear10
+ %shl = shl nuw i32 %or, 2
+ %add = add i32 0, %shl
+ tail call void (i8*, i32, i32, i8*, ...)* @__sprintf_chk(i8* getelementptr inbounds ([50 x i8]* @operands, i32 0, i32 0), i32 0, i32 50, i8* getelementptr inbounds ([13 x i8]* @.str86, i32 0, i32 0), i32 undef, i32 undef, i32 %add)
+ ret void
+}
+
+declare void @__sprintf_chk(i8*, i32, i32, i8*, ...)
diff --git a/test/CodeGen/ARM/pic.ll b/test/CodeGen/ARM/pic.ll
new file mode 100644
index 0000000..9fc7a63
--- /dev/null
+++ b/test/CodeGen/ARM/pic.ll
@@ -0,0 +1,23 @@
+; Check the function call in the PIC relocation model.
+
+; If the relocation model is PIC, then the "bl" instruction for the call to
+; the external function should be emitted with a PLT fixup type.
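+
+; For contrast (illustrative only, not checked by this test): under the
+; default static relocation model the same calls would be expected to be
+; emitted as plain "bl get" / "bl put", without the PLT modifier.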
+
+; RUN: llc < %s -mtriple=armv7-unknown-linux-gnueabi \
+; RUN: -relocation-model=pic -fast-isel -verify-machineinstrs \
+; RUN: | FileCheck %s
+
+define void @test() {
+entry:
+
+ %0 = call i32 @get()
+; CHECK: bl get(PLT)
+
+ call void @put(i32 %0)
+; CHECK: bl put(PLT)
+
+ ret void
+}
+
+declare i32 @get()
+declare void @put(i32)
diff --git a/test/CodeGen/ARM/prefetch-thumb.ll b/test/CodeGen/ARM/prefetch-thumb.ll
new file mode 100644
index 0000000..e6f6ae8
--- /dev/null
+++ b/test/CodeGen/ARM/prefetch-thumb.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=thumb -mattr=+v7 | FileCheck %s -check-prefix=THUMB2
+; TODO: This test case will be merged back into prefetch.ll when the ARM mode issue is solved.
+
+declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind
+
+define void @t6() {
+entry:
+;ARM: t6:
+;ARM: pld [sp]
+;ARM: pld [sp, #50]
+
+;THUMB2: t6:
+;THUMB2: pld [sp]
+;THUMB2: pld [sp, #50]
+
+%red = alloca [100 x i8], align 1
+%0 = getelementptr inbounds [100 x i8]* %red, i32 0, i32 0
+%1 = getelementptr inbounds [100 x i8]* %red, i32 0, i32 50
+call void @llvm.prefetch(i8* %0, i32 0, i32 3, i32 1)
+call void @llvm.prefetch(i8* %1, i32 0, i32 3, i32 1)
+ret void
+}
diff --git a/test/CodeGen/ARM/reg_sequence.ll b/test/CodeGen/ARM/reg_sequence.ll
index 3fe2bb8..25484f4 100644
--- a/test/CodeGen/ARM/reg_sequence.ll
+++ b/test/CodeGen/ARM/reg_sequence.ll
@@ -239,10 +239,9 @@ bb14: ; preds = %bb6
; PR7157
define arm_aapcs_vfpcc float @t9(%0* nocapture, %3* nocapture) nounwind {
; CHECK-LABEL: t9:
-; CHECK: vldr
-; CHECK-NOT: vmov d{{.*}}, d16
-; CHECK: vmov.i32 d17
+; CHECK: vmov.i32 d16, #0x0
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
+; CHECK-NEXT: vorr d17, d16, d16
; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
%3 = bitcast double 0.000000e+00 to <2 x float> ; <<2 x float>> [#uses=2]
%4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]
diff --git a/test/CodeGen/ARM/select-imm.ll b/test/CodeGen/ARM/select-imm.ll
index 5e7506a..6f4bfb8 100644
--- a/test/CodeGen/ARM/select-imm.ll
+++ b/test/CodeGen/ARM/select-imm.ll
@@ -11,7 +11,7 @@ entry:
; ARMT2-LABEL: t1:
; ARMT2: movw [[R:r[0-1]]], #357
-; ARMT2: movgt [[R]], #123
+; ARMT2: movwgt [[R]], #123
; THUMB2-LABEL: t1:
; THUMB2: movw [[R:r[0-1]]], #357
@@ -25,9 +25,9 @@ entry:
define i32 @t2(i32 %c) nounwind readnone {
entry:
; ARM-LABEL: t2:
-; ARM: mov [[R:r[0-1]]], #123
-; ARM: movgt [[R]], #101
-; ARM: orrgt [[R]], [[R]], #256
+; ARM: mov [[R:r[0-9]+]], #101
+; ARM: orr [[R]], [[R]], #256
+; ARM: movle [[R]], #123
; ARMT2-LABEL: t2:
; ARMT2: mov [[R:r[0-1]]], #123
@@ -50,7 +50,7 @@ entry:
; ARMT2-LABEL: t3:
; ARMT2: mov [[R:r[0-1]]], #0
-; ARMT2: moveq [[R]], #1
+; ARMT2: movweq [[R]], #1
; THUMB2-LABEL: t3:
; THUMB2: mov{{(s|\.w)}} [[R:r[0-1]]], #0
diff --git a/test/CodeGen/ARM/select-undef.ll b/test/CodeGen/ARM/select-undef.ll
new file mode 100644
index 0000000..23f7eb8
--- /dev/null
+++ b/test/CodeGen/ARM/select-undef.ll
@@ -0,0 +1,7 @@
+; RUN: llc < %s -march=arm -mcpu=swift -verify-machineinstrs
+define i32 @func(i32 %arg0, i32 %arg1) {
+entry:
+ %cmp = icmp slt i32 %arg0, 10
+ %v = select i1 %cmp, i32 undef, i32 %arg1
+ ret i32 %v
+}
diff --git a/test/CodeGen/ARM/select.ll b/test/CodeGen/ARM/select.ll
index d5c3a27..ed006d6 100644
--- a/test/CodeGen/ARM/select.ll
+++ b/test/CodeGen/ARM/select.ll
@@ -59,7 +59,7 @@ entry:
define double @f7(double %a, double %b) {
;CHECK-LABEL: f7:
;CHECK: movlt
-;CHECK: movlt
+;CHECK: movge
;CHECK-VFP-LABEL: f7:
;CHECK-VFP: vmovmi
%tmp = fcmp olt double %a, 1.234e+00
@@ -75,7 +75,7 @@ define double @f7(double %a, double %b) {
; into the constant pool based on the value of the "icmp". If we have one "it"
; block generated, odds are good that we have close to the ideal code for this:
;
-; CHECK-NEON: _f8:
+; CHECK-NEON-LABEL: f8:
; CHECK-NEON: movw [[R3:r[0-9]+]], #1123
; CHECK-NEON: adr [[R2:r[0-9]+]], LCPI7_0
; CHECK-NEON-NEXT: cmp r0, [[R3]]
@@ -113,7 +113,7 @@ entry:
ret void
}
-; CHECK: f10
+; CHECK-LABEL: f10:
define float @f10(i32 %a, i32 %b) nounwind uwtable readnone ssp {
; CHECK-NOT: floatsisf
%1 = icmp eq i32 %a, %b
@@ -122,7 +122,7 @@ define float @f10(i32 %a, i32 %b) nounwind uwtable readnone ssp {
ret float %3
}
-; CHECK: f11
+; CHECK-LABEL: f11:
define float @f11(i32 %a, i32 %b) nounwind uwtable readnone ssp {
; CHECK-NOT: floatsisf
%1 = icmp eq i32 %a, %b
@@ -130,7 +130,7 @@ define float @f11(i32 %a, i32 %b) nounwind uwtable readnone ssp {
ret float %2
}
-; CHECK: f12
+; CHECK-LABEL: f12:
define float @f12(i32 %a, i32 %b) nounwind uwtable readnone ssp {
; CHECK-NOT: floatunsisf
%1 = icmp eq i32 %a, %b
diff --git a/test/CodeGen/ARM/setcc-sentinals.ll b/test/CodeGen/ARM/setcc-sentinals.ll
new file mode 100644
index 0000000..8878f9b
--- /dev/null
+++ b/test/CodeGen/ARM/setcc-sentinals.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mcpu=cortex-a8 -march=arm -asm-verbose=false | FileCheck %s
+
+define zeroext i1 @test0(i32 %x) nounwind {
+; CHECK-LABEL: test0:
+; CHECK-NEXT: add [[REG:(r[0-9]+)|(lr)]], r0, #1
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: cmp [[REG]], #1
+; CHECK-NEXT: movwhi r0, #1
+; CHECK-NEXT: bx lr
+ %cmp1 = icmp ne i32 %x, -1
+ %not.cmp = icmp ne i32 %x, 0
+ %.cmp1 = and i1 %cmp1, %not.cmp
+ ret i1 %.cmp1
+}
diff --git a/test/CodeGen/ARM/sincos.ll b/test/CodeGen/ARM/sincos.ll
new file mode 100644
index 0000000..30b2664
--- /dev/null
+++ b/test/CodeGen/ARM/sincos.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=armv7-apple-ios6 -mcpu=cortex-a8 | FileCheck %s --check-prefix=NOOPT
+; RUN: llc < %s -mtriple=armv7-apple-ios7 -mcpu=cortex-a8 | FileCheck %s --check-prefix=SINCOS
+
+; Combine sin / cos into a single call.
+; rdar://12856873
+
+define float @test1(float %x) nounwind {
+entry:
+; SINCOS-LABEL: test1:
+; SINCOS: bl ___sincosf_stret
+
+; NOOPT-LABEL: test1:
+; NOOPT: bl _sinf
+; NOOPT: bl _cosf
+ %call = tail call float @sinf(float %x) nounwind readnone
+ %call1 = tail call float @cosf(float %x) nounwind readnone
+ %add = fadd float %call, %call1
+ ret float %add
+}
+
+define double @test2(double %x) nounwind {
+entry:
+; SINCOS-LABEL: test2:
+; SINCOS: bl ___sincos_stret
+
+; NOOPT-LABEL: test2:
+; NOOPT: bl _sin
+; NOOPT: bl _cos
+ %call = tail call double @sin(double %x) nounwind readnone
+ %call1 = tail call double @cos(double %x) nounwind readnone
+ %add = fadd double %call, %call1
+ ret double %add
+}
+
+declare float @sinf(float) readonly
+declare double @sin(double) readonly
+declare float @cosf(float) readonly
+declare double @cos(double) readonly
diff --git a/test/CodeGen/ARM/stack-protector-bmovpcb_call.ll b/test/CodeGen/ARM/stack-protector-bmovpcb_call.ll
new file mode 100644
index 0000000..f5cda14
--- /dev/null
+++ b/test/CodeGen/ARM/stack-protector-bmovpcb_call.ll
@@ -0,0 +1,32 @@
+; RUN: llc -O3 -mcpu=swift -mtriple=armv7s-apple-ios6.0.0 %s -o /dev/null
+; rdar://14811848
+
+; Make sure that we do not emit the BMOVPCB_CALL instruction for now, or, if
+; we fix the assumptions in its implementation, that we do not crash when
+; emitting it.
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "armv7s-apple-ios6.0.0"
+
+@main.title = private unnamed_addr constant [15 x i8] c"foo and stuff\0A\00", align 1
+@.str = private unnamed_addr constant [3 x i8] c"%s\00", align 1
+
+; Function Attrs: nounwind optsize ssp
+define i32 @main() #0 {
+entry:
+ %title = alloca [15 x i8], align 1
+ %0 = getelementptr inbounds [15 x i8]* %title, i32 0, i32 0
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* getelementptr inbounds ([15 x i8]* @main.title, i32 0, i32 0), i32 15, i32 1, i1 false)
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i8* %0) #3
+ ret i32 0
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1
+
+; Function Attrs: nounwind optsize
+declare i32 @printf(i8* nocapture readonly, ...) #2
+
+attributes #0 = { nounwind optsize ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind optsize }
diff --git a/test/CodeGen/ARM/struct-byval-frame-index.ll b/test/CodeGen/ARM/struct-byval-frame-index.ll
index ae68ce5..465ee12 100644
--- a/test/CodeGen/ARM/struct-byval-frame-index.ll
+++ b/test/CodeGen/ARM/struct-byval-frame-index.ll
@@ -72,10 +72,10 @@ declare void @SetMotionVectorsMB(%structK* nocapture, i32) #1
; Function Attrs: nounwind
define void @set_stored_macroblock_parameters() #1 {
entry:
- %0 = load %structB** @img, align 4, !tbaa !0
- %1 = load i32* undef, align 4, !tbaa !3
+ %0 = load %structB** @img, align 4
+ %1 = load i32* undef, align 4
%mb_data = getelementptr inbounds %structB* %0, i32 0, i32 61
- %2 = load %structK** %mb_data, align 4, !tbaa !0
+ %2 = load %structK** %mb_data, align 4
br label %for.body
for.body: ; preds = %for.body, %entry
@@ -109,7 +109,7 @@ for.body119: ; preds = %for.body119, %for.c
br i1 undef, label %for.body119, label %if.end164
if.end164: ; preds = %for.body119, %for.cond47.preheader, %if.end43
- store i32*** null, i32**** @cofDC, align 4, !tbaa !0
+ store i32*** null, i32**** @cofDC, align 4
%mb_type = getelementptr inbounds %structK* %2, i32 %1, i32 8
br i1 undef, label %if.end230, label %if.then169
@@ -134,7 +134,7 @@ if.then233: ; preds = %if.end230
if.end236: ; preds = %if.end230
%cmp242 = icmp ne i16 undef, 8
- %4 = load i32* @luma_transform_size_8x8_flag, align 4, !tbaa !3
+ %4 = load i32* @luma_transform_size_8x8_flag, align 4
%tobool245 = icmp ne i32 %4, 0
%or.cond812 = or i1 %cmp242, %tobool245
br i1 %or.cond812, label %if.end249, label %land.lhs.true246
@@ -150,11 +150,11 @@ if.then248: ; preds = %land.lhs.true246
br label %if.end249
if.end249: ; preds = %if.then248, %land.lhs.true246, %if.end236
- %5 = load i32* @luma_transform_size_8x8_flag, align 4, !tbaa !3
- %6 = load %structA** @rdopt, align 4, !tbaa !0
+ %5 = load i32* @luma_transform_size_8x8_flag, align 4
+ %6 = load %structA** @rdopt, align 4
%luma_transform_size_8x8_flag264 = getelementptr inbounds %structA* %6, i32 0, i32 21
- store i32 %5, i32* %luma_transform_size_8x8_flag264, align 4, !tbaa !3
- %7 = load i32* undef, align 4, !tbaa !3
+ store i32 %5, i32* %luma_transform_size_8x8_flag264, align 4
+ %7 = load i32* undef, align 4
%add281 = add nsw i32 %7, 0
br label %for.body285
@@ -162,36 +162,36 @@ for.body285: ; preds = %for.inc503, %if.end
%8 = phi %structB* [ undef, %if.end249 ], [ %.pre1155, %for.inc503 ]
%i.21103 = phi i32 [ 0, %if.end249 ], [ %inc504, %for.inc503 ]
%block_x286 = getelementptr inbounds %structB* %8, i32 0, i32 37
- %9 = load i32* %block_x286, align 4, !tbaa !3
+ %9 = load i32* %block_x286, align 4
%add287 = add nsw i32 %9, %i.21103
%shr289 = ashr i32 %i.21103, 1
%add290 = add nsw i32 %shr289, 0
%arrayidx292 = getelementptr inbounds %structK* %2, i32 %1, i32 15, i32 %add290
- %10 = load %structM** @enc_picture, align 4, !tbaa !0
+ %10 = load %structM** @enc_picture, align 4
%ref_idx = getelementptr inbounds %structM* %10, i32 0, i32 35
- %11 = load i8**** %ref_idx, align 4, !tbaa !0
- %12 = load i8*** %11, align 4, !tbaa !0
+ %11 = load i8**** %ref_idx, align 4
+ %12 = load i8*** %11, align 4
%arrayidx313 = getelementptr inbounds i8** %12, i32 %add281
- %13 = load i8** %arrayidx313, align 4, !tbaa !0
+ %13 = load i8** %arrayidx313, align 4
%arrayidx314 = getelementptr inbounds i8* %13, i32 %add287
- store i8 -1, i8* %arrayidx314, align 1, !tbaa !1
- %14 = load %structB** @img, align 4, !tbaa !0
+ store i8 -1, i8* %arrayidx314, align 1
+ %14 = load %structB** @img, align 4
%MbaffFrameFlag327 = getelementptr inbounds %structB* %14, i32 0, i32 100
- %15 = load i32* %MbaffFrameFlag327, align 4, !tbaa !3
+ %15 = load i32* %MbaffFrameFlag327, align 4
%tobool328 = icmp eq i32 %15, 0
br i1 %tobool328, label %if.end454, label %if.then329
if.then329: ; preds = %for.body285
- %16 = load %structA** @rdopt, align 4, !tbaa !0
+ %16 = load %structA** @rdopt, align 4
br label %if.end454
if.end454: ; preds = %if.then329, %for.body285
- %17 = load i32* %arrayidx292, align 4, !tbaa !3
+ %17 = load i32* %arrayidx292, align 4
%cmp457 = icmp eq i32 %17, 0
br i1 %cmp457, label %if.then475, label %lor.lhs.false459
lor.lhs.false459: ; preds = %if.end454
- %18 = load i32* %mb_type, align 4, !tbaa !3
+ %18 = load i32* %mb_type, align 4
switch i32 %18, label %for.inc503 [
i32 9, label %if.then475
i32 10, label %if.then475
@@ -200,12 +200,12 @@ lor.lhs.false459: ; preds = %if.end454
]
if.then475: ; preds = %lor.lhs.false459, %lor.lhs.false459, %lor.lhs.false459, %lor.lhs.false459, %if.end454
- store i16 0, i16* undef, align 2, !tbaa !4
+ store i16 0, i16* undef, align 2
br label %for.inc503
for.inc503: ; preds = %if.then475, %lor.lhs.false459
%inc504 = add nsw i32 %i.21103, 1
- %.pre1155 = load %structB** @img, align 4, !tbaa !0
+ %.pre1155 = load %structB** @img, align 4
br label %for.body285
}
@@ -216,10 +216,4 @@ declare void @update_offset_params(i32, i32) #1
declare void @RestoreMVBlock8x8(i32, i32, %structN* byval nocapture, i32) #1
attributes #0 = { nounwind }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!0 = metadata !{metadata !"any pointer", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"int", metadata !1}
-!4 = metadata !{metadata !"short", metadata !1}
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/ARM/struct_byval.ll b/test/CodeGen/ARM/struct_byval.ll
index 012b994..130925a 100644
--- a/test/CodeGen/ARM/struct_byval.ll
+++ b/test/CodeGen/ARM/struct_byval.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=armv7-apple-ios6.0 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios6.0 | FileCheck %s -check-prefix=THUMB
; rdar://9877866
%struct.SmallStruct = type { i32, [8 x i32], [37 x i8] }
@@ -10,6 +11,10 @@ entry:
; CHECK: ldr
; CHECK: str
; CHECK-NOT:bne
+; THUMB-LABEL: f:
+; THUMB: ldr
+; THUMB: str
+; THUMB-NOT:bne
%st = alloca %struct.SmallStruct, align 4
%call = call i32 @e1(%struct.SmallStruct* byval %st)
ret i32 0
@@ -23,6 +28,11 @@ entry:
; CHECK: sub
; CHECK: str
; CHECK: bne
+; THUMB-LABEL: g:
+; THUMB: ldr
+; THUMB: sub
+; THUMB: str
+; THUMB: bne
%st = alloca %struct.LargeStruct, align 4
%call = call i32 @e2(%struct.LargeStruct* byval %st)
ret i32 0
@@ -36,6 +46,11 @@ entry:
; CHECK: sub
; CHECK: vst1
; CHECK: bne
+; THUMB-LABEL: h:
+; THUMB: vld1
+; THUMB: sub
+; THUMB: vst1
+; THUMB: bne
%st = alloca %struct.LargeStruct, align 16
%call = call i32 @e3(%struct.LargeStruct* byval align 16 %st)
ret i32 0
@@ -49,8 +64,10 @@ declare i32 @e3(%struct.LargeStruct* nocapture byval align 16 %in) nounwind
; We can't do tail call since address of s is passed to the callee and part of
; s is in caller's local frame.
define void @f3(%struct.SmallStruct* nocapture byval %s) nounwind optsize {
-; CHECK: f3
+; CHECK-LABEL: f3
; CHECK: bl _consumestruct
+; THUMB-LABEL: f3
+; THUMB: blx _consumestruct
entry:
%0 = bitcast %struct.SmallStruct* %s to i8*
tail call void @consumestruct(i8* %0, i32 80) optsize
@@ -58,8 +75,10 @@ entry:
}
define void @f4(%struct.SmallStruct* nocapture byval %s) nounwind optsize {
-; CHECK: f4
+; CHECK-LABEL: f4
; CHECK: bl _consumestruct
+; THUMB-LABEL: f4
+; THUMB: blx _consumestruct
entry:
%addr = getelementptr inbounds %struct.SmallStruct* %s, i32 0, i32 0
%0 = bitcast i32* %addr to i8*
@@ -69,8 +88,10 @@ entry:
; We can do tail call here since s is in the incoming argument area.
define void @f5(i32 %a, i32 %b, i32 %c, i32 %d, %struct.SmallStruct* nocapture byval %s) nounwind optsize {
-; CHECK: f5
+; CHECK-LABEL: f5
; CHECK: b _consumestruct
+; THUMB-LABEL: f5
+; THUMB: b.w _consumestruct
entry:
%0 = bitcast %struct.SmallStruct* %s to i8*
tail call void @consumestruct(i8* %0, i32 80) optsize
@@ -78,8 +99,10 @@ entry:
}
define void @f6(i32 %a, i32 %b, i32 %c, i32 %d, %struct.SmallStruct* nocapture byval %s) nounwind optsize {
-; CHECK: f6
+; CHECK-LABEL: f6
; CHECK: b _consumestruct
+; THUMB-LABEL: f6
+; THUMB: b.w _consumestruct
entry:
%addr = getelementptr inbounds %struct.SmallStruct* %s, i32 0, i32 0
%0 = bitcast i32* %addr to i8*
@@ -88,3 +111,19 @@ entry:
}
declare void @consumestruct(i8* nocapture %structp, i32 %structsize) nounwind
+
+; PR17309
+%struct.I.8 = type { [10 x i32], [3 x i8] }
+
+declare void @use_I(%struct.I.8* byval)
+define void @test_I_16() {
+; CHECK-LABEL: test_I_16
+; CHECK: ldrb
+; CHECK: strb
+; THUMB-LABEL: test_I_16
+; THUMB: ldrb
+; THUMB: strb
+entry:
+ call void @use_I(%struct.I.8* byval align 16 undef)
+ ret void
+}
diff --git a/test/CodeGen/ARM/struct_byval_arm_t1_t2.ll b/test/CodeGen/ARM/struct_byval_arm_t1_t2.ll
new file mode 100644
index 0000000..1899269
--- /dev/null
+++ b/test/CodeGen/ARM/struct_byval_arm_t1_t2.ll
@@ -0,0 +1,1523 @@
+;RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mattr=+neon -verify-machineinstrs -filetype=obj | llvm-objdump -triple armv7-none-linux-gnueabi -disassemble - | FileCheck %s --check-prefix=ARM
+;RUN: llc < %s -mtriple=thumbv7-none-linux-gnueabi -mattr=+neon -verify-machineinstrs -filetype=obj | llvm-objdump -triple thumbv7-none-linux-gnueabi -disassemble - | FileCheck %s --check-prefix=THUMB2
+;RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mattr=-neon -verify-machineinstrs -filetype=obj | llvm-objdump -triple armv7-none-linux-gnueabi -disassemble - | FileCheck %s --check-prefix=NO_NEON
+;We want to have both positive and negative checks for thumb1. These checks
+;are not easy to do in a single pass, so we generate the output once to a
+;temp file and run FileCheck twice with different prefixes.
+;RUN: llc < %s -mtriple=thumbv5-none-linux-gnueabi -verify-machineinstrs -filetype=obj | llvm-objdump -triple thumbv5-none-linux-gnueabi -disassemble - > %t
+;RUN: cat %t | FileCheck %s --check-prefix=THUMB1
+;RUN: cat %t | FileCheck %s --check-prefix=T1POST
+
+;This file contains auto-generated tests for the lowering of byval struct
+;arguments in the ARM backend. We have tests for both packed and unpacked
+;structs at varying alignments. Each test is run for arm, thumb2 and thumb1.
+;We check for the strings in the generated object code using llvm-objdump,
+;because it provides better assurance that we are generating instructions
+;for the correct architecture. Otherwise we could accidentally generate an
+;ARM instruction for THUMB1 and would not detect it, because the assembly
+;representation is the same but the object code would be encoded incorrectly.
+;For each test we check for the label, a load instruction of the correct
+;form, a branch if the copy is generated as a loop, and the leftover cleanup
+;when the number of bytes does not divide evenly by the store size (sketched
+;just below).
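+;
+;Illustrative sketch (not itself checked by FileCheck; register numbers and
+;the exact loop structure are assumptions): for a large struct such as
+;%struct.D, the ARM and Thumb2 copies are expected to use post-indexed loads
+;inside a loop, roughly
+;    ldr r3, [r1], #4    ; load a word and advance the base in one instruction
+;    ...
+;    bne <copy loop>
+;whereas Thumb1 has no post-indexed addressing, so the base update is a
+;separate instruction:
+;    ldr r3, [r1]
+;    adds r1, #4
+;The T1POST prefix checks that no post-indexed form appears in Thumb1 output.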
+
+%struct.A = type <{ [ 10 x i32 ] }> ; 40 bytes
+declare void @use_A(%struct.A* byval)
+%struct.B = type <{ [ 10 x i32 ], i8 }> ; 41 bytes
+declare void @use_B(%struct.B* byval)
+%struct.C = type <{ [ 10 x i32 ], [ 3 x i8 ] }> ; 43 bytes
+declare void @use_C(%struct.C* byval)
+%struct.D = type <{ [ 100 x i32 ] }> ; 400 bytes
+declare void @use_D(%struct.D* byval)
+%struct.E = type <{ [ 100 x i32 ], i8 }> ; 401 bytes
+declare void @use_E(%struct.E* byval)
+%struct.F = type <{ [ 100 x i32 ], [ 3 x i8 ] }> ; 403 bytes
+declare void @use_F(%struct.F* byval)
+%struct.G = type { [ 10 x i32 ] } ; 40 bytes
+declare void @use_G(%struct.G* byval)
+%struct.H = type { [ 10 x i32 ], i8 } ; 41 bytes
+declare void @use_H(%struct.H* byval)
+%struct.I = type { [ 10 x i32 ], [ 3 x i8 ] } ; 43 bytes
+declare void @use_I(%struct.I* byval)
+%struct.J = type { [ 100 x i32 ] } ; 400 bytes
+declare void @use_J(%struct.J* byval)
+%struct.K = type { [ 100 x i32 ], i8 } ; 401 bytes
+declare void @use_K(%struct.K* byval)
+%struct.L = type { [ 100 x i32 ], [ 3 x i8 ] } ; 403 bytes
+declare void @use_L(%struct.L* byval)
+
+;ARM-LABEL: test_A_1:
+;THUMB2-LABEL: test_A_1:
+;NO_NEON-LABEL:test_A_1:
+;THUMB1-LABEL: test_A_1:
+;T1POST-LABEL: test_A_1:
+ define void @test_A_1() {
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #1
+
+;T1POST-NOT: ldrb r{{[0-9]+}}, [{{.*}}], #1
+ entry:
+ %a = alloca %struct.A, align 1
+ call void @use_A(%struct.A* byval align 1 %a)
+ ret void
+ }
+;ARM-LABEL: test_A_2:
+;THUMB2-LABEL: test_A_2:
+;NO_NEON-LABEL:test_A_2:
+;THUMB1-LABEL: test_A_2:
+;T1POST-LABEL: test_A_2:
+ define void @test_A_2() {
+;ARM: ldrh r{{[0-9]+}}, [{{.*}}], #2
+
+;THUMB2: ldrh r{{[0-9]+}}, [{{.*}}], #2
+
+;NO_NEON: ldrh r{{[0-9]+}}, [{{.*}}], #2
+
+;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #2
+
+;T1POST-NOT: ldrh r{{[0-9]+}}, [{{.*}}], #2
+ entry:
+ %a = alloca %struct.A, align 2
+ call void @use_A(%struct.A* byval align 2 %a)
+ ret void
+ }
+;ARM-LABEL: test_A_4:
+;THUMB2-LABEL: test_A_4:
+;NO_NEON-LABEL:test_A_4:
+;THUMB1-LABEL: test_A_4:
+;T1POST-LABEL: test_A_4:
+ define void @test_A_4() {
+;ARM: ldr r{{[0-9]+}}, [{{.*}}], #4
+
+;THUMB2: ldr r{{[0-9]+}}, [{{.*}}], #4
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+
+;T1POST-NOT: ldr r{{[0-9]+}}, [{{.*}}], #4
+ entry:
+ %a = alloca %struct.A, align 4
+ call void @use_A(%struct.A* byval align 4 %a)
+ ret void
+ }
+;ARM-LABEL: test_A_8:
+;THUMB2-LABEL: test_A_8:
+;NO_NEON-LABEL:test_A_8:
+;THUMB1-LABEL: test_A_8:
+;T1POST-LABEL: test_A_8:
+ define void @test_A_8() {
+;ARM: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB2: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.A, align 8
+ call void @use_A(%struct.A* byval align 8 %a)
+ ret void
+ }
+;ARM-LABEL: test_A_16:
+;THUMB2-LABEL: test_A_16:
+;NO_NEON-LABEL:test_A_16:
+;THUMB1-LABEL: test_A_16:
+;T1POST-LABEL: test_A_16:
+ define void @test_A_16() {
+;ARM: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.A, align 16
+ call void @use_A(%struct.A* byval align 16 %a)
+ ret void
+ }
+;ARM-LABEL: test_B_1:
+;THUMB2-LABEL: test_B_1:
+;NO_NEON-LABEL:test_B_1:
+;THUMB1-LABEL: test_B_1:
+;T1POST-LABEL: test_B_1:
+ define void @test_B_1() {
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #1
+
+;T1POST-NOT: ldrb r{{[0-9]+}}, [{{.*}}], #1
+ entry:
+ %a = alloca %struct.B, align 1
+ call void @use_B(%struct.B* byval align 1 %a)
+ ret void
+ }
+;ARM-LABEL: test_B_2:
+;THUMB2-LABEL: test_B_2:
+;NO_NEON-LABEL:test_B_2:
+;THUMB1-LABEL: test_B_2:
+;T1POST-LABEL: test_B_2:
+ define void @test_B_2() {
+;ARM: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #2
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+
+;T1POST-NOT: ldrh r{{[0-9]+}}, [{{.*}}], #2
+ entry:
+ %a = alloca %struct.B, align 2
+ call void @use_B(%struct.B* byval align 2 %a)
+ ret void
+ }
+;ARM-LABEL: test_B_4:
+;THUMB2-LABEL: test_B_4:
+;NO_NEON-LABEL:test_B_4:
+;THUMB1-LABEL: test_B_4:
+;T1POST-LABEL: test_B_4:
+ define void @test_B_4() {
+;ARM: ldr r{{[0-9]+}}, [{{.*}}], #4
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2: ldr r{{[0-9]+}}, [{{.*}}], #4
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+
+;T1POST-NOT: ldr r{{[0-9]+}}, [{{.*}}], #4
+ entry:
+ %a = alloca %struct.B, align 4
+ call void @use_B(%struct.B* byval align 4 %a)
+ ret void
+ }
+;ARM-LABEL: test_B_8:
+;THUMB2-LABEL: test_B_8:
+;NO_NEON-LABEL:test_B_8:
+;THUMB1-LABEL: test_B_8:
+;T1POST-LABEL: test_B_8:
+ define void @test_B_8() {
+;ARM: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.B, align 8
+ call void @use_B(%struct.B* byval align 8 %a)
+ ret void
+ }
+;ARM-LABEL: test_B_16:
+;THUMB2-LABEL: test_B_16:
+;NO_NEON-LABEL:test_B_16:
+;THUMB1-LABEL: test_B_16:
+;T1POST-LABEL: test_B_16:
+ define void @test_B_16() {
+;ARM: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.B, align 16
+ call void @use_B(%struct.B* byval align 16 %a)
+ ret void
+ }
+;ARM-LABEL: test_C_1:
+;THUMB2-LABEL: test_C_1:
+;NO_NEON-LABEL:test_C_1:
+;THUMB1-LABEL: test_C_1:
+;T1POST-LABEL: test_C_1:
+ define void @test_C_1() {
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #1
+
+;T1POST-NOT: ldrb r{{[0-9]+}}, [{{.*}}], #1
+ entry:
+ %a = alloca %struct.C, align 1
+ call void @use_C(%struct.C* byval align 1 %a)
+ ret void
+ }
+;ARM-LABEL: test_C_2:
+;THUMB2-LABEL: test_C_2:
+;NO_NEON-LABEL:test_C_2:
+;THUMB1-LABEL: test_C_2:
+;T1POST-LABEL: test_C_2:
+ define void @test_C_2() {
+;ARM: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #2
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+
+;T1POST-NOT: ldrh r{{[0-9]+}}, [{{.*}}], #2
+ entry:
+ %a = alloca %struct.C, align 2
+ call void @use_C(%struct.C* byval align 2 %a)
+ ret void
+ }
+;ARM-LABEL: test_C_4:
+;THUMB2-LABEL: test_C_4:
+;NO_NEON-LABEL:test_C_4:
+;THUMB1-LABEL: test_C_4:
+;T1POST-LABEL: test_C_4:
+ define void @test_C_4() {
+;ARM: ldr r{{[0-9]+}}, [{{.*}}], #4
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2: ldr r{{[0-9]+}}, [{{.*}}], #4
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #1
+
+;T1POST-NOT: ldr r{{[0-9]+}}, [{{.*}}], #4
+ entry:
+ %a = alloca %struct.C, align 4
+ call void @use_C(%struct.C* byval align 4 %a)
+ ret void
+ }
+;ARM-LABEL: test_C_8:
+;THUMB2-LABEL: test_C_8:
+;NO_NEON-LABEL:test_C_8:
+;THUMB1-LABEL: test_C_8:
+;T1POST-LABEL: test_C_8:
+ define void @test_C_8() {
+;ARM: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #1
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.C, align 8
+ call void @use_C(%struct.C* byval align 8 %a)
+ ret void
+ }
+;ARM-LABEL: test_C_16:
+;THUMB2-LABEL: test_C_16:
+;NO_NEON-LABEL:test_C_16:
+;THUMB1-LABEL: test_C_16:
+;T1POST-LABEL: test_C_16:
+ define void @test_C_16() {
+;ARM: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #1
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.C, align 16
+ call void @use_C(%struct.C* byval align 16 %a)
+ ret void
+ }
+;ARM-LABEL: test_D_1:
+;THUMB2-LABEL: test_D_1:
+;NO_NEON-LABEL:test_D_1:
+;THUMB1-LABEL: test_D_1:
+;T1POST-LABEL: test_D_1:
+ define void @test_D_1() {
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;ARM: bne
+
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;THUMB2: bne
+
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON: bne
+
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #1
+;THUMB1: bne
+
+;T1POST-NOT: ldrb r{{[0-9]+}}, [{{.*}}], #1
+ entry:
+ %a = alloca %struct.D, align 1
+ call void @use_D(%struct.D* byval align 1 %a)
+ ret void
+ }
+;ARM-LABEL: test_D_2:
+;THUMB2-LABEL: test_D_2:
+;NO_NEON-LABEL:test_D_2:
+;THUMB1-LABEL: test_D_2:
+;T1POST-LABEL: test_D_2:
+ define void @test_D_2() {
+;ARM: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;ARM: bne
+
+;THUMB2: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;THUMB2: bne
+
+;NO_NEON: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;NO_NEON: bne
+
+;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #2
+;THUMB1: bne
+
+;T1POST-NOT: ldrh r{{[0-9]+}}, [{{.*}}], #2
+ entry:
+ %a = alloca %struct.D, align 2
+ call void @use_D(%struct.D* byval align 2 %a)
+ ret void
+ }
+;ARM-LABEL: test_D_4:
+;THUMB2-LABEL: test_D_4:
+;NO_NEON-LABEL:test_D_4:
+;THUMB1-LABEL: test_D_4:
+;T1POST-LABEL: test_D_4:
+ define void @test_D_4() {
+;ARM: ldr r{{[0-9]+}}, [{{.*}}], #4
+;ARM: bne
+
+;THUMB2: ldr r{{[0-9]+}}, [{{.*}}], #4
+;THUMB2: bne
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: bne
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: bne
+
+;T1POST-NOT: ldr r{{[0-9]+}}, [{{.*}}], #4
+ entry:
+ %a = alloca %struct.D, align 4
+ call void @use_D(%struct.D* byval align 4 %a)
+ ret void
+ }
+;ARM-LABEL: test_D_8:
+;THUMB2-LABEL: test_D_8:
+;NO_NEON-LABEL:test_D_8:
+;THUMB1-LABEL: test_D_8:
+;T1POST-LABEL: test_D_8:
+ define void @test_D_8() {
+;ARM: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;ARM: bne
+
+;THUMB2: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2: bne
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: bne
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: bne
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.D, align 8
+ call void @use_D(%struct.D* byval align 8 %a)
+ ret void
+ }
+;ARM-LABEL: test_D_16:
+;THUMB2-LABEL: test_D_16:
+;NO_NEON-LABEL:test_D_16:
+;THUMB1-LABEL: test_D_16:
+;T1POST-LABEL: test_D_16:
+ define void @test_D_16() {
+;ARM: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;ARM: bne
+
+;THUMB2: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2: bne
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: bne
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: bne
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.D, align 16
+ call void @use_D(%struct.D* byval align 16 %a)
+ ret void
+ }
+;ARM-LABEL: test_E_1:
+;THUMB2-LABEL: test_E_1:
+;NO_NEON-LABEL:test_E_1:
+;THUMB1-LABEL: test_E_1:
+;T1POST-LABEL: test_E_1:
+ define void @test_E_1() {
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;ARM: bne
+
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;THUMB2: bne
+
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON: bne
+
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #1
+;THUMB1: bne
+
+;T1POST-NOT: ldrb r{{[0-9]+}}, [{{.*}}], #1
+ entry:
+ %a = alloca %struct.E, align 1
+ call void @use_E(%struct.E* byval align 1 %a)
+ ret void
+ }
+;ARM-LABEL: test_E_2:
+;THUMB2-LABEL: test_E_2:
+;NO_NEON-LABEL:test_E_2:
+;THUMB1-LABEL: test_E_2:
+;T1POST-LABEL: test_E_2:
+ define void @test_E_2() {
+;ARM: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;ARM: bne
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;THUMB2: bne
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;NO_NEON: bne
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #2
+;THUMB1: bne
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+
+;T1POST-NOT: ldrh r{{[0-9]+}}, [{{.*}}], #2
+ entry:
+ %a = alloca %struct.E, align 2
+ call void @use_E(%struct.E* byval align 2 %a)
+ ret void
+ }
+;ARM-LABEL: test_E_4:
+;THUMB2-LABEL: test_E_4:
+;NO_NEON-LABEL:test_E_4:
+;THUMB1-LABEL: test_E_4:
+;T1POST-LABEL: test_E_4:
+ define void @test_E_4() {
+;ARM: ldr r{{[0-9]+}}, [{{.*}}], #4
+;ARM: bne
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2: ldr r{{[0-9]+}}, [{{.*}}], #4
+;THUMB2: bne
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: bne
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: bne
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+
+;T1POST-NOT: ldr r{{[0-9]+}}, [{{.*}}], #4
+ entry:
+ %a = alloca %struct.E, align 4
+ call void @use_E(%struct.E* byval align 4 %a)
+ ret void
+ }
+;ARM-LABEL: test_E_8:
+;THUMB2-LABEL: test_E_8:
+;NO_NEON-LABEL:test_E_8:
+;THUMB1-LABEL: test_E_8:
+;T1POST-LABEL: test_E_8:
+ define void @test_E_8() {
+;ARM: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;ARM: bne
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2: bne
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: bne
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: bne
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.E, align 8
+ call void @use_E(%struct.E* byval align 8 %a)
+ ret void
+ }
+;ARM-LABEL: test_E_16:
+;THUMB2-LABEL: test_E_16:
+;NO_NEON-LABEL:test_E_16:
+;THUMB1-LABEL: test_E_16:
+;T1POST-LABEL: test_E_16:
+ define void @test_E_16() {
+;ARM: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;ARM: bne
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2: bne
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: bne
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: bne
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.E, align 16
+ call void @use_E(%struct.E* byval align 16 %a)
+ ret void
+ }
+;ARM-LABEL: test_F_1:
+;THUMB2-LABEL: test_F_1:
+;NO_NEON-LABEL:test_F_1:
+;THUMB1-LABEL: test_F_1:
+;T1POST-LABEL: test_F_1:
+ define void @test_F_1() {
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;ARM: bne
+
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;THUMB2: bne
+
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON: bne
+
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #1
+;THUMB1: bne
+
+;T1POST-NOT: ldrb r{{[0-9]+}}, [{{.*}}], #1
+ entry:
+ %a = alloca %struct.F, align 1
+ call void @use_F(%struct.F* byval align 1 %a)
+ ret void
+ }
+;ARM-LABEL: test_F_2:
+;THUMB2-LABEL: test_F_2:
+;NO_NEON-LABEL:test_F_2:
+;THUMB1-LABEL: test_F_2:
+;T1POST-LABEL: test_F_2:
+ define void @test_F_2() {
+;ARM: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;ARM: bne
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;THUMB2: bne
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;NO_NEON: bne
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #2
+;THUMB1: bne
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+
+;T1POST-NOT: ldrh r{{[0-9]+}}, [{{.*}}], #2
+ entry:
+ %a = alloca %struct.F, align 2
+ call void @use_F(%struct.F* byval align 2 %a)
+ ret void
+ }
+;ARM-LABEL: test_F_4:
+;THUMB2-LABEL: test_F_4:
+;NO_NEON-LABEL:test_F_4:
+;THUMB1-LABEL: test_F_4:
+;T1POST-LABEL: test_F_4:
+ define void @test_F_4() {
+;ARM: ldr r{{[0-9]+}}, [{{.*}}], #4
+;ARM: bne
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2: ldr r{{[0-9]+}}, [{{.*}}], #4
+;THUMB2: bne
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: bne
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: bne
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #1
+
+;T1POST-NOT: ldr r{{[0-9]+}}, [{{.*}}], #4
+ entry:
+ %a = alloca %struct.F, align 4
+ call void @use_F(%struct.F* byval align 4 %a)
+ ret void
+ }
+;ARM-LABEL: test_F_8:
+;THUMB2-LABEL: test_F_8:
+;NO_NEON-LABEL:test_F_8:
+;THUMB1-LABEL: test_F_8:
+;T1POST-LABEL: test_F_8:
+ define void @test_F_8() {
+;ARM: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;ARM: bne
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2: bne
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: bne
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: bne
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #1
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.F, align 8
+ call void @use_F(%struct.F* byval align 8 %a)
+ ret void
+ }
+;ARM-LABEL: test_F_16:
+;THUMB2-LABEL: test_F_16:
+;NO_NEON-LABEL:test_F_16:
+;THUMB1-LABEL: test_F_16:
+;T1POST-LABEL: test_F_16:
+ define void @test_F_16() {
+;ARM: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;ARM: bne
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2: bne
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: bne
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: bne
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #1
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.F, align 16
+ call void @use_F(%struct.F* byval align 16 %a)
+ ret void
+ }
+;ARM-LABEL: test_G_1:
+;THUMB2-LABEL: test_G_1:
+;NO_NEON-LABEL:test_G_1:
+;THUMB1-LABEL: test_G_1:
+;T1POST-LABEL: test_G_1:
+ define void @test_G_1() {
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #1
+
+;T1POST-NOT: ldrb r{{[0-9]+}}, [{{.*}}], #1
+ entry:
+ %a = alloca %struct.G, align 1
+ call void @use_G(%struct.G* byval align 1 %a)
+ ret void
+ }
+;ARM-LABEL: test_G_2:
+;THUMB2-LABEL: test_G_2:
+;NO_NEON-LABEL:test_G_2:
+;THUMB1-LABEL: test_G_2:
+;T1POST-LABEL: test_G_2:
+ define void @test_G_2() {
+;ARM: ldrh r{{[0-9]+}}, [{{.*}}], #2
+
+;THUMB2: ldrh r{{[0-9]+}}, [{{.*}}], #2
+
+;NO_NEON: ldrh r{{[0-9]+}}, [{{.*}}], #2
+
+;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #2
+
+;T1POST-NOT: ldrh r{{[0-9]+}}, [{{.*}}], #2
+ entry:
+ %a = alloca %struct.G, align 2
+ call void @use_G(%struct.G* byval align 2 %a)
+ ret void
+ }
+;ARM-LABEL: test_G_4:
+;THUMB2-LABEL: test_G_4:
+;NO_NEON-LABEL:test_G_4:
+;THUMB1-LABEL: test_G_4:
+;T1POST-LABEL: test_G_4:
+ define void @test_G_4() {
+;ARM: ldr r{{[0-9]+}}, [{{.*}}], #4
+
+;THUMB2: ldr r{{[0-9]+}}, [{{.*}}], #4
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+
+;T1POST-NOT: ldr r{{[0-9]+}}, [{{.*}}], #4
+ entry:
+ %a = alloca %struct.G, align 4
+ call void @use_G(%struct.G* byval align 4 %a)
+ ret void
+ }
+;ARM-LABEL: test_G_8:
+;THUMB2-LABEL: test_G_8:
+;NO_NEON-LABEL:test_G_8:
+;THUMB1-LABEL: test_G_8:
+;T1POST-LABEL: test_G_8:
+ define void @test_G_8() {
+;ARM: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB2: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.G, align 8
+ call void @use_G(%struct.G* byval align 8 %a)
+ ret void
+ }
+;ARM-LABEL: test_G_16:
+;THUMB2-LABEL: test_G_16:
+;NO_NEON-LABEL:test_G_16:
+;THUMB1-LABEL: test_G_16:
+;T1POST-LABEL: test_G_16:
+ define void @test_G_16() {
+;ARM: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB2: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.G, align 16
+ call void @use_G(%struct.G* byval align 16 %a)
+ ret void
+ }
+;ARM-LABEL: test_H_1:
+;THUMB2-LABEL: test_H_1:
+;NO_NEON-LABEL:test_H_1:
+;THUMB1-LABEL: test_H_1:
+;T1POST-LABEL: test_H_1:
+ define void @test_H_1() {
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #1
+
+;T1POST-NOT: ldrb r{{[0-9]+}}, [{{.*}}], #1
+ entry:
+ %a = alloca %struct.H, align 1
+ call void @use_H(%struct.H* byval align 1 %a)
+ ret void
+ }
+;ARM-LABEL: test_H_2:
+;THUMB2-LABEL: test_H_2:
+;NO_NEON-LABEL:test_H_2:
+;THUMB1-LABEL: test_H_2:
+;T1POST-LABEL: test_H_2:
+ define void @test_H_2() {
+;ARM: ldrh r{{[0-9]+}}, [{{.*}}], #2
+
+;THUMB2: ldrh r{{[0-9]+}}, [{{.*}}], #2
+
+;NO_NEON: ldrh r{{[0-9]+}}, [{{.*}}], #2
+
+;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #2
+
+;T1POST-NOT: ldrh r{{[0-9]+}}, [{{.*}}], #2
+ entry:
+ %a = alloca %struct.H, align 2
+ call void @use_H(%struct.H* byval align 2 %a)
+ ret void
+ }
+;ARM-LABEL: test_H_4:
+;THUMB2-LABEL: test_H_4:
+;NO_NEON-LABEL:test_H_4:
+;THUMB1-LABEL: test_H_4:
+;T1POST-LABEL: test_H_4:
+ define void @test_H_4() {
+;ARM: ldr r{{[0-9]+}}, [{{.*}}], #4
+
+;THUMB2: ldr r{{[0-9]+}}, [{{.*}}], #4
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+
+;T1POST-NOT: ldr r{{[0-9]+}}, [{{.*}}], #4
+ entry:
+ %a = alloca %struct.H, align 4
+ call void @use_H(%struct.H* byval align 4 %a)
+ ret void
+ }
+;ARM-LABEL: test_H_8:
+;THUMB2-LABEL: test_H_8:
+;NO_NEON-LABEL:test_H_8:
+;THUMB1-LABEL: test_H_8:
+;T1POST-LABEL: test_H_8:
+ define void @test_H_8() {
+;ARM: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB2: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.H, align 8
+ call void @use_H(%struct.H* byval align 8 %a)
+ ret void
+ }
+;ARM-LABEL: test_H_16:
+;THUMB2-LABEL: test_H_16:
+;NO_NEON-LABEL:test_H_16:
+;THUMB1-LABEL: test_H_16:
+;T1POST-LABEL: test_H_16:
+ define void @test_H_16() {
+;ARM: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB2: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.H, align 16
+ call void @use_H(%struct.H* byval align 16 %a)
+ ret void
+ }
+;ARM-LABEL: test_I_1:
+;THUMB2-LABEL: test_I_1:
+;NO_NEON-LABEL:test_I_1:
+;THUMB1-LABEL: test_I_1:
+;T1POST-LABEL: test_I_1:
+ define void @test_I_1() {
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #1
+
+;T1POST-NOT: ldrb r{{[0-9]+}}, [{{.*}}], #1
+ entry:
+ %a = alloca %struct.I, align 1
+ call void @use_I(%struct.I* byval align 1 %a)
+ ret void
+ }
+;ARM-LABEL: test_I_2:
+;THUMB2-LABEL: test_I_2:
+;NO_NEON-LABEL:test_I_2:
+;THUMB1-LABEL: test_I_2:
+;T1POST-LABEL: test_I_2:
+ define void @test_I_2() {
+;ARM: ldrh r{{[0-9]+}}, [{{.*}}], #2
+
+;THUMB2: ldrh r{{[0-9]+}}, [{{.*}}], #2
+
+;NO_NEON: ldrh r{{[0-9]+}}, [{{.*}}], #2
+
+;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #2
+
+;T1POST-NOT: ldrh r{{[0-9]+}}, [{{.*}}], #2
+ entry:
+ %a = alloca %struct.I, align 2
+ call void @use_I(%struct.I* byval align 2 %a)
+ ret void
+ }
+;ARM-LABEL: test_I_4:
+;THUMB2-LABEL: test_I_4:
+;NO_NEON-LABEL:test_I_4:
+;THUMB1-LABEL: test_I_4:
+;T1POST-LABEL: test_I_4:
+ define void @test_I_4() {
+;ARM: ldr r{{[0-9]+}}, [{{.*}}], #4
+
+;THUMB2: ldr r{{[0-9]+}}, [{{.*}}], #4
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+
+;T1POST-NOT: ldr r{{[0-9]+}}, [{{.*}}], #4
+ entry:
+ %a = alloca %struct.I, align 4
+ call void @use_I(%struct.I* byval align 4 %a)
+ ret void
+ }
+;ARM-LABEL: test_I_8:
+;THUMB2-LABEL: test_I_8:
+;NO_NEON-LABEL:test_I_8:
+;THUMB1-LABEL: test_I_8:
+;T1POST-LABEL: test_I_8:
+ define void @test_I_8() {
+;ARM: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB2: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.I, align 8
+ call void @use_I(%struct.I* byval align 8 %a)
+ ret void
+ }
+;ARM-LABEL: test_I_16:
+;THUMB2-LABEL: test_I_16:
+;NO_NEON-LABEL:test_I_16:
+;THUMB1-LABEL: test_I_16:
+;T1POST-LABEL: test_I_16:
+ define void @test_I_16() {
+;ARM: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB2: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.I, align 16
+ call void @use_I(%struct.I* byval align 16 %a)
+ ret void
+ }
+;ARM-LABEL: test_J_1:
+;THUMB2-LABEL: test_J_1:
+;NO_NEON-LABEL:test_J_1:
+;THUMB1-LABEL: test_J_1:
+;T1POST-LABEL: test_J_1:
+ define void @test_J_1() {
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;ARM: bne
+
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;THUMB2: bne
+
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON: bne
+
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #1
+;THUMB1: bne
+
+;T1POST-NOT: ldrb r{{[0-9]+}}, [{{.*}}], #1
+ entry:
+ %a = alloca %struct.J, align 1
+ call void @use_J(%struct.J* byval align 1 %a)
+ ret void
+ }
+;ARM-LABEL: test_J_2:
+;THUMB2-LABEL: test_J_2:
+;NO_NEON-LABEL:test_J_2:
+;THUMB1-LABEL: test_J_2:
+;T1POST-LABEL: test_J_2:
+ define void @test_J_2() {
+;ARM: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;ARM: bne
+
+;THUMB2: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;THUMB2: bne
+
+;NO_NEON: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;NO_NEON: bne
+
+;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #2
+;THUMB1: bne
+
+;T1POST-NOT: ldrh r{{[0-9]+}}, [{{.*}}], #2
+ entry:
+ %a = alloca %struct.J, align 2
+ call void @use_J(%struct.J* byval align 2 %a)
+ ret void
+ }
+;ARM-LABEL: test_J_4:
+;THUMB2-LABEL: test_J_4:
+;NO_NEON-LABEL:test_J_4:
+;THUMB1-LABEL: test_J_4:
+;T1POST-LABEL: test_J_4:
+ define void @test_J_4() {
+;ARM: ldr r{{[0-9]+}}, [{{.*}}], #4
+;ARM: bne
+
+;THUMB2: ldr r{{[0-9]+}}, [{{.*}}], #4
+;THUMB2: bne
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: bne
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: bne
+
+;T1POST-NOT: ldr r{{[0-9]+}}, [{{.*}}], #4
+ entry:
+ %a = alloca %struct.J, align 4
+ call void @use_J(%struct.J* byval align 4 %a)
+ ret void
+ }
+;ARM-LABEL: test_J_8:
+;THUMB2-LABEL: test_J_8:
+;NO_NEON-LABEL:test_J_8:
+;THUMB1-LABEL: test_J_8:
+;T1POST-LABEL: test_J_8:
+ define void @test_J_8() {
+;ARM: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;ARM: bne
+
+;THUMB2: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2: bne
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: bne
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: bne
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.J, align 8
+ call void @use_J(%struct.J* byval align 8 %a)
+ ret void
+ }
+;ARM-LABEL: test_J_16:
+;THUMB2-LABEL: test_J_16:
+;NO_NEON-LABEL:test_J_16:
+;THUMB1-LABEL: test_J_16:
+;T1POST-LABEL: test_J_16:
+ define void @test_J_16() {
+;ARM: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;ARM: bne
+
+;THUMB2: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2: bne
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: bne
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: bne
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.J, align 16
+ call void @use_J(%struct.J* byval align 16 %a)
+ ret void
+ }
+;ARM-LABEL: test_K_1:
+;THUMB2-LABEL: test_K_1:
+;NO_NEON-LABEL:test_K_1:
+;THUMB1-LABEL: test_K_1:
+;T1POST-LABEL: test_K_1:
+ define void @test_K_1() {
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;ARM: bne
+
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;THUMB2: bne
+
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON: bne
+
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #1
+;THUMB1: bne
+
+;T1POST-NOT: ldrb r{{[0-9]+}}, [{{.*}}], #1
+ entry:
+ %a = alloca %struct.K, align 1
+ call void @use_K(%struct.K* byval align 1 %a)
+ ret void
+ }
+;ARM-LABEL: test_K_2:
+;THUMB2-LABEL: test_K_2:
+;NO_NEON-LABEL:test_K_2:
+;THUMB1-LABEL: test_K_2:
+;T1POST-LABEL: test_K_2:
+ define void @test_K_2() {
+;ARM: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;ARM: bne
+
+;THUMB2: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;THUMB2: bne
+
+;NO_NEON: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;NO_NEON: bne
+
+;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #2
+;THUMB1: bne
+
+;T1POST-NOT: ldrh r{{[0-9]+}}, [{{.*}}], #2
+ entry:
+ %a = alloca %struct.K, align 2
+ call void @use_K(%struct.K* byval align 2 %a)
+ ret void
+ }
+;ARM-LABEL: test_K_4:
+;THUMB2-LABEL: test_K_4:
+;NO_NEON-LABEL:test_K_4:
+;THUMB1-LABEL: test_K_4:
+;T1POST-LABEL: test_K_4:
+ define void @test_K_4() {
+;ARM: ldr r{{[0-9]+}}, [{{.*}}], #4
+;ARM: bne
+
+;THUMB2: ldr r{{[0-9]+}}, [{{.*}}], #4
+;THUMB2: bne
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: bne
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: bne
+
+;T1POST-NOT: ldr r{{[0-9]+}}, [{{.*}}], #4
+ entry:
+ %a = alloca %struct.K, align 4
+ call void @use_K(%struct.K* byval align 4 %a)
+ ret void
+ }
+;ARM-LABEL: test_K_8:
+;THUMB2-LABEL: test_K_8:
+;NO_NEON-LABEL:test_K_8:
+;THUMB1-LABEL: test_K_8:
+;T1POST-LABEL: test_K_8:
+ define void @test_K_8() {
+;ARM: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;ARM: bne
+
+;THUMB2: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2: bne
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: bne
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: bne
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.K, align 8
+ call void @use_K(%struct.K* byval align 8 %a)
+ ret void
+ }
+;ARM-LABEL: test_K_16:
+;THUMB2-LABEL: test_K_16:
+;NO_NEON-LABEL:test_K_16:
+;THUMB1-LABEL: test_K_16:
+;T1POST-LABEL: test_K_16:
+ define void @test_K_16() {
+;ARM: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;ARM: bne
+
+;THUMB2: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2: bne
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: bne
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: bne
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.K, align 16
+ call void @use_K(%struct.K* byval align 16 %a)
+ ret void
+ }
+;ARM-LABEL: test_L_1:
+;THUMB2-LABEL: test_L_1:
+;NO_NEON-LABEL:test_L_1:
+;THUMB1-LABEL: test_L_1:
+;T1POST-LABEL: test_L_1:
+ define void @test_L_1() {
+;ARM: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;ARM: bne
+
+;THUMB2: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;THUMB2: bne
+
+;NO_NEON: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;NO_NEON: bne
+
+;THUMB1: ldrb r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #1
+;THUMB1: bne
+
+;T1POST-NOT: ldrb r{{[0-9]+}}, [{{.*}}], #1
+ entry:
+ %a = alloca %struct.L, align 1
+ call void @use_L(%struct.L* byval align 1 %a)
+ ret void
+ }
+;ARM-LABEL: test_L_2:
+;THUMB2-LABEL: test_L_2:
+;NO_NEON-LABEL:test_L_2:
+;THUMB1-LABEL: test_L_2:
+;T1POST-LABEL: test_L_2:
+ define void @test_L_2() {
+;ARM: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;ARM: bne
+
+;THUMB2: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;THUMB2: bne
+
+;NO_NEON: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;NO_NEON: bne
+
+;THUMB1: ldrh r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #2
+;THUMB1: bne
+
+;T1POST-NOT: ldrh r{{[0-9]+}}, [{{.*}}], #2
+ entry:
+ %a = alloca %struct.L, align 2
+ call void @use_L(%struct.L* byval align 2 %a)
+ ret void
+ }
+;ARM-LABEL: test_L_4:
+;THUMB2-LABEL: test_L_4:
+;NO_NEON-LABEL:test_L_4:
+;THUMB1-LABEL: test_L_4:
+;T1POST-LABEL: test_L_4:
+ define void @test_L_4() {
+;ARM: ldr r{{[0-9]+}}, [{{.*}}], #4
+;ARM: bne
+
+;THUMB2: ldr r{{[0-9]+}}, [{{.*}}], #4
+;THUMB2: bne
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: bne
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: bne
+
+;T1POST-NOT: ldr r{{[0-9]+}}, [{{.*}}], #4
+ entry:
+ %a = alloca %struct.L, align 4
+ call void @use_L(%struct.L* byval align 4 %a)
+ ret void
+ }
+;ARM-LABEL: test_L_8:
+;THUMB2-LABEL: test_L_8:
+;NO_NEON-LABEL:test_L_8:
+;THUMB1-LABEL: test_L_8:
+;T1POST-LABEL: test_L_8:
+ define void @test_L_8() {
+;ARM: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;ARM: bne
+
+;THUMB2: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2: bne
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: bne
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: bne
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.L, align 8
+ call void @use_L(%struct.L* byval align 8 %a)
+ ret void
+ }
+;ARM-LABEL: test_L_16:
+;THUMB2-LABEL: test_L_16:
+;NO_NEON-LABEL:test_L_16:
+;THUMB1-LABEL: test_L_16:
+;T1POST-LABEL: test_L_16:
+ define void @test_L_16() {
+;ARM: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;ARM: bne
+
+;THUMB2: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+;THUMB2: bne
+
+;NO_NEON: ldr r{{[0-9]+}}, [{{.*}}], #4
+;NO_NEON: bne
+;NO_NEON-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+
+;THUMB1: ldr r{{[0-9]+}}, {{\[}}[[BASE:r[0-9]+]]{{\]}}
+;THUMB1: adds [[BASE]], #4
+;THUMB1: bne
+
+;T1POST-NOT: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{.*}}]!
+ entry:
+ %a = alloca %struct.L, align 16
+ call void @use_L(%struct.L* byval align 16 %a)
+ ret void
+ }
diff --git a/test/CodeGen/ARM/sub-cmp-peephole.ll b/test/CodeGen/ARM/sub-cmp-peephole.ll
index 1b411e3..19727da 100644
--- a/test/CodeGen/ARM/sub-cmp-peephole.ll
+++ b/test/CodeGen/ARM/sub-cmp-peephole.ll
@@ -1,4 +1,7 @@
; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s --check-prefix=V7
+; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi | FileCheck %s -check-prefix=V8
+
define i32 @f(i32 %a, i32 %b) nounwind ssp {
entry:
@@ -84,3 +87,60 @@ land.lhs.true: ; preds = %num2long.exit
if.end11: ; preds = %num2long.exit
ret i32 23
}
+
+define float @float_sel(i32 %a, i32 %b, float %x, float %y) {
+entry:
+; CHECK-LABEL: float_sel:
+; CHECK-NOT: cmp
+; V8-LABEL: float_sel:
+; V8-NOT: cmp
+; V8: vseleq.f32
+ %sub = sub i32 %a, %b
+ %cmp = icmp eq i32 %sub, 0
+ %ret = select i1 %cmp, float %x, float %y
+ ret float %ret
+}
+
+define double @double_sel(i32 %a, i32 %b, double %x, double %y) {
+entry:
+; CHECK-LABEL: double_sel:
+; CHECK-NOT: cmp
+; V8-LABEL: double_sel:
+; V8-NOT: cmp
+; V8: vseleq.f64
+ %sub = sub i32 %a, %b
+ %cmp = icmp eq i32 %sub, 0
+ %ret = select i1 %cmp, double %x, double %y
+ ret double %ret
+}
+
+@t = common global i32 0
+define double @double_sub(i32 %a, i32 %b, double %x, double %y) {
+entry:
+; CHECK-LABEL: double_sub:
+; CHECK: subs
+; CHECK-NOT: cmp
+; V8-LABEL: double_sub:
+; V8: vsel
+ %cmp = icmp sgt i32 %a, %b
+ %sub = sub i32 %a, %b
+ store i32 %sub, i32* @t
+ %ret = select i1 %cmp, double %x, double %y
+ ret double %ret
+}
+
+define double @double_sub_swap(i32 %a, i32 %b, double %x, double %y) {
+entry:
+; V7-LABEL: double_sub_swap:
+; V7-NOT: cmp
+; V7: subs
+; V8-LABEL: double_sub_swap:
+; V8-NOT: subs
+; V8: cmp
+; V8: vsel
+ %cmp = icmp sgt i32 %a, %b
+ %sub = sub i32 %b, %a
+ %ret = select i1 %cmp, double %x, double %y
+ store i32 %sub, i32* @t
+ ret double %ret
+}
diff --git a/test/CodeGen/ARM/swift-vldm.ll b/test/CodeGen/ARM/swift-vldm.ll
new file mode 100644
index 0000000..67ae00a
--- /dev/null
+++ b/test/CodeGen/ARM/swift-vldm.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -mcpu=swift -mtriple=armv7s-apple-ios | FileCheck %s
+
+; Check that we avoid producing vldm instructions using d registers that
+; begin in the most-significant half of a q register. These require more
+; micro-ops on Swift and so aren't worth combining.
+
+; CHECK-LABEL: test_vldm
+; CHECK: vldmia r{{[0-9]+}}, {d2, d3, d4}
+; CHECK-NOT: vldmia r{{[0-9]+}}, {d1, d2, d3, d4}
+
+declare fastcc void @force_register(double %d0, double %d1, double %d2, double %d3, double %d4)
+
+define void @test_vldm(double* %x, double * %y) {
+entry:
+ %addr1 = getelementptr double * %x, i32 1
+ %addr2 = getelementptr double * %x, i32 2
+ %addr3 = getelementptr double * %x, i32 3
+ %d0 = load double * %y
+ %d1 = load double * %x
+ %d2 = load double * %addr1
+ %d3 = load double * %addr2
+ %d4 = load double * %addr3
+ ; We are trying to force x[0-3] into registers d1 to d4 so that we can test that we
+ ; don't form a "vldmia rX, {d1, d2, d3, d4}".
+ ; We are relying on the calling convention and on register allocation properly
+ ; coalescing registers.
+ call fastcc void @force_register(double %d0, double %d1, double %d2, double %d3, double %d4)
+ ret void
+}
diff --git a/test/CodeGen/ARM/thumb2-it-block.ll b/test/CodeGen/ARM/thumb2-it-block.ll
index a25352c..47c5dcc 100644
--- a/test/CodeGen/ARM/thumb2-it-block.ll
+++ b/test/CodeGen/ARM/thumb2-it-block.ll
@@ -1,14 +1,15 @@
; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv8 | FileCheck %s
; PR11107
define i32 @test(i32 %a, i32 %b) {
entry:
; CHECK: cmp
; CHECK-NEXT: it mi
-; CHECK-NEXT: rsbmi
+; CHECK-NEXT: rsb{{s?}}mi
; CHECK-NEXT: cmp
; CHECK-NEXT: it mi
-; CHECK-NEXT: rsbmi
+; CHECK-NEXT: rsb{{s?}}mi
%cmp1 = icmp slt i32 %a, 0
%sub1 = sub nsw i32 0, %a
%abs1 = select i1 %cmp1, i32 %sub1, i32 %a
diff --git a/test/CodeGen/ARM/trap.ll b/test/CodeGen/ARM/trap.ll
index db88a03..6cb26e3 100644
--- a/test/CodeGen/ARM/trap.ll
+++ b/test/CodeGen/ARM/trap.ll
@@ -9,13 +9,13 @@
; RUN: llc -mtriple=armv7 -mattr=+nacl-trap -filetype=obj %s -o - \
; RUN: | llvm-objdump -disassemble -triple armv7 -mattr=+nacl-trap - \
; RUN: | FileCheck %s -check-prefix=ENCODING-NACL
-; RUN: llc -fast-isel -mtriple=armv7-unknown-nacl -filetype=obj %s -o - \
+; RUN: llc -verify-machineinstrs -fast-isel -mtriple=armv7-unknown-nacl -filetype=obj %s -o - \
; RUN: | llvm-objdump -disassemble -triple armv7-unknown-nacl - \
; RUN: | FileCheck %s -check-prefix=ENCODING-NACL
; RUN: llc -mtriple=armv7 -filetype=obj %s -o - \
; RUN: | llvm-objdump -disassemble -triple armv7 - \
; RUN: | FileCheck %s -check-prefix=ENCODING-ALL
-; RUN: llc -fast-isel -mtriple=armv7 -filetype=obj %s -o - \
+; RUN: llc -verify-machineinstrs -fast-isel -mtriple=armv7 -filetype=obj %s -o - \
; RUN: | llvm-objdump -disassemble -triple armv7 - \
; RUN: | FileCheck %s -check-prefix=ENCODING-ALL
; rdar://7961298
diff --git a/test/CodeGen/ARM/vadd.ll b/test/CodeGen/ARM/vadd.ll
index a1ad37b..fcb5408 100644
--- a/test/CodeGen/ARM/vadd.ll
+++ b/test/CodeGen/ARM/vadd.ll
@@ -90,37 +90,6 @@ define <4 x float> @vaddQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
ret <4 x float> %tmp3
}
-define <8 x i8> @vaddhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: vaddhni16:
-;CHECK: vaddhn.i16
- %tmp1 = load <8 x i16>* %A
- %tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
- ret <8 x i8> %tmp3
-}
-
-define <4 x i16> @vaddhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: vaddhni32:
-;CHECK: vaddhn.i32
- %tmp1 = load <4 x i32>* %A
- %tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
- ret <4 x i16> %tmp3
-}
-
-define <2 x i32> @vaddhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
-;CHECK-LABEL: vaddhni64:
-;CHECK: vaddhn.i64
- %tmp1 = load <2 x i64>* %A
- %tmp2 = load <2 x i64>* %B
- %tmp3 = call <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
- ret <2 x i32> %tmp3
-}
-
-declare <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
-
define <8 x i8> @vraddhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vraddhni16:
;CHECK: vraddhn.i16
@@ -152,6 +121,33 @@ declare <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16>, <8 x i16>) nounwind rea
declare <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+define <8 x i8> @vaddhni16_natural(<8 x i16> %A, <8 x i16> %B) nounwind {
+; CHECK-LABEL: vaddhni16_natural:
+; CHECK: vaddhn.i16
+ %sum = add <8 x i16> %A, %B
+ %shift = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %trunc = trunc <8 x i16> %shift to <8 x i8>
+ ret <8 x i8> %trunc
+}
+
+define <4 x i16> @vaddhni32_natural(<4 x i32> %A, <4 x i32> %B) nounwind {
+; CHECK-LABEL: vaddhni32_natural:
+; CHECK: vaddhn.i32
+ %sum = add <4 x i32> %A, %B
+ %shift = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
+ %trunc = trunc <4 x i32> %shift to <4 x i16>
+ ret <4 x i16> %trunc
+}
+
+define <2 x i32> @vaddhni64_natural(<2 x i64> %A, <2 x i64> %B) nounwind {
+; CHECK-LABEL: vaddhni64_natural:
+; CHECK: vaddhn.i64
+ %sum = add <2 x i64> %A, %B
+ %shift = lshr <2 x i64> %sum, <i64 32, i64 32>
+ %trunc = trunc <2 x i64> %shift to <2 x i32>
+ ret <2 x i32> %trunc
+}
+
define <8 x i16> @vaddls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vaddls8:
;CHECK: vaddl.s8
diff --git a/test/CodeGen/ARM/vector-DAGCombine.ll b/test/CodeGen/ARM/vector-DAGCombine.ll
index 4221c98..759da22 100644
--- a/test/CodeGen/ARM/vector-DAGCombine.ll
+++ b/test/CodeGen/ARM/vector-DAGCombine.ll
@@ -29,7 +29,7 @@ entry:
; Radar 8407927: Make sure that VMOVRRD gets optimized away when the result is
; converted back to be used as a vector type.
-; CHECK: test_vmovrrd_combine
+; CHECK-LABEL: test_vmovrrd_combine:
define <4 x i32> @test_vmovrrd_combine() nounwind {
entry:
br i1 undef, label %bb1, label %bb2
@@ -136,7 +136,7 @@ define i16 @foldBuildVectors() {
; Test that we are generating vrev and vext for reverse shuffles of v8i16
; vectors.
-; CHECK: reverse_v8i16
+; CHECK-LABEL: reverse_v8i16:
define void @reverse_v8i16(<8 x i16>* %loadaddr, <8 x i16>* %storeaddr) {
%v0 = load <8 x i16>* %loadaddr
; CHECK: vrev64.16
@@ -149,7 +149,7 @@ define void @reverse_v8i16(<8 x i16>* %loadaddr, <8 x i16>* %storeaddr) {
; Test that we are generating vrev and vext for reverse shuffles of v16i8
; vectors.
-; CHECK: reverse_v16i8
+; CHECK-LABEL: reverse_v16i8:
define void @reverse_v16i8(<16 x i8>* %loadaddr, <16 x i8>* %storeaddr) {
%v0 = load <16 x i8>* %loadaddr
; CHECK: vrev64.8
@@ -165,7 +165,7 @@ define void @reverse_v16i8(<16 x i8>* %loadaddr, <16 x i8>* %storeaddr) {
; vldr cannot handle unaligned loads.
; Fall back to vld1.32, which can, instead of using the general purpose loads
; followed by a costly sequence of instructions to build the vector register.
-; CHECK: t3
+; CHECK-LABEL: t3:
; CHECK: vld1.32 {[[REG:d[0-9]+]][0]}
; CHECK: vld1.32 {[[REG]][1]}
; CHECK: vmull.u8 q{{[0-9]+}}, [[REG]], [[REG]]
@@ -188,7 +188,7 @@ declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>)
; Check that (insert_vector_elt (load)) => (vector_load).
; Thus, check that scalar_to_vector does not interfere with that.
define <8 x i16> @t4(i8* nocapture %sp0) {
-; CHECK: t4
+; CHECK-LABEL: t4:
; CHECK: vld1.32 {{{d[0-9]+}}[0]}, [r0]
entry:
%pix_sp0.0.cast = bitcast i8* %sp0 to i32*
@@ -202,7 +202,7 @@ entry:
; Make sure vector load is used for all three loads.
; Lowering to build vector was breaking the single use property of the load of
; %pix_sp0.0.copyload.
-; CHECK: t5
+; CHECK-LABEL: t5:
; CHECK: vld1.32 {[[REG1:d[0-9]+]][1]}, [r0]
; CHECK: vorr [[REG2:d[0-9]+]], [[REG1]], [[REG1]]
; CHECK: vld1.32 {[[REG1]][0]}, [r1]
@@ -224,3 +224,23 @@ entry:
%vmull.i = tail call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %0, <8 x i8> %1)
ret <8 x i16> %vmull.i
}
+
+; <rdar://problem/14989896> Make sure we manage to truncate a vector from an
+; illegal type to a legal type.
+define <2 x i8> @test_truncate(<2 x i128> %in) {
+; CHECK-LABEL: test_truncate:
+; CHECK: mov [[BASE:r[0-9]+]], sp
+; CHECK-NEXT: vld1.32 {[[REG1:d[0-9]+]][0]}, {{\[}}[[BASE]]:32]
+; CHECK-NEXT: add [[BASE2:r[0-9]+]], [[BASE]], #4
+; CHECK-NEXT: vld1.32 {[[REG1]][1]}, {{\[}}[[BASE2]]:32]
+; REG2 should map onto the same Q register as REG1, i.e., REG2 = REG1 - 1, but we
+; cannot express that.
+; CHECK-NEXT: vmov.32 [[REG2:d[0-9]+]][0], r0
+; CHECK-NEXT: vmov.32 [[REG2]][1], r1
+; The Q register used here should match floor(REG1/2), but we cannot express that.
+; CHECK-NEXT: vmovn.i64 [[RES:d[0-9]+]], q{{[0-9]+}}
+; CHECK-NEXT: vmov r0, r1, [[RES]]
+entry:
+ %res = trunc <2 x i128> %in to <2 x i8>
+ ret <2 x i8> %res
+}
diff --git a/test/CodeGen/ARM/vldm-liveness.ll b/test/CodeGen/ARM/vldm-liveness.ll
new file mode 100644
index 0000000..751f447
--- /dev/null
+++ b/test/CodeGen/ARM/vldm-liveness.ll
@@ -0,0 +1,40 @@
+; RUN: llc -mtriple thumbv7-apple-ios -verify-machineinstrs -o - %s | FileCheck %s
+
+; ARM load store optimizer was dealing with a sequence like:
+; s1 = VLDRS [r0, 1], Q0<imp-def>
+; s3 = VLDRS [r0, 2], Q0<imp-use,kill>, Q0<imp-def>
+; s0 = VLDRS [r0, 0], Q0<imp-use,kill>, Q0<imp-def>
+; s2 = VLDRS [r0, 4], Q0<imp-use,kill>, Q0<imp-def>
+;
+; It decided to combine the {s0, s1} loads into a single instruction in the
+; third position. However, this leaves the instruction defining s3 with a stray
+; imp-use of Q0, which is undefined.
+;
+; The verifier catches this, so this test just makes sure that appropriate
+; liveness flags are added.
+;
+; I believe the change will be tested as long as the vldmia is not the first of
+; the loads. Earlier optimisations may perturb the output over time, but
+; fiddling with the indices should be sufficient to restore the test.
+
+define arm_aapcs_vfpcc <4 x float> @foo(float* %ptr) {
+; CHECK-LABEL: foo:
+; CHECK: vldr s3, [r0, #8]
+; CHECK: vldmia r0, {s0, s1}
+; CHECK: vldr s2, [r0, #16]
+ %off0 = getelementptr float* %ptr, i32 0
+ %val0 = load float* %off0
+ %off1 = getelementptr float* %ptr, i32 1
+ %val1 = load float* %off1
+ %off4 = getelementptr float* %ptr, i32 4
+ %val4 = load float* %off4
+ %off2 = getelementptr float* %ptr, i32 2
+ %val2 = load float* %off2
+
+ %vec1 = insertelement <4 x float> undef, float %val0, i32 0
+ %vec2 = insertelement <4 x float> %vec1, float %val1, i32 1
+ %vec3 = insertelement <4 x float> %vec2, float %val4, i32 2
+ %vec4 = insertelement <4 x float> %vec3, float %val2, i32 3
+
+ ret <4 x float> %vec4
+}
diff --git a/test/CodeGen/ARM/vldm-sched-a9.ll b/test/CodeGen/ARM/vldm-sched-a9.ll
new file mode 100644
index 0000000..d0a9ac6
--- /dev/null
+++ b/test/CodeGen/ARM/vldm-sched-a9.ll
@@ -0,0 +1,71 @@
+; RUN: llc < %s -march=arm -mtriple=armv7-linux-gnueabihf -float-abi=hard -mcpu=cortex-a9 -O3 | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32-S64"
+
+; This test will generate spills/fills using vldmia instructions that access 64 bytes of memory.
+; Check that we don't crash when we generate these instructions on Cortex-A9.
+
+; CHECK: test:
+; CHECK: vstmia
+; CHECK: vldmia
+define void @test(i64* %src) #0 {
+entry:
+ %arrayidx39 = getelementptr inbounds i64* %src, i32 13
+ %vecinit285 = shufflevector <16 x i64> undef, <16 x i64> <i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+ store <16 x i64> %vecinit285, <16 x i64>* undef, align 128
+ %0 = load i64* undef, align 8
+ %vecinit379 = insertelement <16 x i64> undef, i64 %0, i32 9
+ %1 = load i64* undef, align 8
+ %vecinit419 = insertelement <16 x i64> undef, i64 %1, i32 15
+ store <16 x i64> %vecinit419, <16 x i64>* undef, align 128
+ %vecinit579 = insertelement <16 x i64> undef, i64 0, i32 4
+ %vecinit582 = shufflevector <16 x i64> %vecinit579, <16 x i64> <i64 6, i64 7, i64 8, i64 9, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %vecinit584 = insertelement <16 x i64> %vecinit582, i64 undef, i32 9
+ %vecinit586 = insertelement <16 x i64> %vecinit584, i64 0, i32 10
+ %vecinit589 = shufflevector <16 x i64> %vecinit586, <16 x i64> <i64 12, i64 13, i64 14, i64 15, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 16, i32 17, i32 18, i32 19, i32 undef>
+ %2 = load i64* undef, align 8
+ %vecinit591 = insertelement <16 x i64> %vecinit589, i64 %2, i32 15
+ store <16 x i64> %vecinit591, <16 x i64>* undef, align 128
+ %vecinit694 = shufflevector <16 x i64> undef, <16 x i64> <i64 13, i64 14, i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+ store <16 x i64> %vecinit694, <16 x i64>* undef, align 128
+ %3 = load i64* undef, align 8
+ %vecinit1331 = insertelement <16 x i64> undef, i64 %3, i32 14
+ %4 = load i64* undef, align 8
+ %vecinit1468 = insertelement <16 x i64> undef, i64 %4, i32 11
+ %vecinit1471 = shufflevector <16 x i64> %vecinit1468, <16 x i64> <i64 13, i64 14, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 undef, i32 undef>
+ %vecinit1474 = shufflevector <16 x i64> %vecinit1471, <16 x i64> <i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+ store <16 x i64> %vecinit1474, <16 x i64>* undef, align 128
+ %vecinit1552 = shufflevector <16 x i64> undef, <16 x i64> <i64 10, i64 11, i64 12, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 18, i32 undef, i32 undef, i32 undef, i32 undef>
+ %vecinit1555 = shufflevector <16 x i64> %vecinit1552, <16 x i64> <i64 13, i64 14, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 undef, i32 undef>
+ %vecinit1558 = shufflevector <16 x i64> %vecinit1555, <16 x i64> <i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+ store <16 x i64> %vecinit1558, <16 x i64>* undef, align 128
+ %vecinit1591 = shufflevector <16 x i64> undef, <16 x i64> <i64 3, i64 4, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %vecinit1594 = shufflevector <16 x i64> %vecinit1591, <16 x i64> <i64 5, i64 6, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %vecinit1597 = shufflevector <16 x i64> %vecinit1594, <16 x i64> <i64 7, i64 8, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %vecinit1599 = insertelement <16 x i64> %vecinit1597, i64 undef, i32 8
+ %vecinit1601 = insertelement <16 x i64> %vecinit1599, i64 undef, i32 9
+ %vecinit1603 = insertelement <16 x i64> %vecinit1601, i64 undef, i32 10
+ %5 = load i64* undef, align 8
+ %vecinit1605 = insertelement <16 x i64> %vecinit1603, i64 %5, i32 11
+ %vecinit1608 = shufflevector <16 x i64> %vecinit1605, <16 x i64> <i64 13, i64 14, i64 15, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 undef>
+ %6 = load i64* undef, align 8
+ %vecinit1610 = insertelement <16 x i64> %vecinit1608, i64 %6, i32 15
+ store <16 x i64> %vecinit1610, <16 x i64>* undef, align 128
+ %vecinit2226 = shufflevector <16 x i64> undef, <16 x i64> <i64 6, i64 7, i64 8, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 16, i32 17, i32 18, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %7 = load i64* undef, align 8
+ %vecinit2228 = insertelement <16 x i64> %vecinit2226, i64 %7, i32 8
+ %vecinit2230 = insertelement <16 x i64> %vecinit2228, i64 undef, i32 9
+ %vecinit2233 = shufflevector <16 x i64> %vecinit2230, <16 x i64> <i64 11, i64 12, i64 13, i64 14, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 16, i32 17, i32 18, i32 19, i32 undef, i32 undef>
+ %vecinit2236 = shufflevector <16 x i64> %vecinit2233, <16 x i64> <i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+ store <16 x i64> %vecinit2236, <16 x i64>* undef, align 128
+ %vecinit2246 = shufflevector <16 x i64> undef, <16 x i64> <i64 4, i64 5, i64 6, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 16, i32 17, i32 18, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %vecinit2249 = shufflevector <16 x i64> %vecinit2246, <16 x i64> <i64 7, i64 8, i64 9, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 16, i32 17, i32 18, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %vecinit2252 = shufflevector <16 x i64> %vecinit2249, <16 x i64> <i64 10, i64 11, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %vecinit2255 = shufflevector <16 x i64> %vecinit2252, <16 x i64> <i64 12, i64 13, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 16, i32 17, i32 undef, i32 undef, i32 undef>
+ %8 = load i64* %arrayidx39, align 8
+ %vecinit2257 = insertelement <16 x i64> %vecinit2255, i64 %8, i32 13
+ %vecinit2260 = shufflevector <16 x i64> %vecinit2257, <16 x i64> <i64 15, i64 16, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 16, i32 17>
+ store <16 x i64> %vecinit2260, <16 x i64>* null, align 128
+ ret void
+}
+attributes #0 = { noredzone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/ARM/vminmaxnm.ll b/test/CodeGen/ARM/vminmaxnm.ll
index afa73b9..f6ce64c 100644
--- a/test/CodeGen/ARM/vminmaxnm.ll
+++ b/test/CodeGen/ARM/vminmaxnm.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple armv8 -mattr=+neon | FileCheck %s
+; RUN: llc < %s -mtriple armv8 -mattr=+neon,+fp-armv8 -enable-unsafe-fp-math | FileCheck %s --check-prefix=CHECK-FAST
define <4 x float> @vmaxnmq(<4 x float>* %A, <4 x float>* %B) nounwind {
; CHECK: vmaxnmq
@@ -36,6 +37,51 @@ define <2 x float> @vminnmd(<2 x float>* %A, <2 x float>* %B) nounwind {
ret <2 x float> %tmp3
}
+define float @fp-armv8_vminnm_o(float %a, float %b) {
+; CHECK-FAST: fp-armv8_vminnm_o
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vminnm.f32
+; CHECK: fp-armv8_vminnm_o
+; CHECK-NOT: vminnm.f32
+ %cmp = fcmp olt float %a, %b
+ %cond = select i1 %cmp, float %a, float %b
+ ret float %cond
+}
+
+define float @fp-armv8_vminnm_u(float %a, float %b) {
+; CHECK-FAST: fp-armv8_vminnm_u
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vminnm.f32
+; CHECK: fp-armv8_vminnm_u
+; CHECK-NOT: vminnm.f32
+ %cmp = fcmp ult float %a, %b
+ %cond = select i1 %cmp, float %a, float %b
+ ret float %cond
+}
+
+define float @fp-armv8_vmaxnm_o(float %a, float %b) {
+; CHECK-FAST: fp-armv8_vmaxnm_o
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vmaxnm.f32
+; CHECK: fp-armv8_vmaxnm_o
+; CHECK-NOT: vmaxnm.f32
+ %cmp = fcmp ogt float %a, %b
+ %cond = select i1 %cmp, float %a, float %b
+ ret float %cond
+}
+
+define float @fp-armv8_vmaxnm_u(float %a, float %b) {
+; CHECK-FAST: fp-armv8_vmaxnm_u
+; CHECK-FAST-NOT: vcmp
+; CHECK-FAST: vmaxnm.f32
+; CHECK: fp-armv8_vmaxnm_u
+; CHECK-NOT: vmaxnm.f32
+ %cmp = fcmp ugt float %a, %b
+ %cond = select i1 %cmp, float %a, float %b
+ ret float %cond
+}
+
+
declare <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float>, <4 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll
index 6210ad3..de329ac 100644
--- a/test/CodeGen/ARM/vmul.ll
+++ b/test/CodeGen/ARM/vmul.ll
@@ -515,6 +515,17 @@ entry:
ret void
}
+define <8 x i8> @no_distribute(<8 x i8> %a, <8 x i8> %b) nounwind {
+entry:
+; CHECK: no_distribute
+; CHECK: vadd.i8
+; CHECK: vmul.i8
+; CHECK-NOT: vmla.i8
+ %0 = add <8 x i8> %a, %b
+ %1 = mul <8 x i8> %0, %0
+ ret <8 x i8> %1
+}
+
; If one operand has a zero-extend and the other a sign-extend, vmull
; cannot be used.
define i16 @vmullWithInconsistentExtensions(<8 x i8> %vec) {
@@ -623,3 +634,21 @@ entry:
store <4 x i32> %predphi290.v.i, <4 x i32>* undef, align 4
ret void
}
+
+define void @foo(<4 x float> * %a, <4 x float>* nocapture %dst, float* nocapture readonly %src) nounwind {
+; Look for doing a normal scalar FP load rather than a to-all-lanes load.
+; e.g., "ldr s0, [r2]" rather than "vld1.32 {d18[], d19[]}, [r2:32]"
+; Then check that the vector multiply has folded the splat to all lanes
+; and used a vector * scalar instruction.
+; CHECK: vldr {{s[0-9]+}}, [r2]
+; CHECK: vmul.f32 q8, q8, d0[0]
+ %tmp = load float* %src, align 4
+ %tmp5 = load <4 x float>* %a, align 4
+ %tmp6 = insertelement <4 x float> undef, float %tmp, i32 0
+ %tmp7 = insertelement <4 x float> %tmp6, float %tmp, i32 1
+ %tmp8 = insertelement <4 x float> %tmp7, float %tmp, i32 2
+ %tmp9 = insertelement <4 x float> %tmp8, float %tmp, i32 3
+ %tmp10 = fmul <4 x float> %tmp9, %tmp5
+ store <4 x float> %tmp10, <4 x float>* %dst, align 4
+ ret void
+}
diff --git a/test/CodeGen/ARM/vqdmul.ll b/test/CodeGen/ARM/vqdmul.ll
index a28cae9..d298167 100644
--- a/test/CodeGen/ARM/vqdmul.ll
+++ b/test/CodeGen/ARM/vqdmul.ll
@@ -197,84 +197,92 @@ entry:
declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
-define <4 x i32> @vqdmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
-;CHECK-LABEL: vqdmlals16:
+define <4 x i32> @vqdmlals16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+;CHECK-LABEL: vqdmlals16_natural:
;CHECK: vqdmlal.s16
- %tmp1 = load <4 x i32>* %A
- %tmp2 = load <4 x i16>* %B
- %tmp3 = load <4 x i16>* %C
- %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
- ret <4 x i32> %tmp4
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp2, <4 x i16> %tmp3)
+ %tmp5 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4)
+ ret <4 x i32> %tmp5
}
-define <2 x i64> @vqdmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
-;CHECK-LABEL: vqdmlals32:
+define <2 x i64> @vqdmlals32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+;CHECK-LABEL: vqdmlals32_natural:
;CHECK: vqdmlal.s32
- %tmp1 = load <2 x i64>* %A
- %tmp2 = load <2 x i32>* %B
- %tmp3 = load <2 x i32>* %C
- %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
- ret <2 x i64> %tmp4
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp2, <2 x i32> %tmp3)
+ %tmp5 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4)
+ ret <2 x i64> %tmp5
}
-define arm_aapcs_vfpcc <4 x i32> @test_vqdmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
+define arm_aapcs_vfpcc <4 x i32> @test_vqdmlal_lanes16_natural(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
entry:
-; CHECK: test_vqdmlal_lanes16
+; CHECK-LABEL: test_vqdmlal_lanes16_natural:
; CHECK: vqdmlal.s16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
- %1 = tail call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %1
+ %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %0)
+ %2 = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1)
+ ret <4 x i32> %2
}
-define arm_aapcs_vfpcc <2 x i64> @test_vqdmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
+define arm_aapcs_vfpcc <2 x i64> @test_vqdmlal_lanes32_natural(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
entry:
-; CHECK: test_vqdmlal_lanes32
+; CHECK-LABEL: test_vqdmlal_lanes32_natural:
; CHECK: vqdmlal.s32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
- %1 = tail call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %1
+ %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %0)
+ %2 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1)
+ ret <2 x i64> %2
}
-declare <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
-define <4 x i32> @vqdmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
-;CHECK-LABEL: vqdmlsls16:
+define <4 x i32> @vqdmlsls16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind {
+;CHECK-LABEL: vqdmlsls16_natural:
;CHECK: vqdmlsl.s16
- %tmp1 = load <4 x i32>* %A
- %tmp2 = load <4 x i16>* %B
- %tmp3 = load <4 x i16>* %C
- %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3)
- ret <4 x i32> %tmp4
+ %tmp1 = load <4 x i32>* %A
+ %tmp2 = load <4 x i16>* %B
+ %tmp3 = load <4 x i16>* %C
+ %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp2, <4 x i16> %tmp3)
+ %tmp5 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4)
+ ret <4 x i32> %tmp5
}
-define <2 x i64> @vqdmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
-;CHECK-LABEL: vqdmlsls32:
+define <2 x i64> @vqdmlsls32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind {
+;CHECK-LABEL: vqdmlsls32_natural:
;CHECK: vqdmlsl.s32
- %tmp1 = load <2 x i64>* %A
- %tmp2 = load <2 x i32>* %B
- %tmp3 = load <2 x i32>* %C
- %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3)
- ret <2 x i64> %tmp4
+ %tmp1 = load <2 x i64>* %A
+ %tmp2 = load <2 x i32>* %B
+ %tmp3 = load <2 x i32>* %C
+ %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp2, <2 x i32> %tmp3)
+ %tmp5 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4)
+ ret <2 x i64> %tmp5
}
-define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsl_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
+define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsl_lanes16_natural(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone {
entry:
-; CHECK: test_vqdmlsl_lanes16
+; CHECK-LABEL: test_vqdmlsl_lanes16_natural:
; CHECK: vqdmlsl.s16 q0, d2, d3[1]
%0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
- %1 = tail call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %1
+ %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %0)
+ %2 = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1)
+ ret <4 x i32> %2
}
-define arm_aapcs_vfpcc <2 x i64> @test_vqdmlsl_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
+define arm_aapcs_vfpcc <2 x i64> @test_vqdmlsl_lanes32_natural(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone {
entry:
-; CHECK: test_vqdmlsl_lanes32
+; CHECK-LABEL: test_vqdmlsl_lanes32_natural:
; CHECK: vqdmlsl.s32 q0, d2, d3[1]
%0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
- %1 = tail call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
- ret <2 x i64> %1
+ %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %0)
+ %2 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1)
+ ret <2 x i64> %2
}
-declare <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vsel.ll b/test/CodeGen/ARM/vsel.ll
new file mode 100644
index 0000000..7e1f714
--- /dev/null
+++ b/test/CodeGen/ARM/vsel.ll
@@ -0,0 +1,309 @@
+; RUN: llc < %s -mtriple=armv8-linux-gnueabihf -mattr=+fp-armv8 -float-abi=hard | FileCheck %s
+@varfloat = global float 0.0
+@vardouble = global double 0.0
+define void @test_vsel32sgt(i32 %lhs32, i32 %rhs32, float %a, float %b) {
+; CHECK: test_vsel32sgt
+ %tst1 = icmp sgt i32 %lhs32, %rhs32
+ %val1 = select i1 %tst1, float %a, float %b
+ store float %val1, float* @varfloat
+; CHECK: cmp r0, r1
+; CHECK: vselgt.f32 s0, s0, s1
+ ret void
+}
+define void @test_vsel64sgt(i32 %lhs32, i32 %rhs32, double %a, double %b) {
+; CHECK: test_vsel64sgt
+ %tst1 = icmp sgt i32 %lhs32, %rhs32
+ %val1 = select i1 %tst1, double %a, double %b
+ store double %val1, double* @vardouble
+; CHECK: cmp r0, r1
+; CHECK: vselgt.f64 d16, d0, d1
+ ret void
+}
+define void @test_vsel32sge(i32 %lhs32, i32 %rhs32, float %a, float %b) {
+; CHECK: test_vsel32sge
+ %tst1 = icmp sge i32 %lhs32, %rhs32
+ %val1 = select i1 %tst1, float %a, float %b
+ store float %val1, float* @varfloat
+; CHECK: cmp r0, r1
+; CHECK: vselge.f32 s0, s0, s1
+ ret void
+}
+define void @test_vsel64sge(i32 %lhs32, i32 %rhs32, double %a, double %b) {
+; CHECK: test_vsel64sge
+ %tst1 = icmp sge i32 %lhs32, %rhs32
+ %val1 = select i1 %tst1, double %a, double %b
+ store double %val1, double* @vardouble
+; CHECK: cmp r0, r1
+; CHECK: vselge.f64 d16, d0, d1
+ ret void
+}
+define void @test_vsel32eq(i32 %lhs32, i32 %rhs32, float %a, float %b) {
+; CHECK: test_vsel32eq
+ %tst1 = icmp eq i32 %lhs32, %rhs32
+ %val1 = select i1 %tst1, float %a, float %b
+ store float %val1, float* @varfloat
+; CHECK: cmp r0, r1
+; CHECK: vseleq.f32 s0, s0, s1
+ ret void
+}
+define void @test_vsel64eq(i32 %lhs32, i32 %rhs32, double %a, double %b) {
+; CHECK: test_vsel64eq
+ %tst1 = icmp eq i32 %lhs32, %rhs32
+ %val1 = select i1 %tst1, double %a, double %b
+ store double %val1, double* @vardouble
+; CHECK: cmp r0, r1
+; CHECK: vseleq.f64 d16, d0, d1
+ ret void
+}
+define void @test_vsel32slt(i32 %lhs32, i32 %rhs32, float %a, float %b) {
+; CHECK: test_vsel32slt
+ %tst1 = icmp slt i32 %lhs32, %rhs32
+ %val1 = select i1 %tst1, float %a, float %b
+ store float %val1, float* @varfloat
+; CHECK: cmp r0, r1
+; CHECK: vselgt.f32 s0, s1, s0
+ ret void
+}
+define void @test_vsel64slt(i32 %lhs32, i32 %rhs32, double %a, double %b) {
+; CHECK: test_vsel64slt
+ %tst1 = icmp slt i32 %lhs32, %rhs32
+ %val1 = select i1 %tst1, double %a, double %b
+ store double %val1, double* @vardouble
+; CHECK: cmp r0, r1
+; CHECK: vselgt.f64 d16, d1, d0
+ ret void
+}
+define void @test_vsel32sle(i32 %lhs32, i32 %rhs32, float %a, float %b) {
+; CHECK: test_vsel32sle
+ %tst1 = icmp sle i32 %lhs32, %rhs32
+ %val1 = select i1 %tst1, float %a, float %b
+ store float %val1, float* @varfloat
+; CHECK: cmp r0, r1
+; CHECK: vselge.f32 s0, s1, s0
+ ret void
+}
+define void @test_vsel64sle(i32 %lhs32, i32 %rhs32, double %a, double %b) {
+; CHECK: test_vsel64sle
+ %tst1 = icmp sle i32 %lhs32, %rhs32
+ %val1 = select i1 %tst1, double %a, double %b
+ store double %val1, double* @vardouble
+; CHECK: cmp r0, r1
+; CHECK: vselge.f64 d16, d1, d0
+ ret void
+}
+define void @test_vsel32ogt(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32ogt
+ %tst1 = fcmp ogt float %lhs32, %rhs32
+ %val1 = select i1 %tst1, float %a, float %b
+ store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselgt.f32 s0, s2, s3
+ ret void
+}
+define void @test_vsel64ogt(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64ogt
+ %tst1 = fcmp ogt float %lhs32, %rhs32
+ %val1 = select i1 %tst1, double %a, double %b
+ store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselgt.f64 d16, d1, d2
+ ret void
+}
+define void @test_vsel32oge(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32oge
+ %tst1 = fcmp oge float %lhs32, %rhs32
+ %val1 = select i1 %tst1, float %a, float %b
+ store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselge.f32 s0, s2, s3
+ ret void
+}
+define void @test_vsel64oge(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64oge
+ %tst1 = fcmp oge float %lhs32, %rhs32
+ %val1 = select i1 %tst1, double %a, double %b
+ store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselge.f64 d16, d1, d2
+ ret void
+}
+define void @test_vsel32oeq(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32oeq
+ %tst1 = fcmp oeq float %lhs32, %rhs32
+ %val1 = select i1 %tst1, float %a, float %b
+ store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vseleq.f32 s0, s2, s3
+ ret void
+}
+define void @test_vsel64oeq(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64oeq
+ %tst1 = fcmp oeq float %lhs32, %rhs32
+ %val1 = select i1 %tst1, double %a, double %b
+ store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vseleq.f64 d16, d1, d2
+ ret void
+}
+define void @test_vsel32ugt(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32ugt
+ %tst1 = fcmp ugt float %lhs32, %rhs32
+ %val1 = select i1 %tst1, float %a, float %b
+ store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselge.f32 s0, s3, s2
+ ret void
+}
+define void @test_vsel64ugt(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64ugt
+ %tst1 = fcmp ugt float %lhs32, %rhs32
+ %val1 = select i1 %tst1, double %a, double %b
+ store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselge.f64 d16, d2, d1
+ ret void
+}
+define void @test_vsel32uge(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32uge
+ %tst1 = fcmp uge float %lhs32, %rhs32
+ %val1 = select i1 %tst1, float %a, float %b
+ store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselgt.f32 s0, s3, s2
+ ret void
+}
+define void @test_vsel64uge(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64uge
+ %tst1 = fcmp uge float %lhs32, %rhs32
+ %val1 = select i1 %tst1, double %a, double %b
+ store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselgt.f64 d16, d2, d1
+ ret void
+}
+define void @test_vsel32olt(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32olt
+ %tst1 = fcmp olt float %lhs32, %rhs32
+ %val1 = select i1 %tst1, float %a, float %b
+ store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselgt.f32 s0, s2, s3
+ ret void
+}
+define void @test_vsel64olt(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64olt
+ %tst1 = fcmp olt float %lhs32, %rhs32
+ %val1 = select i1 %tst1, double %a, double %b
+ store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselgt.f64 d16, d1, d2
+ ret void
+}
+define void @test_vsel32ult(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32ult
+ %tst1 = fcmp ult float %lhs32, %rhs32
+ %val1 = select i1 %tst1, float %a, float %b
+ store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselge.f32 s0, s3, s2
+ ret void
+}
+define void @test_vsel64ult(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64ult
+ %tst1 = fcmp ult float %lhs32, %rhs32
+ %val1 = select i1 %tst1, double %a, double %b
+ store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselge.f64 d16, d2, d1
+ ret void
+}
+define void @test_vsel32ole(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32ole
+ %tst1 = fcmp ole float %lhs32, %rhs32
+ %val1 = select i1 %tst1, float %a, float %b
+ store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselge.f32 s0, s2, s3
+ ret void
+}
+define void @test_vsel64ole(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64ole
+ %tst1 = fcmp ole float %lhs32, %rhs32
+ %val1 = select i1 %tst1, double %a, double %b
+ store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s1, s0
+; CHECK: vselge.f64 d16, d1, d2
+ ret void
+}
+define void @test_vsel32ule(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32ule
+ %tst1 = fcmp ule float %lhs32, %rhs32
+ %val1 = select i1 %tst1, float %a, float %b
+ store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselgt.f32 s0, s3, s2
+ ret void
+}
+define void @test_vsel64ule(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64ule
+ %tst1 = fcmp ule float %lhs32, %rhs32
+ %val1 = select i1 %tst1, double %a, double %b
+ store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselgt.f64 d16, d2, d1
+ ret void
+}
+define void @test_vsel32ord(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32ord
+ %tst1 = fcmp ord float %lhs32, %rhs32
+ %val1 = select i1 %tst1, float %a, float %b
+ store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselvs.f32 s0, s3, s2
+ ret void
+}
+define void @test_vsel64ord(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64ord
+ %tst1 = fcmp ord float %lhs32, %rhs32
+ %val1 = select i1 %tst1, double %a, double %b
+ store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselvs.f64 d16, d2, d1
+ ret void
+}
+define void @test_vsel32une(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32une
+ %tst1 = fcmp une float %lhs32, %rhs32
+ %val1 = select i1 %tst1, float %a, float %b
+ store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vseleq.f32 s0, s3, s2
+ ret void
+}
+define void @test_vsel64une(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64une
+ %tst1 = fcmp une float %lhs32, %rhs32
+ %val1 = select i1 %tst1, double %a, double %b
+ store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vseleq.f64 d16, d2, d1
+ ret void
+}
+define void @test_vsel32uno(float %lhs32, float %rhs32, float %a, float %b) {
+; CHECK: test_vsel32uno
+ %tst1 = fcmp uno float %lhs32, %rhs32
+ %val1 = select i1 %tst1, float %a, float %b
+ store float %val1, float* @varfloat
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselvs.f32 s0, s2, s3
+ ret void
+}
+define void @test_vsel64uno(float %lhs32, float %rhs32, double %a, double %b) {
+; CHECK: test_vsel64uno
+ %tst1 = fcmp uno float %lhs32, %rhs32
+ %val1 = select i1 %tst1, double %a, double %b
+ store double %val1, double* @vardouble
+; CHECK: vcmpe.f32 s0, s1
+; CHECK: vselvs.f64 d16, d1, d2
+ ret void
+}
diff --git a/test/CodeGen/ARM/vstlane.ll b/test/CodeGen/ARM/vstlane.ll
index 651b6d5..34c5c70 100644
--- a/test/CodeGen/ARM/vstlane.ll
+++ b/test/CodeGen/ARM/vstlane.ll
@@ -13,7 +13,7 @@ define void @vst1lanei8(i8* %A, <8 x i8>* %B) nounwind {
;Check for a post-increment updating store.
define void @vst1lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
;CHECK-LABEL: vst1lanei8_update:
-;CHECK: vst1.8 {d16[3]}, [r2]!
+;CHECK: vst1.8 {d16[3]}, [{{r[0-9]}}]!
%A = load i8** %ptr
%tmp1 = load <8 x i8>* %B
%tmp2 = extractelement <8 x i8> %tmp1, i32 3
diff --git a/test/CodeGen/ARM/vsub.ll b/test/CodeGen/ARM/vsub.ll
index 89c3095..6b95b97 100644
--- a/test/CodeGen/ARM/vsub.ll
+++ b/test/CodeGen/ARM/vsub.ll
@@ -90,37 +90,33 @@ define <4 x float> @vsubQf32(<4 x float>* %A, <4 x float>* %B) nounwind {
ret <4 x float> %tmp3
}
-define <8 x i8> @vsubhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: vsubhni16:
-;CHECK: vsubhn.i16
- %tmp1 = load <8 x i16>* %A
- %tmp2 = load <8 x i16>* %B
- %tmp3 = call <8 x i8> @llvm.arm.neon.vsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
- ret <8 x i8> %tmp3
+define <8 x i8> @vsubhni16_natural(<8 x i16> %A, <8 x i16> %B) nounwind {
+; CHECK-LABEL: vsubhni16_natural:
+; CHECK: vsubhn.i16
+ %sum = sub <8 x i16> %A, %B
+ %shift = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ %trunc = trunc <8 x i16> %shift to <8 x i8>
+ ret <8 x i8> %trunc
}
-define <4 x i16> @vsubhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: vsubhni32:
-;CHECK: vsubhn.i32
- %tmp1 = load <4 x i32>* %A
- %tmp2 = load <4 x i32>* %B
- %tmp3 = call <4 x i16> @llvm.arm.neon.vsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
- ret <4 x i16> %tmp3
+define <4 x i16> @vsubhni32_natural(<4 x i32> %A, <4 x i32> %B) nounwind {
+; CHECK-LABEL: vsubhni32_natural:
+; CHECK: vsubhn.i32
+ %sum = sub <4 x i32> %A, %B
+ %shift = lshr <4 x i32> %sum, <i32 16, i32 16, i32 16, i32 16>
+ %trunc = trunc <4 x i32> %shift to <4 x i16>
+ ret <4 x i16> %trunc
}
-define <2 x i32> @vsubhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind {
-;CHECK-LABEL: vsubhni64:
-;CHECK: vsubhn.i64
- %tmp1 = load <2 x i64>* %A
- %tmp2 = load <2 x i64>* %B
- %tmp3 = call <2 x i32> @llvm.arm.neon.vsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
- ret <2 x i32> %tmp3
+define <2 x i32> @vsubhni64_natural(<2 x i64> %A, <2 x i64> %B) nounwind {
+; CHECK-LABEL: vsubhni64_natural:
+; CHECK: vsubhn.i64
+ %sum = sub <2 x i64> %A, %B
+ %shift = lshr <2 x i64> %sum, <i64 32, i64 32>
+ %trunc = trunc <2 x i64> %shift to <2 x i32>
+ ret <2 x i32> %trunc
}
-declare <8 x i8> @llvm.arm.neon.vsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i16> @llvm.arm.neon.vsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i32> @llvm.arm.neon.vsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
-
define <8 x i8> @vrsubhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
;CHECK-LABEL: vrsubhni16:
;CHECK: vrsubhn.i16
diff --git a/test/CodeGen/CPP/lit.local.cfg b/test/CodeGen/CPP/lit.local.cfg
index 4d4b4a4..4063dd1 100644
--- a/test/CodeGen/CPP/lit.local.cfg
+++ b/test/CodeGen/CPP/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
targets = set(config.root.targets_to_build.split())
if not 'CppBackend' in targets:
config.unsupported = True
diff --git a/test/CodeGen/Generic/2009-03-17-LSR-APInt.ll b/test/CodeGen/Generic/2009-03-17-LSR-APInt.ll
index 6281ada..3f17ce1 100644
--- a/test/CodeGen/Generic/2009-03-17-LSR-APInt.ll
+++ b/test/CodeGen/Generic/2009-03-17-LSR-APInt.ll
@@ -63,30 +63,58 @@ bb47: ; preds = %bb46, %bb44
br label %bb44
}
-declare i32 @pthread_once(i32*, void ()*)
+define i32 @pthread_once(i32*, void ()*) {
+ ret i32 0
+}
-declare i8* @pthread_getspecific(i32)
+define i8* @pthread_getspecific(i32) {
+ ret i8* null
+}
-declare i32 @pthread_setspecific(i32, i8*)
+define i32 @pthread_setspecific(i32, i8*) {
+ ret i32 0
+}
-declare i32 @pthread_create(i64*, %struct.pthread_attr_t*, i8* (i8*)*, i8*)
+define i32 @pthread_create(i64*, %struct.pthread_attr_t*, i8* (i8*)*, i8*) {
+ ret i32 0
+}
-declare i32 @pthread_cancel(i64)
+define i32 @pthread_cancel(i64) {
+ ret i32 0
+}
-declare i32 @pthread_mutex_lock(%struct.pthread_mutex_t*)
+define i32 @pthread_mutex_lock(%struct.pthread_mutex_t*) {
+ ret i32 0
+}
-declare i32 @pthread_mutex_trylock(%struct.pthread_mutex_t*)
+define i32 @pthread_mutex_trylock(%struct.pthread_mutex_t*) {
+ ret i32 0
+}
-declare i32 @pthread_mutex_unlock(%struct.pthread_mutex_t*)
+define i32 @pthread_mutex_unlock(%struct.pthread_mutex_t*) {
+ ret i32 0
+}
-declare i32 @pthread_mutex_init(%struct.pthread_mutex_t*, %struct.Alignment*)
+define i32 @pthread_mutex_init(%struct.pthread_mutex_t*, %struct.Alignment*) {
+ ret i32 0
+}
-declare i32 @pthread_key_create(i32*, void (i8*)*)
+define i32 @pthread_key_create(i32*, void (i8*)*) {
+ ret i32 0
+}
-declare i32 @pthread_key_delete(i32)
+define i32 @pthread_key_delete(i32) {
+ ret i32 0
+}
-declare i32 @pthread_mutexattr_init(%struct.Alignment*)
+define i32 @pthread_mutexattr_init(%struct.Alignment*) {
+ ret i32 0
+}
-declare i32 @pthread_mutexattr_settype(%struct.Alignment*, i32)
+define i32 @pthread_mutexattr_settype(%struct.Alignment*, i32) {
+ ret i32 0
+}
-declare i32 @pthread_mutexattr_destroy(%struct.Alignment*)
+define i32 @pthread_mutexattr_destroy(%struct.Alignment*) {
+ ret i32 0
+}
diff --git a/test/CodeGen/Generic/crash.ll b/test/CodeGen/Generic/crash.ll
index d3fc204..8de6b0d 100644
--- a/test/CodeGen/Generic/crash.ll
+++ b/test/CodeGen/Generic/crash.ll
@@ -23,7 +23,7 @@ bb32: ; preds = %bb6
%3 = load double* %1, align 4
%4 = load double* %0, align 4
call void @Parse_Vector(double* %0) nounwind
-%5 = call i32 @llvm.objectsize.i32(i8* undef, i1 false)
+%5 = call i32 @llvm.objectsize.i32.p0i8(i8* undef, i1 false)
%6 = icmp eq i32 %5, -1
br i1 %6, label %bb34, label %bb33
@@ -36,7 +36,7 @@ unreachable
}
declare void @Parse_Vector(double*)
-declare i32 @llvm.objectsize.i32(i8*, i1)
+declare i32 @llvm.objectsize.i32.p0i8(i8*, i1)
; PR9578
diff --git a/test/CodeGen/Generic/lit.local.cfg b/test/CodeGen/Generic/lit.local.cfg
deleted file mode 100644
index 19eebc0..0000000
--- a/test/CodeGen/Generic/lit.local.cfg
+++ /dev/null
@@ -1 +0,0 @@
-config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/CodeGen/Hexagon/BranchPredict.ll b/test/CodeGen/Hexagon/BranchPredict.ll
index 716e85d..4ab1966 100644
--- a/test/CodeGen/Hexagon/BranchPredict.ll
+++ b/test/CodeGen/Hexagon/BranchPredict.ll
@@ -53,7 +53,7 @@ return: ; preds = %if.else, %if.then
define i32 @foo_bar(i32 %a, i16 signext %b) nounwind {
; CHECK: if{{ *}}(!cmp.eq(r{{[0-9]*}}.new, #0)) jump:nt
entry:
- %0 = load i32* @j, align 4, !tbaa !2
+ %0 = load i32* @j, align 4
%tobool = icmp eq i32 %0, 0
br i1 %tobool, label %if.else, label %if.then, !prof !0
@@ -74,6 +74,3 @@ return: ; preds = %if.else, %if.then
!0 = metadata !{metadata !"branch_weights", i32 64, i32 4}
!1 = metadata !{metadata !"branch_weights", i32 4, i32 64}
-!2 = metadata !{metadata !"int", metadata !3}
-!3 = metadata !{metadata !"omnipotent char", metadata !4}
-!4 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/Hexagon/combine_ir.ll b/test/CodeGen/Hexagon/combine_ir.ll
index 8b99ef7..e100cf7 100644
--- a/test/CodeGen/Hexagon/combine_ir.ll
+++ b/test/CodeGen/Hexagon/combine_ir.ll
@@ -4,7 +4,7 @@
define void @word(i32* nocapture %a) nounwind {
entry:
- %0 = load i32* %a, align 4, !tbaa !0
+ %0 = load i32* %a, align 4
%1 = zext i32 %0 to i64
tail call void @bar(i64 %1) nounwind
ret void
@@ -17,10 +17,10 @@ declare void @bar(i64)
define void @halfword(i16* nocapture %a) nounwind {
entry:
- %0 = load i16* %a, align 2, !tbaa !3
+ %0 = load i16* %a, align 2
%1 = zext i16 %0 to i64
%add.ptr = getelementptr inbounds i16* %a, i32 1
- %2 = load i16* %add.ptr, align 2, !tbaa !3
+ %2 = load i16* %add.ptr, align 2
%3 = zext i16 %2 to i64
%4 = shl nuw nsw i64 %3, 16
%ins = or i64 %4, %1
@@ -33,18 +33,13 @@ entry:
define void @byte(i8* nocapture %a) nounwind {
entry:
- %0 = load i8* %a, align 1, !tbaa !1
+ %0 = load i8* %a, align 1
%1 = zext i8 %0 to i64
%add.ptr = getelementptr inbounds i8* %a, i32 1
- %2 = load i8* %add.ptr, align 1, !tbaa !1
+ %2 = load i8* %add.ptr, align 1
%3 = zext i8 %2 to i64
%4 = shl nuw nsw i64 %3, 8
%ins = or i64 %4, %1
tail call void @bar(i64 %ins) nounwind
ret void
}
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"short", metadata !1}
diff --git a/test/CodeGen/Hexagon/hwloop-dbg.ll b/test/CodeGen/Hexagon/hwloop-dbg.ll
index fce6d19..bfdd813 100644
--- a/test/CodeGen/Hexagon/hwloop-dbg.ll
+++ b/test/CodeGen/Hexagon/hwloop-dbg.ll
@@ -35,13 +35,14 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!29}
!0 = metadata !{i32 786449, metadata !28, i32 12, metadata !"QuIC LLVM Hexagon Clang version 6.1-pre-unknown, (git://git-hexagon-aus.quicinc.com/llvm/clang-mainline.git e9382867661454cdf44addb39430741578e9765c) (llvm/llvm-mainline.git 36412bb1fcf03ed426d4437b41198bae066675ac)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] [/usr2/kparzysz/s.hex/t/hwloop-dbg.c] [DW_LANG_C99]
!2 = metadata !{i32 0}
!3 = metadata !{metadata !5}
!5 = metadata !{i32 786478, metadata !28, null, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (i32*, i32*)* @foo, null, null, metadata !11, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
!6 = metadata !{i32 786473, metadata !28} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{i32 786453, i32 0, null, i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!8 = metadata !{null, metadata !9, metadata !9}
!9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 32, align 32, offset 0] [from int]
!10 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
@@ -60,3 +61,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!26 = metadata !{i32 3, i32 23, metadata !20, null}
!27 = metadata !{i32 6, i32 1, metadata !16, null}
!28 = metadata !{metadata !"hwloop-dbg.c", metadata !"/usr2/kparzysz/s.hex/t"}
+!29 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/Hexagon/lit.local.cfg b/test/CodeGen/Hexagon/lit.local.cfg
index 24324b2..e96bab8 100644
--- a/test/CodeGen/Hexagon/lit.local.cfg
+++ b/test/CodeGen/Hexagon/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
targets = set(config.root.targets_to_build.split())
if not 'Hexagon' in targets:
config.unsupported = True
diff --git a/test/CodeGen/Hexagon/memops.ll b/test/CodeGen/Hexagon/memops.ll
index 5498848..fca1a73 100644
--- a/test/CodeGen/Hexagon/memops.ll
+++ b/test/CodeGen/Hexagon/memops.ll
@@ -4,11 +4,11 @@
define void @memop_unsigned_char_add5(i8* nocapture %p) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
- %0 = load i8* %p, align 1, !tbaa !0
+ %0 = load i8* %p, align 1
%conv = zext i8 %0 to i32
%add = add nsw i32 %conv, 5
%conv1 = trunc i32 %add to i8
- store i8 %conv1, i8* %p, align 1, !tbaa !0
+ store i8 %conv1, i8* %p, align 1
ret void
}
@@ -16,11 +16,11 @@ define void @memop_unsigned_char_add(i8* nocapture %p, i8 zeroext %x) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
%conv = zext i8 %x to i32
- %0 = load i8* %p, align 1, !tbaa !0
+ %0 = load i8* %p, align 1
%conv1 = zext i8 %0 to i32
%add = add nsw i32 %conv1, %conv
%conv2 = trunc i32 %add to i8
- store i8 %conv2, i8* %p, align 1, !tbaa !0
+ store i8 %conv2, i8* %p, align 1
ret void
}
@@ -28,51 +28,51 @@ define void @memop_unsigned_char_sub(i8* nocapture %p, i8 zeroext %x) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
%conv = zext i8 %x to i32
- %0 = load i8* %p, align 1, !tbaa !0
+ %0 = load i8* %p, align 1
%conv1 = zext i8 %0 to i32
%sub = sub nsw i32 %conv1, %conv
%conv2 = trunc i32 %sub to i8
- store i8 %conv2, i8* %p, align 1, !tbaa !0
+ store i8 %conv2, i8* %p, align 1
ret void
}
define void @memop_unsigned_char_or(i8* nocapture %p, i8 zeroext %x) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
- %0 = load i8* %p, align 1, !tbaa !0
+ %0 = load i8* %p, align 1
%or3 = or i8 %0, %x
- store i8 %or3, i8* %p, align 1, !tbaa !0
+ store i8 %or3, i8* %p, align 1
ret void
}
define void @memop_unsigned_char_and(i8* nocapture %p, i8 zeroext %x) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
- %0 = load i8* %p, align 1, !tbaa !0
+ %0 = load i8* %p, align 1
%and3 = and i8 %0, %x
- store i8 %and3, i8* %p, align 1, !tbaa !0
+ store i8 %and3, i8* %p, align 1
ret void
}
define void @memop_unsigned_char_clrbit(i8* nocapture %p) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
- %0 = load i8* %p, align 1, !tbaa !0
+ %0 = load i8* %p, align 1
%conv = zext i8 %0 to i32
%and = and i32 %conv, 223
%conv1 = trunc i32 %and to i8
- store i8 %conv1, i8* %p, align 1, !tbaa !0
+ store i8 %conv1, i8* %p, align 1
ret void
}
define void @memop_unsigned_char_setbit(i8* nocapture %p) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
- %0 = load i8* %p, align 1, !tbaa !0
+ %0 = load i8* %p, align 1
%conv = zext i8 %0 to i32
%or = or i32 %conv, 128
%conv1 = trunc i32 %or to i8
- store i8 %conv1, i8* %p, align 1, !tbaa !0
+ store i8 %conv1, i8* %p, align 1
ret void
}
@@ -80,11 +80,11 @@ define void @memop_unsigned_char_add5_index(i8* nocapture %p, i32 %i) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i8* %p, i32 %i
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%conv = zext i8 %0 to i32
%add = add nsw i32 %conv, 5
%conv1 = trunc i32 %add to i8
- store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %conv1, i8* %add.ptr, align 1
ret void
}
@@ -93,11 +93,11 @@ entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
%conv = zext i8 %x to i32
%add.ptr = getelementptr inbounds i8* %p, i32 %i
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%conv1 = zext i8 %0 to i32
%add = add nsw i32 %conv1, %conv
%conv2 = trunc i32 %add to i8
- store i8 %conv2, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %conv2, i8* %add.ptr, align 1
ret void
}
@@ -106,11 +106,11 @@ entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
%conv = zext i8 %x to i32
%add.ptr = getelementptr inbounds i8* %p, i32 %i
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%conv1 = zext i8 %0 to i32
%sub = sub nsw i32 %conv1, %conv
%conv2 = trunc i32 %sub to i8
- store i8 %conv2, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %conv2, i8* %add.ptr, align 1
ret void
}
@@ -118,9 +118,9 @@ define void @memop_unsigned_char_or_index(i8* nocapture %p, i32 %i, i8 zeroext %
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i8* %p, i32 %i
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%or3 = or i8 %0, %x
- store i8 %or3, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %or3, i8* %add.ptr, align 1
ret void
}
@@ -128,9 +128,9 @@ define void @memop_unsigned_char_and_index(i8* nocapture %p, i32 %i, i8 zeroext
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i8* %p, i32 %i
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%and3 = and i8 %0, %x
- store i8 %and3, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %and3, i8* %add.ptr, align 1
ret void
}
@@ -138,11 +138,11 @@ define void @memop_unsigned_char_clrbit_index(i8* nocapture %p, i32 %i) nounwind
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i8* %p, i32 %i
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%conv = zext i8 %0 to i32
%and = and i32 %conv, 223
%conv1 = trunc i32 %and to i8
- store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %conv1, i8* %add.ptr, align 1
ret void
}
@@ -150,11 +150,11 @@ define void @memop_unsigned_char_setbit_index(i8* nocapture %p, i32 %i) nounwind
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i8* %p, i32 %i
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%conv = zext i8 %0 to i32
%or = or i32 %conv, 128
%conv1 = trunc i32 %or to i8
- store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %conv1, i8* %add.ptr, align 1
ret void
}
@@ -162,11 +162,11 @@ define void @memop_unsigned_char_add5_index5(i8* nocapture %p) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i8* %p, i32 5
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%conv = zext i8 %0 to i32
%add = add nsw i32 %conv, 5
%conv1 = trunc i32 %add to i8
- store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %conv1, i8* %add.ptr, align 1
ret void
}
@@ -175,11 +175,11 @@ entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}+={{ *}}r{{[0-9]+}}
%conv = zext i8 %x to i32
%add.ptr = getelementptr inbounds i8* %p, i32 5
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%conv1 = zext i8 %0 to i32
%add = add nsw i32 %conv1, %conv
%conv2 = trunc i32 %add to i8
- store i8 %conv2, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %conv2, i8* %add.ptr, align 1
ret void
}
@@ -188,11 +188,11 @@ entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}-={{ *}}r{{[0-9]+}}
%conv = zext i8 %x to i32
%add.ptr = getelementptr inbounds i8* %p, i32 5
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%conv1 = zext i8 %0 to i32
%sub = sub nsw i32 %conv1, %conv
%conv2 = trunc i32 %sub to i8
- store i8 %conv2, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %conv2, i8* %add.ptr, align 1
ret void
}
@@ -200,9 +200,9 @@ define void @memop_unsigned_char_or_index5(i8* nocapture %p, i8 zeroext %x) noun
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i8* %p, i32 5
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%or3 = or i8 %0, %x
- store i8 %or3, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %or3, i8* %add.ptr, align 1
ret void
}
@@ -210,9 +210,9 @@ define void @memop_unsigned_char_and_index5(i8* nocapture %p, i8 zeroext %x) nou
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i8* %p, i32 5
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%and3 = and i8 %0, %x
- store i8 %and3, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %and3, i8* %add.ptr, align 1
ret void
}
@@ -220,11 +220,11 @@ define void @memop_unsigned_char_clrbit_index5(i8* nocapture %p) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i8* %p, i32 5
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%conv = zext i8 %0 to i32
%and = and i32 %conv, 223
%conv1 = trunc i32 %and to i8
- store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %conv1, i8* %add.ptr, align 1
ret void
}
@@ -232,22 +232,22 @@ define void @memop_unsigned_char_setbit_index5(i8* nocapture %p) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i8* %p, i32 5
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%conv = zext i8 %0 to i32
%or = or i32 %conv, 128
%conv1 = trunc i32 %or to i8
- store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %conv1, i8* %add.ptr, align 1
ret void
}
define void @memop_signed_char_add5(i8* nocapture %p) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
- %0 = load i8* %p, align 1, !tbaa !0
+ %0 = load i8* %p, align 1
%conv2 = zext i8 %0 to i32
%add = add nsw i32 %conv2, 5
%conv1 = trunc i32 %add to i8
- store i8 %conv1, i8* %p, align 1, !tbaa !0
+ store i8 %conv1, i8* %p, align 1
ret void
}
@@ -255,11 +255,11 @@ define void @memop_signed_char_add(i8* nocapture %p, i8 signext %x) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
%conv4 = zext i8 %x to i32
- %0 = load i8* %p, align 1, !tbaa !0
+ %0 = load i8* %p, align 1
%conv13 = zext i8 %0 to i32
%add = add nsw i32 %conv13, %conv4
%conv2 = trunc i32 %add to i8
- store i8 %conv2, i8* %p, align 1, !tbaa !0
+ store i8 %conv2, i8* %p, align 1
ret void
}
@@ -267,51 +267,51 @@ define void @memop_signed_char_sub(i8* nocapture %p, i8 signext %x) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
%conv4 = zext i8 %x to i32
- %0 = load i8* %p, align 1, !tbaa !0
+ %0 = load i8* %p, align 1
%conv13 = zext i8 %0 to i32
%sub = sub nsw i32 %conv13, %conv4
%conv2 = trunc i32 %sub to i8
- store i8 %conv2, i8* %p, align 1, !tbaa !0
+ store i8 %conv2, i8* %p, align 1
ret void
}
define void @memop_signed_char_or(i8* nocapture %p, i8 signext %x) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
- %0 = load i8* %p, align 1, !tbaa !0
+ %0 = load i8* %p, align 1
%or3 = or i8 %0, %x
- store i8 %or3, i8* %p, align 1, !tbaa !0
+ store i8 %or3, i8* %p, align 1
ret void
}
define void @memop_signed_char_and(i8* nocapture %p, i8 signext %x) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
- %0 = load i8* %p, align 1, !tbaa !0
+ %0 = load i8* %p, align 1
%and3 = and i8 %0, %x
- store i8 %and3, i8* %p, align 1, !tbaa !0
+ store i8 %and3, i8* %p, align 1
ret void
}
define void @memop_signed_char_clrbit(i8* nocapture %p) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
- %0 = load i8* %p, align 1, !tbaa !0
+ %0 = load i8* %p, align 1
%conv2 = zext i8 %0 to i32
%and = and i32 %conv2, 223
%conv1 = trunc i32 %and to i8
- store i8 %conv1, i8* %p, align 1, !tbaa !0
+ store i8 %conv1, i8* %p, align 1
ret void
}
define void @memop_signed_char_setbit(i8* nocapture %p) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
- %0 = load i8* %p, align 1, !tbaa !0
+ %0 = load i8* %p, align 1
%conv2 = zext i8 %0 to i32
%or = or i32 %conv2, 128
%conv1 = trunc i32 %or to i8
- store i8 %conv1, i8* %p, align 1, !tbaa !0
+ store i8 %conv1, i8* %p, align 1
ret void
}
@@ -319,11 +319,11 @@ define void @memop_signed_char_add5_index(i8* nocapture %p, i32 %i) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i8* %p, i32 %i
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%conv2 = zext i8 %0 to i32
%add = add nsw i32 %conv2, 5
%conv1 = trunc i32 %add to i8
- store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %conv1, i8* %add.ptr, align 1
ret void
}
@@ -332,11 +332,11 @@ entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
%conv4 = zext i8 %x to i32
%add.ptr = getelementptr inbounds i8* %p, i32 %i
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%conv13 = zext i8 %0 to i32
%add = add nsw i32 %conv13, %conv4
%conv2 = trunc i32 %add to i8
- store i8 %conv2, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %conv2, i8* %add.ptr, align 1
ret void
}
@@ -345,11 +345,11 @@ entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
%conv4 = zext i8 %x to i32
%add.ptr = getelementptr inbounds i8* %p, i32 %i
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%conv13 = zext i8 %0 to i32
%sub = sub nsw i32 %conv13, %conv4
%conv2 = trunc i32 %sub to i8
- store i8 %conv2, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %conv2, i8* %add.ptr, align 1
ret void
}
@@ -357,9 +357,9 @@ define void @memop_signed_char_or_index(i8* nocapture %p, i32 %i, i8 signext %x)
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i8* %p, i32 %i
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%or3 = or i8 %0, %x
- store i8 %or3, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %or3, i8* %add.ptr, align 1
ret void
}
@@ -367,9 +367,9 @@ define void @memop_signed_char_and_index(i8* nocapture %p, i32 %i, i8 signext %x
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i8* %p, i32 %i
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%and3 = and i8 %0, %x
- store i8 %and3, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %and3, i8* %add.ptr, align 1
ret void
}
@@ -377,11 +377,11 @@ define void @memop_signed_char_clrbit_index(i8* nocapture %p, i32 %i) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i8* %p, i32 %i
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%conv2 = zext i8 %0 to i32
%and = and i32 %conv2, 223
%conv1 = trunc i32 %and to i8
- store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %conv1, i8* %add.ptr, align 1
ret void
}
@@ -389,11 +389,11 @@ define void @memop_signed_char_setbit_index(i8* nocapture %p, i32 %i) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i8* %p, i32 %i
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%conv2 = zext i8 %0 to i32
%or = or i32 %conv2, 128
%conv1 = trunc i32 %or to i8
- store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %conv1, i8* %add.ptr, align 1
ret void
}
@@ -401,11 +401,11 @@ define void @memop_signed_char_add5_index5(i8* nocapture %p) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i8* %p, i32 5
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%conv2 = zext i8 %0 to i32
%add = add nsw i32 %conv2, 5
%conv1 = trunc i32 %add to i8
- store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %conv1, i8* %add.ptr, align 1
ret void
}
@@ -414,11 +414,11 @@ entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}+={{ *}}r{{[0-9]+}}
%conv4 = zext i8 %x to i32
%add.ptr = getelementptr inbounds i8* %p, i32 5
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%conv13 = zext i8 %0 to i32
%add = add nsw i32 %conv13, %conv4
%conv2 = trunc i32 %add to i8
- store i8 %conv2, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %conv2, i8* %add.ptr, align 1
ret void
}
@@ -427,11 +427,11 @@ entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}-={{ *}}r{{[0-9]+}}
%conv4 = zext i8 %x to i32
%add.ptr = getelementptr inbounds i8* %p, i32 5
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%conv13 = zext i8 %0 to i32
%sub = sub nsw i32 %conv13, %conv4
%conv2 = trunc i32 %sub to i8
- store i8 %conv2, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %conv2, i8* %add.ptr, align 1
ret void
}
@@ -439,9 +439,9 @@ define void @memop_signed_char_or_index5(i8* nocapture %p, i8 signext %x) nounwi
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i8* %p, i32 5
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%or3 = or i8 %0, %x
- store i8 %or3, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %or3, i8* %add.ptr, align 1
ret void
}
@@ -449,9 +449,9 @@ define void @memop_signed_char_and_index5(i8* nocapture %p, i8 signext %x) nounw
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i8* %p, i32 5
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%and3 = and i8 %0, %x
- store i8 %and3, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %and3, i8* %add.ptr, align 1
ret void
}
@@ -459,11 +459,11 @@ define void @memop_signed_char_clrbit_index5(i8* nocapture %p) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i8* %p, i32 5
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%conv2 = zext i8 %0 to i32
%and = and i32 %conv2, 223
%conv1 = trunc i32 %and to i8
- store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %conv1, i8* %add.ptr, align 1
ret void
}
@@ -471,22 +471,22 @@ define void @memop_signed_char_setbit_index5(i8* nocapture %p) nounwind {
entry:
; CHECK: memb(r{{[0-9]+}}{{ *}}+{{ *}}#5){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i8* %p, i32 5
- %0 = load i8* %add.ptr, align 1, !tbaa !0
+ %0 = load i8* %add.ptr, align 1
%conv2 = zext i8 %0 to i32
%or = or i32 %conv2, 128
%conv1 = trunc i32 %or to i8
- store i8 %conv1, i8* %add.ptr, align 1, !tbaa !0
+ store i8 %conv1, i8* %add.ptr, align 1
ret void
}
define void @memop_unsigned_short_add5(i16* nocapture %p) nounwind {
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
- %0 = load i16* %p, align 2, !tbaa !2
+ %0 = load i16* %p, align 2
%conv = zext i16 %0 to i32
%add = add nsw i32 %conv, 5
%conv1 = trunc i32 %add to i16
- store i16 %conv1, i16* %p, align 2, !tbaa !2
+ store i16 %conv1, i16* %p, align 2
ret void
}
@@ -494,11 +494,11 @@ define void @memop_unsigned_short_add(i16* nocapture %p, i16 zeroext %x) nounwin
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
%conv = zext i16 %x to i32
- %0 = load i16* %p, align 2, !tbaa !2
+ %0 = load i16* %p, align 2
%conv1 = zext i16 %0 to i32
%add = add nsw i32 %conv1, %conv
%conv2 = trunc i32 %add to i16
- store i16 %conv2, i16* %p, align 2, !tbaa !2
+ store i16 %conv2, i16* %p, align 2
ret void
}
@@ -506,51 +506,51 @@ define void @memop_unsigned_short_sub(i16* nocapture %p, i16 zeroext %x) nounwin
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
%conv = zext i16 %x to i32
- %0 = load i16* %p, align 2, !tbaa !2
+ %0 = load i16* %p, align 2
%conv1 = zext i16 %0 to i32
%sub = sub nsw i32 %conv1, %conv
%conv2 = trunc i32 %sub to i16
- store i16 %conv2, i16* %p, align 2, !tbaa !2
+ store i16 %conv2, i16* %p, align 2
ret void
}
define void @memop_unsigned_short_or(i16* nocapture %p, i16 zeroext %x) nounwind {
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
- %0 = load i16* %p, align 2, !tbaa !2
+ %0 = load i16* %p, align 2
%or3 = or i16 %0, %x
- store i16 %or3, i16* %p, align 2, !tbaa !2
+ store i16 %or3, i16* %p, align 2
ret void
}
define void @memop_unsigned_short_and(i16* nocapture %p, i16 zeroext %x) nounwind {
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
- %0 = load i16* %p, align 2, !tbaa !2
+ %0 = load i16* %p, align 2
%and3 = and i16 %0, %x
- store i16 %and3, i16* %p, align 2, !tbaa !2
+ store i16 %and3, i16* %p, align 2
ret void
}
define void @memop_unsigned_short_clrbit(i16* nocapture %p) nounwind {
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
- %0 = load i16* %p, align 2, !tbaa !2
+ %0 = load i16* %p, align 2
%conv = zext i16 %0 to i32
%and = and i32 %conv, 65503
%conv1 = trunc i32 %and to i16
- store i16 %conv1, i16* %p, align 2, !tbaa !2
+ store i16 %conv1, i16* %p, align 2
ret void
}
define void @memop_unsigned_short_setbit(i16* nocapture %p) nounwind {
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
- %0 = load i16* %p, align 2, !tbaa !2
+ %0 = load i16* %p, align 2
%conv = zext i16 %0 to i32
%or = or i32 %conv, 128
%conv1 = trunc i32 %or to i16
- store i16 %conv1, i16* %p, align 2, !tbaa !2
+ store i16 %conv1, i16* %p, align 2
ret void
}
@@ -558,11 +558,11 @@ define void @memop_unsigned_short_add5_index(i16* nocapture %p, i32 %i) nounwind
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i16* %p, i32 %i
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%conv = zext i16 %0 to i32
%add = add nsw i32 %conv, 5
%conv1 = trunc i32 %add to i16
- store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %conv1, i16* %add.ptr, align 2
ret void
}
@@ -571,11 +571,11 @@ entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
%conv = zext i16 %x to i32
%add.ptr = getelementptr inbounds i16* %p, i32 %i
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%conv1 = zext i16 %0 to i32
%add = add nsw i32 %conv1, %conv
%conv2 = trunc i32 %add to i16
- store i16 %conv2, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %conv2, i16* %add.ptr, align 2
ret void
}
@@ -584,11 +584,11 @@ entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
%conv = zext i16 %x to i32
%add.ptr = getelementptr inbounds i16* %p, i32 %i
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%conv1 = zext i16 %0 to i32
%sub = sub nsw i32 %conv1, %conv
%conv2 = trunc i32 %sub to i16
- store i16 %conv2, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %conv2, i16* %add.ptr, align 2
ret void
}
@@ -596,9 +596,9 @@ define void @memop_unsigned_short_or_index(i16* nocapture %p, i32 %i, i16 zeroex
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i16* %p, i32 %i
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%or3 = or i16 %0, %x
- store i16 %or3, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %or3, i16* %add.ptr, align 2
ret void
}
@@ -606,9 +606,9 @@ define void @memop_unsigned_short_and_index(i16* nocapture %p, i32 %i, i16 zeroe
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i16* %p, i32 %i
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%and3 = and i16 %0, %x
- store i16 %and3, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %and3, i16* %add.ptr, align 2
ret void
}
@@ -616,11 +616,11 @@ define void @memop_unsigned_short_clrbit_index(i16* nocapture %p, i32 %i) nounwi
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i16* %p, i32 %i
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%conv = zext i16 %0 to i32
%and = and i32 %conv, 65503
%conv1 = trunc i32 %and to i16
- store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %conv1, i16* %add.ptr, align 2
ret void
}
@@ -628,11 +628,11 @@ define void @memop_unsigned_short_setbit_index(i16* nocapture %p, i32 %i) nounwi
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i16* %p, i32 %i
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%conv = zext i16 %0 to i32
%or = or i32 %conv, 128
%conv1 = trunc i32 %or to i16
- store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %conv1, i16* %add.ptr, align 2
ret void
}
@@ -640,11 +640,11 @@ define void @memop_unsigned_short_add5_index5(i16* nocapture %p) nounwind {
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i16* %p, i32 5
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%conv = zext i16 %0 to i32
%add = add nsw i32 %conv, 5
%conv1 = trunc i32 %add to i16
- store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %conv1, i16* %add.ptr, align 2
ret void
}
@@ -653,11 +653,11 @@ entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}+={{ *}}r{{[0-9]+}}
%conv = zext i16 %x to i32
%add.ptr = getelementptr inbounds i16* %p, i32 5
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%conv1 = zext i16 %0 to i32
%add = add nsw i32 %conv1, %conv
%conv2 = trunc i32 %add to i16
- store i16 %conv2, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %conv2, i16* %add.ptr, align 2
ret void
}
@@ -666,11 +666,11 @@ entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}-={{ *}}r{{[0-9]+}}
%conv = zext i16 %x to i32
%add.ptr = getelementptr inbounds i16* %p, i32 5
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%conv1 = zext i16 %0 to i32
%sub = sub nsw i32 %conv1, %conv
%conv2 = trunc i32 %sub to i16
- store i16 %conv2, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %conv2, i16* %add.ptr, align 2
ret void
}
@@ -678,9 +678,9 @@ define void @memop_unsigned_short_or_index5(i16* nocapture %p, i16 zeroext %x) n
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i16* %p, i32 5
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%or3 = or i16 %0, %x
- store i16 %or3, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %or3, i16* %add.ptr, align 2
ret void
}
@@ -688,9 +688,9 @@ define void @memop_unsigned_short_and_index5(i16* nocapture %p, i16 zeroext %x)
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i16* %p, i32 5
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%and3 = and i16 %0, %x
- store i16 %and3, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %and3, i16* %add.ptr, align 2
ret void
}
@@ -698,11 +698,11 @@ define void @memop_unsigned_short_clrbit_index5(i16* nocapture %p) nounwind {
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i16* %p, i32 5
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%conv = zext i16 %0 to i32
%and = and i32 %conv, 65503
%conv1 = trunc i32 %and to i16
- store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %conv1, i16* %add.ptr, align 2
ret void
}
@@ -710,22 +710,22 @@ define void @memop_unsigned_short_setbit_index5(i16* nocapture %p) nounwind {
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i16* %p, i32 5
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%conv = zext i16 %0 to i32
%or = or i32 %conv, 128
%conv1 = trunc i32 %or to i16
- store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %conv1, i16* %add.ptr, align 2
ret void
}
define void @memop_signed_short_add5(i16* nocapture %p) nounwind {
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
- %0 = load i16* %p, align 2, !tbaa !2
+ %0 = load i16* %p, align 2
%conv2 = zext i16 %0 to i32
%add = add nsw i32 %conv2, 5
%conv1 = trunc i32 %add to i16
- store i16 %conv1, i16* %p, align 2, !tbaa !2
+ store i16 %conv1, i16* %p, align 2
ret void
}
@@ -733,11 +733,11 @@ define void @memop_signed_short_add(i16* nocapture %p, i16 signext %x) nounwind
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
%conv4 = zext i16 %x to i32
- %0 = load i16* %p, align 2, !tbaa !2
+ %0 = load i16* %p, align 2
%conv13 = zext i16 %0 to i32
%add = add nsw i32 %conv13, %conv4
%conv2 = trunc i32 %add to i16
- store i16 %conv2, i16* %p, align 2, !tbaa !2
+ store i16 %conv2, i16* %p, align 2
ret void
}
@@ -745,51 +745,51 @@ define void @memop_signed_short_sub(i16* nocapture %p, i16 signext %x) nounwind
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
%conv4 = zext i16 %x to i32
- %0 = load i16* %p, align 2, !tbaa !2
+ %0 = load i16* %p, align 2
%conv13 = zext i16 %0 to i32
%sub = sub nsw i32 %conv13, %conv4
%conv2 = trunc i32 %sub to i16
- store i16 %conv2, i16* %p, align 2, !tbaa !2
+ store i16 %conv2, i16* %p, align 2
ret void
}
define void @memop_signed_short_or(i16* nocapture %p, i16 signext %x) nounwind {
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
- %0 = load i16* %p, align 2, !tbaa !2
+ %0 = load i16* %p, align 2
%or3 = or i16 %0, %x
- store i16 %or3, i16* %p, align 2, !tbaa !2
+ store i16 %or3, i16* %p, align 2
ret void
}
define void @memop_signed_short_and(i16* nocapture %p, i16 signext %x) nounwind {
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
- %0 = load i16* %p, align 2, !tbaa !2
+ %0 = load i16* %p, align 2
%and3 = and i16 %0, %x
- store i16 %and3, i16* %p, align 2, !tbaa !2
+ store i16 %and3, i16* %p, align 2
ret void
}
define void @memop_signed_short_clrbit(i16* nocapture %p) nounwind {
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
- %0 = load i16* %p, align 2, !tbaa !2
+ %0 = load i16* %p, align 2
%conv2 = zext i16 %0 to i32
%and = and i32 %conv2, 65503
%conv1 = trunc i32 %and to i16
- store i16 %conv1, i16* %p, align 2, !tbaa !2
+ store i16 %conv1, i16* %p, align 2
ret void
}
define void @memop_signed_short_setbit(i16* nocapture %p) nounwind {
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
- %0 = load i16* %p, align 2, !tbaa !2
+ %0 = load i16* %p, align 2
%conv2 = zext i16 %0 to i32
%or = or i32 %conv2, 128
%conv1 = trunc i32 %or to i16
- store i16 %conv1, i16* %p, align 2, !tbaa !2
+ store i16 %conv1, i16* %p, align 2
ret void
}
@@ -797,11 +797,11 @@ define void @memop_signed_short_add5_index(i16* nocapture %p, i32 %i) nounwind {
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i16* %p, i32 %i
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%conv2 = zext i16 %0 to i32
%add = add nsw i32 %conv2, 5
%conv1 = trunc i32 %add to i16
- store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %conv1, i16* %add.ptr, align 2
ret void
}
@@ -810,11 +810,11 @@ entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
%conv4 = zext i16 %x to i32
%add.ptr = getelementptr inbounds i16* %p, i32 %i
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%conv13 = zext i16 %0 to i32
%add = add nsw i32 %conv13, %conv4
%conv2 = trunc i32 %add to i16
- store i16 %conv2, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %conv2, i16* %add.ptr, align 2
ret void
}
@@ -823,11 +823,11 @@ entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
%conv4 = zext i16 %x to i32
%add.ptr = getelementptr inbounds i16* %p, i32 %i
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%conv13 = zext i16 %0 to i32
%sub = sub nsw i32 %conv13, %conv4
%conv2 = trunc i32 %sub to i16
- store i16 %conv2, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %conv2, i16* %add.ptr, align 2
ret void
}
@@ -835,9 +835,9 @@ define void @memop_signed_short_or_index(i16* nocapture %p, i32 %i, i16 signext
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i16* %p, i32 %i
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%or3 = or i16 %0, %x
- store i16 %or3, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %or3, i16* %add.ptr, align 2
ret void
}
@@ -845,9 +845,9 @@ define void @memop_signed_short_and_index(i16* nocapture %p, i32 %i, i16 signext
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i16* %p, i32 %i
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%and3 = and i16 %0, %x
- store i16 %and3, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %and3, i16* %add.ptr, align 2
ret void
}
@@ -855,11 +855,11 @@ define void @memop_signed_short_clrbit_index(i16* nocapture %p, i32 %i) nounwind
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i16* %p, i32 %i
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%conv2 = zext i16 %0 to i32
%and = and i32 %conv2, 65503
%conv1 = trunc i32 %and to i16
- store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %conv1, i16* %add.ptr, align 2
ret void
}
@@ -867,11 +867,11 @@ define void @memop_signed_short_setbit_index(i16* nocapture %p, i32 %i) nounwind
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i16* %p, i32 %i
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%conv2 = zext i16 %0 to i32
%or = or i32 %conv2, 128
%conv1 = trunc i32 %or to i16
- store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %conv1, i16* %add.ptr, align 2
ret void
}
@@ -879,11 +879,11 @@ define void @memop_signed_short_add5_index5(i16* nocapture %p) nounwind {
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i16* %p, i32 5
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%conv2 = zext i16 %0 to i32
%add = add nsw i32 %conv2, 5
%conv1 = trunc i32 %add to i16
- store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %conv1, i16* %add.ptr, align 2
ret void
}
@@ -892,11 +892,11 @@ entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}+={{ *}}r{{[0-9]+}}
%conv4 = zext i16 %x to i32
%add.ptr = getelementptr inbounds i16* %p, i32 5
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%conv13 = zext i16 %0 to i32
%add = add nsw i32 %conv13, %conv4
%conv2 = trunc i32 %add to i16
- store i16 %conv2, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %conv2, i16* %add.ptr, align 2
ret void
}
@@ -905,11 +905,11 @@ entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}-={{ *}}r{{[0-9]+}}
%conv4 = zext i16 %x to i32
%add.ptr = getelementptr inbounds i16* %p, i32 5
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%conv13 = zext i16 %0 to i32
%sub = sub nsw i32 %conv13, %conv4
%conv2 = trunc i32 %sub to i16
- store i16 %conv2, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %conv2, i16* %add.ptr, align 2
ret void
}
@@ -917,9 +917,9 @@ define void @memop_signed_short_or_index5(i16* nocapture %p, i16 signext %x) nou
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i16* %p, i32 5
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%or3 = or i16 %0, %x
- store i16 %or3, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %or3, i16* %add.ptr, align 2
ret void
}
@@ -927,9 +927,9 @@ define void @memop_signed_short_and_index5(i16* nocapture %p, i16 signext %x) no
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i16* %p, i32 5
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%and3 = and i16 %0, %x
- store i16 %and3, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %and3, i16* %add.ptr, align 2
ret void
}
@@ -937,11 +937,11 @@ define void @memop_signed_short_clrbit_index5(i16* nocapture %p) nounwind {
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i16* %p, i32 5
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%conv2 = zext i16 %0 to i32
%and = and i32 %conv2, 65503
%conv1 = trunc i32 %and to i16
- store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %conv1, i16* %add.ptr, align 2
ret void
}
@@ -949,74 +949,74 @@ define void @memop_signed_short_setbit_index5(i16* nocapture %p) nounwind {
entry:
; CHECK: memh(r{{[0-9]+}}{{ *}}+{{ *}}#10){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i16* %p, i32 5
- %0 = load i16* %add.ptr, align 2, !tbaa !2
+ %0 = load i16* %add.ptr, align 2
%conv2 = zext i16 %0 to i32
%or = or i32 %conv2, 128
%conv1 = trunc i32 %or to i16
- store i16 %conv1, i16* %add.ptr, align 2, !tbaa !2
+ store i16 %conv1, i16* %add.ptr, align 2
ret void
}
define void @memop_signed_int_add5(i32* nocapture %p) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
- %0 = load i32* %p, align 4, !tbaa !3
+ %0 = load i32* %p, align 4
%add = add i32 %0, 5
- store i32 %add, i32* %p, align 4, !tbaa !3
+ store i32 %add, i32* %p, align 4
ret void
}
define void @memop_signed_int_add(i32* nocapture %p, i32 %x) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
- %0 = load i32* %p, align 4, !tbaa !3
+ %0 = load i32* %p, align 4
%add = add i32 %0, %x
- store i32 %add, i32* %p, align 4, !tbaa !3
+ store i32 %add, i32* %p, align 4
ret void
}
define void @memop_signed_int_sub(i32* nocapture %p, i32 %x) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
- %0 = load i32* %p, align 4, !tbaa !3
+ %0 = load i32* %p, align 4
%sub = sub i32 %0, %x
- store i32 %sub, i32* %p, align 4, !tbaa !3
+ store i32 %sub, i32* %p, align 4
ret void
}
define void @memop_signed_int_or(i32* nocapture %p, i32 %x) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
- %0 = load i32* %p, align 4, !tbaa !3
+ %0 = load i32* %p, align 4
%or = or i32 %0, %x
- store i32 %or, i32* %p, align 4, !tbaa !3
+ store i32 %or, i32* %p, align 4
ret void
}
define void @memop_signed_int_and(i32* nocapture %p, i32 %x) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
- %0 = load i32* %p, align 4, !tbaa !3
+ %0 = load i32* %p, align 4
%and = and i32 %0, %x
- store i32 %and, i32* %p, align 4, !tbaa !3
+ store i32 %and, i32* %p, align 4
ret void
}
define void @memop_signed_int_clrbit(i32* nocapture %p) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
- %0 = load i32* %p, align 4, !tbaa !3
+ %0 = load i32* %p, align 4
%and = and i32 %0, -33
- store i32 %and, i32* %p, align 4, !tbaa !3
+ store i32 %and, i32* %p, align 4
ret void
}
define void @memop_signed_int_setbit(i32* nocapture %p) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
- %0 = load i32* %p, align 4, !tbaa !3
+ %0 = load i32* %p, align 4
%or = or i32 %0, 128
- store i32 %or, i32* %p, align 4, !tbaa !3
+ store i32 %or, i32* %p, align 4
ret void
}
@@ -1024,9 +1024,9 @@ define void @memop_signed_int_add5_index(i32* nocapture %p, i32 %i) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i32* %p, i32 %i
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%add = add i32 %0, 5
- store i32 %add, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %add, i32* %add.ptr, align 4
ret void
}
@@ -1034,9 +1034,9 @@ define void @memop_signed_int_add_index(i32* nocapture %p, i32 %i, i32 %x) nounw
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32* %p, i32 %i
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%add = add i32 %0, %x
- store i32 %add, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %add, i32* %add.ptr, align 4
ret void
}
@@ -1044,9 +1044,9 @@ define void @memop_signed_int_sub_index(i32* nocapture %p, i32 %i, i32 %x) nounw
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32* %p, i32 %i
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%sub = sub i32 %0, %x
- store i32 %sub, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %sub, i32* %add.ptr, align 4
ret void
}
@@ -1054,9 +1054,9 @@ define void @memop_signed_int_or_index(i32* nocapture %p, i32 %i, i32 %x) nounwi
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32* %p, i32 %i
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%or = or i32 %0, %x
- store i32 %or, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %or, i32* %add.ptr, align 4
ret void
}
@@ -1064,9 +1064,9 @@ define void @memop_signed_int_and_index(i32* nocapture %p, i32 %i, i32 %x) nounw
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32* %p, i32 %i
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%and = and i32 %0, %x
- store i32 %and, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %and, i32* %add.ptr, align 4
ret void
}
@@ -1074,9 +1074,9 @@ define void @memop_signed_int_clrbit_index(i32* nocapture %p, i32 %i) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i32* %p, i32 %i
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%and = and i32 %0, -33
- store i32 %and, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %and, i32* %add.ptr, align 4
ret void
}
@@ -1084,9 +1084,9 @@ define void @memop_signed_int_setbit_index(i32* nocapture %p, i32 %i) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i32* %p, i32 %i
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%or = or i32 %0, 128
- store i32 %or, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %or, i32* %add.ptr, align 4
ret void
}
@@ -1094,9 +1094,9 @@ define void @memop_signed_int_add5_index5(i32* nocapture %p) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i32* %p, i32 5
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%add = add i32 %0, 5
- store i32 %add, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %add, i32* %add.ptr, align 4
ret void
}
@@ -1104,9 +1104,9 @@ define void @memop_signed_int_add_index5(i32* nocapture %p, i32 %x) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}+={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32* %p, i32 5
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%add = add i32 %0, %x
- store i32 %add, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %add, i32* %add.ptr, align 4
ret void
}
@@ -1114,9 +1114,9 @@ define void @memop_signed_int_sub_index5(i32* nocapture %p, i32 %x) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}-={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32* %p, i32 5
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%sub = sub i32 %0, %x
- store i32 %sub, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %sub, i32* %add.ptr, align 4
ret void
}
@@ -1124,9 +1124,9 @@ define void @memop_signed_int_or_index5(i32* nocapture %p, i32 %x) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32* %p, i32 5
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%or = or i32 %0, %x
- store i32 %or, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %or, i32* %add.ptr, align 4
ret void
}
@@ -1134,9 +1134,9 @@ define void @memop_signed_int_and_index5(i32* nocapture %p, i32 %x) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32* %p, i32 5
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%and = and i32 %0, %x
- store i32 %and, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %and, i32* %add.ptr, align 4
ret void
}
@@ -1144,9 +1144,9 @@ define void @memop_signed_int_clrbit_index5(i32* nocapture %p) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i32* %p, i32 5
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%and = and i32 %0, -33
- store i32 %and, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %and, i32* %add.ptr, align 4
ret void
}
@@ -1154,72 +1154,72 @@ define void @memop_signed_int_setbit_index5(i32* nocapture %p) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i32* %p, i32 5
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%or = or i32 %0, 128
- store i32 %or, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %or, i32* %add.ptr, align 4
ret void
}
define void @memop_unsigned_int_add5(i32* nocapture %p) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
- %0 = load i32* %p, align 4, !tbaa !3
+ %0 = load i32* %p, align 4
%add = add nsw i32 %0, 5
- store i32 %add, i32* %p, align 4, !tbaa !3
+ store i32 %add, i32* %p, align 4
ret void
}
define void @memop_unsigned_int_add(i32* nocapture %p, i32 %x) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
- %0 = load i32* %p, align 4, !tbaa !3
+ %0 = load i32* %p, align 4
%add = add nsw i32 %0, %x
- store i32 %add, i32* %p, align 4, !tbaa !3
+ store i32 %add, i32* %p, align 4
ret void
}
define void @memop_unsigned_int_sub(i32* nocapture %p, i32 %x) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
- %0 = load i32* %p, align 4, !tbaa !3
+ %0 = load i32* %p, align 4
%sub = sub nsw i32 %0, %x
- store i32 %sub, i32* %p, align 4, !tbaa !3
+ store i32 %sub, i32* %p, align 4
ret void
}
define void @memop_unsigned_int_or(i32* nocapture %p, i32 %x) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
- %0 = load i32* %p, align 4, !tbaa !3
+ %0 = load i32* %p, align 4
%or = or i32 %0, %x
- store i32 %or, i32* %p, align 4, !tbaa !3
+ store i32 %or, i32* %p, align 4
ret void
}
define void @memop_unsigned_int_and(i32* nocapture %p, i32 %x) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
- %0 = load i32* %p, align 4, !tbaa !3
+ %0 = load i32* %p, align 4
%and = and i32 %0, %x
- store i32 %and, i32* %p, align 4, !tbaa !3
+ store i32 %and, i32* %p, align 4
ret void
}
define void @memop_unsigned_int_clrbit(i32* nocapture %p) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
- %0 = load i32* %p, align 4, !tbaa !3
+ %0 = load i32* %p, align 4
%and = and i32 %0, -33
- store i32 %and, i32* %p, align 4, !tbaa !3
+ store i32 %and, i32* %p, align 4
ret void
}
define void @memop_unsigned_int_setbit(i32* nocapture %p) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
- %0 = load i32* %p, align 4, !tbaa !3
+ %0 = load i32* %p, align 4
%or = or i32 %0, 128
- store i32 %or, i32* %p, align 4, !tbaa !3
+ store i32 %or, i32* %p, align 4
ret void
}
@@ -1227,9 +1227,9 @@ define void @memop_unsigned_int_add5_index(i32* nocapture %p, i32 %i) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i32* %p, i32 %i
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%add = add nsw i32 %0, 5
- store i32 %add, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %add, i32* %add.ptr, align 4
ret void
}
@@ -1237,9 +1237,9 @@ define void @memop_unsigned_int_add_index(i32* nocapture %p, i32 %i, i32 %x) nou
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}+={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32* %p, i32 %i
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%add = add nsw i32 %0, %x
- store i32 %add, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %add, i32* %add.ptr, align 4
ret void
}
@@ -1247,9 +1247,9 @@ define void @memop_unsigned_int_sub_index(i32* nocapture %p, i32 %i, i32 %x) nou
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}-={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32* %p, i32 %i
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%sub = sub nsw i32 %0, %x
- store i32 %sub, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %sub, i32* %add.ptr, align 4
ret void
}
@@ -1257,9 +1257,9 @@ define void @memop_unsigned_int_or_index(i32* nocapture %p, i32 %i, i32 %x) noun
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32* %p, i32 %i
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%or = or i32 %0, %x
- store i32 %or, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %or, i32* %add.ptr, align 4
ret void
}
@@ -1267,9 +1267,9 @@ define void @memop_unsigned_int_and_index(i32* nocapture %p, i32 %i, i32 %x) nou
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32* %p, i32 %i
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%and = and i32 %0, %x
- store i32 %and, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %and, i32* %add.ptr, align 4
ret void
}
@@ -1277,9 +1277,9 @@ define void @memop_unsigned_int_clrbit_index(i32* nocapture %p, i32 %i) nounwind
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i32* %p, i32 %i
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%and = and i32 %0, -33
- store i32 %and, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %and, i32* %add.ptr, align 4
ret void
}
@@ -1287,9 +1287,9 @@ define void @memop_unsigned_int_setbit_index(i32* nocapture %p, i32 %i) nounwind
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#0){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i32* %p, i32 %i
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%or = or i32 %0, 128
- store i32 %or, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %or, i32* %add.ptr, align 4
ret void
}
@@ -1297,9 +1297,9 @@ define void @memop_unsigned_int_add5_index5(i32* nocapture %p) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}+={{ *}}#5
%add.ptr = getelementptr inbounds i32* %p, i32 5
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%add = add nsw i32 %0, 5
- store i32 %add, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %add, i32* %add.ptr, align 4
ret void
}
@@ -1307,9 +1307,9 @@ define void @memop_unsigned_int_add_index5(i32* nocapture %p, i32 %x) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}+={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32* %p, i32 5
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%add = add nsw i32 %0, %x
- store i32 %add, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %add, i32* %add.ptr, align 4
ret void
}
@@ -1317,9 +1317,9 @@ define void @memop_unsigned_int_sub_index5(i32* nocapture %p, i32 %x) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}-={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32* %p, i32 5
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%sub = sub nsw i32 %0, %x
- store i32 %sub, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %sub, i32* %add.ptr, align 4
ret void
}
@@ -1327,9 +1327,9 @@ define void @memop_unsigned_int_or_index5(i32* nocapture %p, i32 %x) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}|={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32* %p, i32 5
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%or = or i32 %0, %x
- store i32 %or, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %or, i32* %add.ptr, align 4
ret void
}
@@ -1337,9 +1337,9 @@ define void @memop_unsigned_int_and_index5(i32* nocapture %p, i32 %x) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}&={{ *}}r{{[0-9]+}}
%add.ptr = getelementptr inbounds i32* %p, i32 5
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%and = and i32 %0, %x
- store i32 %and, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %and, i32* %add.ptr, align 4
ret void
}
@@ -1347,9 +1347,9 @@ define void @memop_unsigned_int_clrbit_index5(i32* nocapture %p) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}={{ *}}clrbit({{ *}}#5{{ *}})
%add.ptr = getelementptr inbounds i32* %p, i32 5
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%and = and i32 %0, -33
- store i32 %and, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %and, i32* %add.ptr, align 4
ret void
}
@@ -1357,13 +1357,8 @@ define void @memop_unsigned_int_setbit_index5(i32* nocapture %p) nounwind {
entry:
; CHECK: memw(r{{[0-9]+}}{{ *}}+{{ *}}#20){{ *}}={{ *}}setbit({{ *}}#7{{ *}})
%add.ptr = getelementptr inbounds i32* %p, i32 5
- %0 = load i32* %add.ptr, align 4, !tbaa !3
+ %0 = load i32* %add.ptr, align 4
%or = or i32 %0, 128
- store i32 %or, i32* %add.ptr, align 4, !tbaa !3
+ store i32 %or, i32* %add.ptr, align 4
ret void
}
-
-!0 = metadata !{metadata !"omnipotent char", metadata !1}
-!1 = metadata !{metadata !"Simple C/C++ TBAA"}
-!2 = metadata !{metadata !"short", metadata !0}
-!3 = metadata !{metadata !"int", metadata !0}
diff --git a/test/CodeGen/Hexagon/union-1.ll b/test/CodeGen/Hexagon/union-1.ll
index 7c6da74..fe79f95 100644
--- a/test/CodeGen/Hexagon/union-1.ll
+++ b/test/CodeGen/Hexagon/union-1.ll
@@ -5,10 +5,10 @@
define void @word(i32* nocapture %a) nounwind {
entry:
- %0 = load i32* %a, align 4, !tbaa !0
+ %0 = load i32* %a, align 4
%1 = zext i32 %0 to i64
%add.ptr = getelementptr inbounds i32* %a, i32 1
- %2 = load i32* %add.ptr, align 4, !tbaa !0
+ %2 = load i32* %add.ptr, align 4
%3 = zext i32 %2 to i64
%4 = shl nuw i64 %3, 32
%ins = or i64 %4, %1
@@ -17,7 +17,3 @@ entry:
}
declare void @bar(i64)
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/Inputs/DbgValueOtherTargets.ll b/test/CodeGen/Inputs/DbgValueOtherTargets.ll
index f35a5d1..953e576 100644
--- a/test/CodeGen/Inputs/DbgValueOtherTargets.ll
+++ b/test/CodeGen/Inputs/DbgValueOtherTargets.ll
@@ -12,11 +12,12 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!13}
!0 = metadata !{i32 786478, metadata !12, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!1 = metadata !{i32 786473, metadata !12} ; [ DW_TAG_file_type ]
!2 = metadata !{i32 786449, metadata !12, i32 12, metadata !"clang version 2.9 (trunk 120996)", i1 false, metadata !"", i32 0, metadata !6, metadata !6, metadata !11, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !12, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !12, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!4 = metadata !{metadata !5}
!5 = metadata !{i32 786468, metadata !12, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
!6 = metadata !{i32 0}
@@ -26,3 +27,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!10 = metadata !{i32 4, i32 2, metadata !8, null}
!11 = metadata !{metadata !0}
!12 = metadata !{metadata !"/tmp/x.c", metadata !"/Users/manav"}
+!13 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/MSP430/cc_args.ll b/test/CodeGen/MSP430/cc_args.ll
new file mode 100644
index 0000000..39e99e2
--- /dev/null
+++ b/test/CodeGen/MSP430/cc_args.ll
@@ -0,0 +1,118 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16"
+target triple = "msp430---elf"
+
+define void @test() #0 {
+entry:
+; CHECK: test:
+
+; CHECK: mov.w #1, r15
+; CHECK: call #f_i16
+ call void @f_i16(i16 1)
+
+; CHECK: mov.w #772, r14
+; CHECK: mov.w #258, r15
+; CHECK: call #f_i32
+ call void @f_i32(i32 16909060)
+
+; CHECK: mov.w #1800, r12
+; CHECK: mov.w #1286, r13
+; CHECK: mov.w #772, r14
+; CHECK: mov.w #258, r15
+; CHECK: call #f_i64
+ call void @f_i64(i64 72623859790382856)
+
+; CHECK: mov.w #772, r14
+; CHECK: mov.w #258, r15
+; CHECK: mov.w #1800, r12
+; CHECK: mov.w #1286, r13
+; CHECK: call #f_i32_i32
+ call void @f_i32_i32(i32 16909060, i32 84281096)
+
+; CHECK: mov.w #1, r15
+; CHECK: mov.w #772, r13
+; CHECK: mov.w #258, r14
+; CHECK: mov.w #2, r12
+; CHECK: call #f_i16_i32_i16
+ call void @f_i16_i32_i16(i16 1, i32 16909060, i16 2)
+
+; CHECK: mov.w #2, 8(r1)
+; CHECK: mov.w #258, 6(r1)
+; CHECK: mov.w #772, 4(r1)
+; CHECK: mov.w #1286, 2(r1)
+; CHECK: mov.w #1800, 0(r1)
+; CHECK: mov.w #1, r15
+; CHECK: call #f_i16_i64_i16
+ call void @f_i16_i64_i16(i16 1, i64 72623859790382856, i16 2)
+
+ ret void
+}
+
+@g_i16 = common global i16 0, align 2
+@g_i32 = common global i32 0, align 2
+@g_i64 = common global i64 0, align 2
+
+define void @f_i16(i16 %a) #0 {
+; CHECK: f_i16:
+; CHECK: mov.w r15, &g_i16
+ store volatile i16 %a, i16* @g_i16, align 2
+ ret void
+}
+
+define void @f_i32(i32 %a) #0 {
+; CHECK: f_i32:
+; CHECK: mov.w r15, &g_i32+2
+; CHECK: mov.w r14, &g_i32
+ store volatile i32 %a, i32* @g_i32, align 2
+ ret void
+}
+
+define void @f_i64(i64 %a) #0 {
+; CHECK: f_i64:
+; CHECK: mov.w r15, &g_i64+6
+; CHECK: mov.w r14, &g_i64+4
+; CHECK: mov.w r13, &g_i64+2
+; CHECK: mov.w r12, &g_i64
+ store volatile i64 %a, i64* @g_i64, align 2
+ ret void
+}
+
+define void @f_i32_i32(i32 %a, i32 %b) #0 {
+; CHECK: f_i32_i32:
+; CHECK: mov.w r15, &g_i32+2
+; CHECK: mov.w r14, &g_i32
+ store volatile i32 %a, i32* @g_i32, align 2
+; CHECK: mov.w r13, &g_i32+2
+; CHECK: mov.w r12, &g_i32
+ store volatile i32 %b, i32* @g_i32, align 2
+ ret void
+}
+
+define void @f_i16_i32_i16(i16 %a, i32 %b, i16 %c) #0 {
+; CHECK: f_i16_i32_i16:
+; CHECK: mov.w r15, &g_i16
+ store volatile i16 %a, i16* @g_i16, align 2
+; CHECK: mov.w r14, &g_i32+2
+; CHECK: mov.w r13, &g_i32
+ store volatile i32 %b, i32* @g_i32, align 2
+; CHECK: mov.w r12, &g_i16
+ store volatile i16 %c, i16* @g_i16, align 2
+ ret void
+}
+
+define void @f_i16_i64_i16(i16 %a, i64 %b, i16 %c) #0 {
+; CHECK: f_i16_i64_i16:
+; CHECK: mov.w r15, &g_i16
+ store volatile i16 %a, i16* @g_i16, align 2
+; CHECK: mov.w 10(r4), &g_i64+6
+; CHECK: mov.w 8(r4), &g_i64+4
+; CHECK: mov.w 6(r4), &g_i64+2
+; CHECK: mov.w 4(r4), &g_i64
+ store volatile i64 %b, i64* @g_i64, align 2
+; CHECK: mov.w 12(r4), &g_i16
+ store volatile i16 %c, i16* @g_i16, align 2
+ ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/MSP430/cc_ret.ll b/test/CodeGen/MSP430/cc_ret.ll
new file mode 100644
index 0000000..c2a9ae6
--- /dev/null
+++ b/test/CodeGen/MSP430/cc_ret.ll
@@ -0,0 +1,61 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16"
+target triple = "msp430---elf"
+
+define void @test() #0 {
+entry:
+; CHECK: test:
+
+; CHECK: call #f_i16
+; CHECK: mov.w r15, &g_i16
+ %0 = call i16 @f_i16()
+ store volatile i16 %0, i16* @g_i16
+
+; CHECK: call #f_i32
+; CHECK: mov.w r15, &g_i32+2
+; CHECK: mov.w r14, &g_i32
+ %1 = call i32 @f_i32()
+ store volatile i32 %1, i32* @g_i32
+
+; CHECK: call #f_i64
+; CHECK: mov.w r15, &g_i64+6
+; CHECK: mov.w r14, &g_i64+4
+; CHECK: mov.w r13, &g_i64+2
+; CHECK: mov.w r12, &g_i64
+ %2 = call i64 @f_i64()
+ store volatile i64 %2, i64* @g_i64
+
+ ret void
+}
+
+@g_i16 = common global i16 0, align 2
+@g_i32 = common global i32 0, align 2
+@g_i64 = common global i64 0, align 2
+
+define i16 @f_i16() #0 {
+; CHECK: f_i16:
+; CHECK: mov.w #1, r15
+; CHECK: ret
+ ret i16 1
+}
+
+define i32 @f_i32() #0 {
+; CHECK: f_i32:
+; CHECK: mov.w #772, r14
+; CHECK: mov.w #258, r15
+; CHECK: ret
+ ret i32 16909060
+}
+
+define i64 @f_i64() #0 {
+; CHECK: f_i64:
+; CHECK: mov.w #1800, r12
+; CHECK: mov.w #1286, r13
+; CHECK: mov.w #772, r14
+; CHECK: mov.w #258, r15
+; CHECK: ret
+ ret i64 72623859790382856
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/MSP430/lit.local.cfg b/test/CodeGen/MSP430/lit.local.cfg
index 0ca9fc9..a18fe6f 100644
--- a/test/CodeGen/MSP430/lit.local.cfg
+++ b/test/CodeGen/MSP430/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.test']
-
targets = set(config.root.targets_to_build.split())
if not 'MSP430' in targets:
config.unsupported = True
diff --git a/test/CodeGen/MSP430/transient-stack-alignment.ll b/test/CodeGen/MSP430/transient-stack-alignment.ll
new file mode 100644
index 0000000..cca8350
--- /dev/null
+++ b/test/CodeGen/MSP430/transient-stack-alignment.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16"
+target triple = "msp430---elf"
+
+define void @test() #0 {
+; CHECK-LABEL: test:
+; CHECK: sub.w #2, r1
+ %1 = alloca i8, align 1
+; CHECK-NEXT: mov.b #0, 1(r1)
+ store i8 0, i8* %1, align 1
+; CHECK-NEXT: add.w #2, r1
+; CHECK-NEXT: ret
+ ret void
+}
+
+attributes #0 = { nounwind "no-frame-pointer-elim"="false" }
diff --git a/test/CodeGen/Mips/2008-07-16-SignExtInReg.ll b/test/CodeGen/Mips/2008-07-16-SignExtInReg.ll
index 8479ad2..3381143 100644
--- a/test/CodeGen/Mips/2008-07-16-SignExtInReg.ll
+++ b/test/CodeGen/Mips/2008-07-16-SignExtInReg.ll
@@ -1,5 +1,6 @@
; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s
; RUN: llc < %s -march=mips64 -mcpu=mips64r2 | FileCheck %s
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32r2 -mattr=+mips16 -soft-float -mips16-hard-float < %s | FileCheck %s
define signext i8 @A(i8 %e.0, i8 signext %sum) nounwind {
entry:
diff --git a/test/CodeGen/Mips/2008-08-01-AsmInline.ll b/test/CodeGen/Mips/2008-08-01-AsmInline.ll
index dbde742..e274bc0 100644
--- a/test/CodeGen/Mips/2008-08-01-AsmInline.ll
+++ b/test/CodeGen/Mips/2008-08-01-AsmInline.ll
@@ -51,3 +51,21 @@ entry:
ret void
}
+; Check that RA doesn't allocate registers in the clobber list.
+; CHECK-LABEL: foo4:
+; CHECK: #APP
+; CHECK-NOT: ulh $2
+; CHECK: #NO_APP
+; CHECK: #APP
+; CHECK-NOT: $f0
+; CHECK: #NO_APP
+
+define void @foo4() {
+entry:
+ %0 = tail call i32 asm sideeffect "ulh $0,16($$sp)\0A\09", "=r,~{$2}"()
+ store i32 %0, i32* @gi2, align 4
+ %1 = load float* @gf0, align 4
+ %2 = tail call double asm sideeffect "cvt.d.s $0, $1\0A\09", "=f,f,~{$f0}"(float %1)
+ store double %2, double* @gd0, align 8
+ ret void
+}
diff --git a/test/CodeGen/Mips/2013-11-18-fp64-const0.ll b/test/CodeGen/Mips/2013-11-18-fp64-const0.ll
new file mode 100644
index 0000000..f8390d9
--- /dev/null
+++ b/test/CodeGen/Mips/2013-11-18-fp64-const0.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=mips -mattr=-fp64 < %s | FileCheck -check-prefix=CHECK-FP32 %s
+; RUN: llc -march=mips -mattr=+fp64 < %s | FileCheck -check-prefix=CHECK-FP64 %s
+
+; This test case is a simplified version of an llvm-stress generated test with
+; seed=3718491962.
+; It originally failed on MIPS32 with FP64 with the following error:
+; LLVM ERROR: ran out of registers during register allocation
+; This was caused by impossible register class restrictions arising from the use
+; of BuildPairF64 instead of BuildPairF64_64.
+
+define void @autogen_SD3718491962() {
+BB:
+ ; CHECK-FP32: mtc1 $zero, $f{{[0-3]*[02468]}}
+ ; CHECK-FP32: mtc1 $zero, $f{{[0-3]*[13579]}}
+
+ ; CHECK-FP64: mtc1 $zero, $f{{[0-9]+}}
+ ; CHECK-FP64-NOT: mtc1 $zero,
+ ; FIXME: A redundant mthc1 is currently emitted. Add a -NOT when it is
+ ; eliminated
+
+ %Cmp = fcmp ule double 0.000000e+00, undef
+ %Cmp11 = fcmp ueq double 0xFDBD965CF1BB7FDA, undef
+ br label %CF88
+
+CF88: ; preds = %CF88, %BB
+ %Sl18 = select i1 %Cmp, i1 %Cmp11, i1 %Cmp
+ br i1 %Sl18, label %CF88, label %CF85
+
+CF85: ; preds = %CF88
+ ret void
+}
diff --git a/test/CodeGen/Mips/beqzc.ll b/test/CodeGen/Mips/beqzc.ll
new file mode 100644
index 0000000..4a294c2
--- /dev/null
+++ b/test/CodeGen/Mips/beqzc.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands < %s | FileCheck %s -check-prefix=cond-b-short
+
+@i = global i32 0, align 4
+@j = common global i32 0, align 4
+
+; Function Attrs: nounwind optsize
+define i32 @main() #0 {
+entry:
+ %0 = load i32* @i, align 4
+ %cmp = icmp eq i32 %0, 0
+ %. = select i1 %cmp, i32 10, i32 55
+ store i32 %., i32* @j, align 4
+; cond-b-short: beqz ${{[0-9]+}}, $BB{{[0-9]+}}_{{[0-9]+}} # 16 bit inst
+ ret i32 0
+}
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+
+
+
diff --git a/test/CodeGen/Mips/beqzc1.ll b/test/CodeGen/Mips/beqzc1.ll
new file mode 100644
index 0000000..8f929a8
--- /dev/null
+++ b/test/CodeGen/Mips/beqzc1.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands < %s | FileCheck %s -check-prefix=cond-b-short
+
+@i = global i32 0, align 4
+@j = common global i32 0, align 4
+
+; Function Attrs: nounwind optsize
+define i32 @main() #0 {
+entry:
+ %0 = load i32* @i, align 4
+ %cmp = icmp eq i32 %0, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+; cond-b-short: bnez ${{[0-9]+}}, $BB{{[0-9]+}}_{{[0-9]+}} # 16 bit inst
+if.then: ; preds = %entry
+ store i32 10, i32* @j, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret i32 0
+}
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+
+
diff --git a/test/CodeGen/Mips/blockaddr.ll b/test/CodeGen/Mips/blockaddr.ll
index 7de7fa6..beab65f 100644
--- a/test/CodeGen/Mips/blockaddr.ll
+++ b/test/CodeGen/Mips/blockaddr.ll
@@ -4,6 +4,8 @@
; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n32 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N32
; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC-N64
; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-N64
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32 -mattr=+mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-MIPS16-1
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32 -mattr=+mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=STATIC-MIPS16-2
@reg = common global i8* null, align 4
@@ -36,6 +38,14 @@ entry:
; STATIC-N64: daddiu ${{[0-9]+}}, $[[R2]], %got_ofst($tmp[[T2]])
; STATIC-N64: ld $[[R3:[0-9]+]], %got_page($tmp[[T3:[0-9]+]])
; STATIC-N64: daddiu ${{[0-9]+}}, $[[R3]], %got_ofst($tmp[[T3]])
+; STATIC-MIPS16-1: .ent f
+; STATIC-MIPS16-2: .ent f
+; STATIC-MIPS16-1: li $[[R1_16:[0-9]+]], %hi($tmp[[TI_16:[0-9]+]])
+; STATIC-MIPS16-1: sll ${{[0-9]+}}, $[[R1_16]], 16
+; STATIC-MIPS16-2: li ${{[0-9]+}}, %lo($tmp{{[0-9]+}})
+; STATIC-MIPS16-1: jal dummy
+; STATIC-MIPS16-2: jal dummy
+
define void @f() nounwind {
entry:
%call = tail call i8* @dummy(i8* blockaddress(@f, %baz))
diff --git a/test/CodeGen/Mips/brdelayslot.ll b/test/CodeGen/Mips/brdelayslot.ll
index 869ecd9..68341c1 100644
--- a/test/CodeGen/Mips/brdelayslot.ll
+++ b/test/CodeGen/Mips/brdelayslot.ll
@@ -160,7 +160,14 @@ for.end: ; preds = %for.body, %entry
;
; SUCCBB-LABEL: succbbs_br1:
; SUCCBB: beqz ${{[0-9]+}}, $BB
-; SUCCBB-NEXT: lw $25, %call16(foo100)
+; SUCCBB-NEXT: lw ${{[0-9]+}}, %got(foo101)(${{[0-9]+}})
+
+define internal fastcc void @foo101() {
+entry:
+ tail call void @foo100()
+ tail call void @foo100()
+ ret void
+}
define void @succbbs_br1(i32 %a) {
entry:
@@ -168,7 +175,7 @@ entry:
br i1 %tobool, label %if.end, label %if.then
if.then: ; preds = %entry
- tail call void @foo100() #1
+ tail call fastcc void @foo101()
br label %if.end
if.end: ; preds = %entry, %if.then
diff --git a/test/CodeGen/Mips/brsize3.ll b/test/CodeGen/Mips/brsize3.ll
new file mode 100644
index 0000000..7b1f440
--- /dev/null
+++ b/test/CodeGen/Mips/brsize3.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands < %s | FileCheck %s -check-prefix=b-no-short
+
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands < %s | FileCheck %s -check-prefix=b-long
+
+; ModuleID = 'brsize3.c'
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-n32-S64"
+target triple = "mips--linux-gnu"
+
+; Function Attrs: noreturn nounwind optsize
+define void @foo() #0 {
+entry:
+ br label %x
+
+x: ; preds = %x, %entry
+ tail call void asm sideeffect ".space 60000", ""() #1, !srcloc !1
+ br label %x
+; b-long: $BB0_1:
+; b-long: #APP
+; b-long: .space 60000
+; b-long: #NO_APP
+; b-long: b $BB0_1
+; b-no-short: $BB0_1:
+; b-no-short: #APP
+; b-no-short: .space 60000
+; b-no-short: #NO_APP
+; b-no-short-NOT: b $BB0_1 # 16 bit inst
+
+}
+
+attributes #0 = { noreturn nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { nounwind }
+
+!1 = metadata !{i32 45}
diff --git a/test/CodeGen/Mips/brsize3a.ll b/test/CodeGen/Mips/brsize3a.ll
new file mode 100644
index 0000000..6382fa2
--- /dev/null
+++ b/test/CodeGen/Mips/brsize3a.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands < %s | FileCheck %s -check-prefix=b-short
+
+; ModuleID = 'brsize3.c'
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-n32-S64"
+target triple = "mips--linux-gnu"
+
+; Function Attrs: noreturn nounwind optsize
+define void @foo() #0 {
+entry:
+ br label %x
+
+x: ; preds = %x, %entry
+ tail call void asm sideeffect ".space 200", ""() #1, !srcloc !1
+ br label %x
+; b-short: $BB0_1:
+; b-short: #APP
+; b-short: .space 200
+; b-short: #NO_APP
+; b-short: b $BB0_1 # 16 bit inst
+
+}
+
+attributes #0 = { noreturn nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { nounwind }
+
+!1 = metadata !{i32 45}
diff --git a/test/CodeGen/Mips/bswap.ll b/test/CodeGen/Mips/bswap.ll
index 0da2d2b..f17b91a 100644
--- a/test/CodeGen/Mips/bswap.ll
+++ b/test/CodeGen/Mips/bswap.ll
@@ -1,11 +1,13 @@
; RUN: llc < %s -march=mipsel -mcpu=mips32r2 | FileCheck %s -check-prefix=MIPS32
; RUN: llc < %s -march=mips64el -mcpu=mips64r2 | FileCheck %s -check-prefix=MIPS64
+; RUN: llc < %s -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32r2 -mattr=+mips16 -soft-float -mips16-hard-float | FileCheck %s -check-prefix=mips16
define i32 @bswap32(i32 %x) nounwind readnone {
entry:
; MIPS32-LABEL: bswap32:
; MIPS32: wsbh $[[R0:[0-9]+]]
; MIPS32: rotr ${{[0-9]+}}, $[[R0]], 16
+; mips16: .ent bswap32
%or.3 = call i32 @llvm.bswap.i32(i32 %x)
ret i32 %or.3
}
@@ -15,6 +17,7 @@ entry:
; MIPS64-LABEL: bswap64:
; MIPS64: dsbh $[[R0:[0-9]+]]
; MIPS64: dshd ${{[0-9]+}}, $[[R0]]
+; mips16: .ent bswap64
%or.7 = call i64 @llvm.bswap.i64(i64 %x)
ret i64 %or.7
}
diff --git a/test/CodeGen/Mips/buildpairextractelementf64.ll b/test/CodeGen/Mips/buildpairextractelementf64.ll
index 585bc25..490d427 100644
--- a/test/CodeGen/Mips/buildpairextractelementf64.ll
+++ b/test/CodeGen/Mips/buildpairextractelementf64.ll
@@ -1,20 +1,31 @@
-; RUN: llc < %s -march=mipsel | FileCheck %s
-; RUN: llc < %s -march=mips | FileCheck %s
+; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=FP32
+; RUN: llc -march=mips < %s | FileCheck %s -check-prefix=FP32
+; RUN: llc -march=mipsel -mattr=+fp64 < %s | FileCheck %s -check-prefix=FP64
+; RUN: llc -march=mips -mattr=+fp64 < %s | FileCheck %s -check-prefix=FP64
+
@a = external global i32
+; CHECK-LABEL: f:
+; FP32: mtc1
+; FP32: mtc1
+; FP64-DAG: mtc1
+; FP64-DAG: mthc1
+
define double @f(i32 %a1, double %d) nounwind {
entry:
-; CHECK: mtc1
-; CHECK: mtc1
store i32 %a1, i32* @a, align 4
%add = fadd double %d, 2.000000e+00
ret double %add
}
+; CHECK-LABEL: f3:
+; FP32: mfc1
+; FP32: mfc1
+; FP64-DAG: mfc1
+; FP64-DAG: mfhc1
+
define void @f3(double %d, i32 %a1) nounwind {
entry:
-; CHECK: mfc1
-; CHECK: mfc1
tail call void @f2(i32 %a1, double %d) nounwind
ret void
}
diff --git a/test/CodeGen/Mips/cmplarge.ll b/test/CodeGen/Mips/cmplarge.ll
index b082fa3..2a3d30a 100644
--- a/test/CodeGen/Mips/cmplarge.ll
+++ b/test/CodeGen/Mips/cmplarge.ll
@@ -10,7 +10,7 @@ target triple = "mipsel--linux-gnu"
define void @getSubImagesLuma(%struct.StorablePicture* nocapture %s) #0 {
entry:
%size_y = getelementptr inbounds %struct.StorablePicture* %s, i32 0, i32 1
- %0 = load i32* %size_y, align 4, !tbaa !0
+ %0 = load i32* %size_y, align 4
%sub = add nsw i32 %0, -1
%add5 = add nsw i32 %0, 20
%cmp6 = icmp sgt i32 %add5, -20
@@ -20,7 +20,7 @@ for.body: ; preds = %entry, %for.body
%j.07 = phi i32 [ %inc, %for.body ], [ -20, %entry ]
%call = tail call i32 bitcast (i32 (...)* @iClip3 to i32 (i32, i32, i32)*)(i32 0, i32 %sub, i32 %j.07) #2
%inc = add nsw i32 %j.07, 1
- %1 = load i32* %size_y, align 4, !tbaa !0
+ %1 = load i32* %size_y, align 4
%add = add nsw i32 %1, 20
%cmp = icmp slt i32 %inc, %add
br i1 %cmp, label %for.body, label %for.end
@@ -33,10 +33,6 @@ for.end: ; preds = %for.body, %entry
; cmp16: .end getSubImagesLuma
declare i32 @iClip3(...) #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind }
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/Mips/const1.ll b/test/CodeGen/Mips/const1.ll
new file mode 100644
index 0000000..cb2baca
--- /dev/null
+++ b/test/CodeGen/Mips/const1.ll
@@ -0,0 +1,35 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static -mips16-constant-islands < %s | FileCheck %s
+
+; ModuleID = 'const1.c'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-n32-S64"
+target triple = "mipsel-unknown-linux"
+
+@i = common global i32 0, align 4
+@j = common global i32 0, align 4
+@k = common global i32 0, align 4
+@l = common global i32 0, align 4
+
+; Function Attrs: nounwind
+define void @t() #0 {
+entry:
+ store i32 -559023410, i32* @i, align 4
+ store i32 -559023410, i32* @j, align 4
+ store i32 -87105875, i32* @k, align 4
+ store i32 262991277, i32* @l, align 4
+ ret void
+; CHECK: lw ${{[0-9]+}}, $CPI0_0
+; CHECK: lw ${{[0-9]+}}, $CPI0_1
+; CHECK: lw ${{[0-9]+}}, $CPI0_2
+; CHECK: $CPI0_0:
+; CHECK: .4byte 3735943886
+; CHECK: $CPI0_1:
+; CHECK: .4byte 4207861421
+; CHECK: $CPI0_2:
+; CHECK: .4byte 262991277
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.4 (gitosis@dmz-portal.mips.com:clang.git b754974ec32ab712ea7d8b52cd8037b24e7d6ed3) (gitosis@dmz-portal.mips.com:llvm.git 8e211187b501bc73edb938fde0019c9a20bcffd5)"}
diff --git a/test/CodeGen/Mips/const4a.ll b/test/CodeGen/Mips/const4a.ll
new file mode 100644
index 0000000..0332327
--- /dev/null
+++ b/test/CodeGen/Mips/const4a.ll
@@ -0,0 +1,180 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands -mips-constant-islands-no-load-relaxation < %s | FileCheck %s -check-prefix=no-load-relax
+
+; ModuleID = 'const4.c'
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-n32-S64"
+target triple = "mips--linux-gnu"
+
+@i = common global i32 0, align 4
+@b = common global i32 0, align 4
+@j = common global i32 0, align 4
+@k = common global i32 0, align 4
+@l = common global i32 0, align 4
+
+; Function Attrs: nounwind
+define void @t() #0 {
+entry:
+ store i32 -559023410, i32* @i, align 4
+ %0 = load i32* @b, align 4
+; no-load-relax: lw ${{[0-9]+}}, $CPI0_1 # 16 bit inst
+ %tobool = icmp ne i32 %0, 0
+ br i1 %tobool, label %if.then, label %if.else
+; no-load-relax: beqz ${{[0-9]+}}, $BB0_3
+; no-load-relax: lw ${{[0-9]+}}, %call16(foo)(${{[0-9]+}})
+; no-load-relax: b $BB0_4
+; no-load-relax: .align 2
+; no-load-relax: $CPI0_0:
+; no-load-relax: .4byte 3735943886
+; no-load-relax: $BB0_3:
+; no-load-relax: lw ${{[0-9]+}}, %call16(goo)(${{[0-9]+}})
+if.then: ; preds = %entry
+ call void bitcast (void (...)* @foo to void ()*)()
+ br label %if.end
+
+if.else: ; preds = %entry
+ call void bitcast (void (...)* @goo to void ()*)()
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ ret void
+}
+
+declare void @foo(...) #1
+
+declare void @goo(...) #1
+
+declare void @hoo(...) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.4 (gitosis@dmz-portal.mips.com:clang.git b310439121c875937d78cc49cc969bc1197fc025) (gitosis@dmz-portal.mips.com:llvm.git 7fc0ca9656ebec8dad61f72f5a5ddfb232c070fd)"}
diff --git a/test/CodeGen/Mips/const6.ll b/test/CodeGen/Mips/const6.ll
new file mode 100644
index 0000000..20cdc09
--- /dev/null
+++ b/test/CodeGen/Mips/const6.ll
@@ -0,0 +1,164 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands < %s | FileCheck %s -check-prefix=load-relax
+
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands -mips-constant-islands-no-load-relaxation < %s | FileCheck %s -check-prefix=no-load-relax
+
+; ModuleID = 'const6.c'
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-n32-S64"
+target triple = "mips--linux-gnu"
+
+@i = common global i32 0, align 4
+@j = common global i32 0, align 4
+@k = common global i32 0, align 4
+@l = common global i32 0, align 4
+@b = common global i32 0, align 4
+
+; Function Attrs: nounwind
+define void @t() #0 {
+entry:
+ store i32 -559023410, i32* @i, align 4
+; load-relax: lw ${{[0-9]+}}, $CPI0_0
+; load-relax: jrc $ra
+; load-relax: .align 2
+; load-relax: $CPI0_0:
+; load-relax: .4byte 3735943886
+; load-relax: .end t
+
+; no-load-relax: lw ${{[0-9]+}}, $CPI0_1 # 16 bit inst
+; no-load-relax: jalrc ${{[0-9]+}}
+; no-load-relax: b $BB0_2
+; no-load-relax: .align 2
+; no-load-relax: $CPI0_0:
+; no-load-relax: .4byte 3735943886
+; no-load-relax: $BB0_2:
+
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ call void bitcast (void (...)* @hoo to void ()*)()
+ ret void
+}
+
+declare void @hoo(...) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.4 (gitosis@dmz-portal.mips.com:clang.git b310439121c875937d78cc49cc969bc1197fc025) (gitosis@dmz-portal.mips.com:llvm.git 7fc0ca9656ebec8dad61f72f5a5ddfb232c070fd)"}
+
+
diff --git a/test/CodeGen/Mips/const6a.ll b/test/CodeGen/Mips/const6a.ll
new file mode 100644
index 0000000..8b402ac
--- /dev/null
+++ b/test/CodeGen/Mips/const6a.ll
@@ -0,0 +1,29 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands < %s | FileCheck %s -check-prefix=load-relax1
+
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands < %s | FileCheck %s -check-prefix=load-relax
+
+; ModuleID = 'const6a.c'
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-n32-S64"
+target triple = "mips--linux-gnu"
+
+@i = common global i32 0, align 4
+
+; Function Attrs: nounwind
+define void @t() #0 {
+entry:
+ store i32 -559023410, i32* @i, align 4
+; load-relax-NOT: lw ${{[0-9]+}}, $CPI0_0 # 16 bit inst
+; load-relax1: lw ${{[0-9]+}}, $CPI0_0
+; load-relax: jrc $ra
+; load-relax: .align 2
+; load-relax: $CPI0_0:
+; load-relax: .4byte 3735943886
+; load-relax: .end t
+ call void asm sideeffect ".space 40000", ""() #1, !srcloc !1
+ ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { nounwind }
+
+!1 = metadata !{i32 121}
diff --git a/test/CodeGen/Mips/ctlz.ll b/test/CodeGen/Mips/ctlz.ll
new file mode 100644
index 0000000..2ddb727
--- /dev/null
+++ b/test/CodeGen/Mips/ctlz.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32 -mattr=+mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=static
+
+@x = global i32 28912, align 4
+@y = common global i32 0, align 4
+
+
+; Function Attrs: nounwind
+define i32 @main() #0 {
+entry:
+ %retval = alloca i32, align 4
+ store i32 0, i32* %retval
+ %0 = load i32* @x, align 4
+ %1 = call i32 @llvm.ctlz.i32(i32 %0, i1 true)
+ store i32 %1, i32* @y, align 4
+ ret i32 0
+}
+
+; static: .end main
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.ctlz.i32(i32, i1) #1
+
+
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { nounwind readnone }
+
diff --git a/test/CodeGen/Mips/disable-tail-merge.ll b/test/CodeGen/Mips/disable-tail-merge.ll
new file mode 100644
index 0000000..b4c093a
--- /dev/null
+++ b/test/CodeGen/Mips/disable-tail-merge.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=mipsel < %s | FileCheck %s
+
+@g0 = common global i32 0, align 4
+@g1 = common global i32 0, align 4
+
+; CHECK: addiu ${{[0-9]+}}, ${{[0-9]+}}, 23
+; CHECK: addiu ${{[0-9]+}}, ${{[0-9]+}}, 23
+
+define i32 @test1(i32 %a) {
+entry:
+ %tobool = icmp eq i32 %a, 0
+ %0 = load i32* @g0, align 4
+ br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+ %add = add nsw i32 %0, 1
+ store i32 %add, i32* @g0, align 4
+ %1 = load i32* @g1, align 4
+ %add1 = add nsw i32 %1, 23
+ br label %if.end
+
+if.else:
+ %add2 = add nsw i32 %0, 11
+ store i32 %add2, i32* @g0, align 4
+ %2 = load i32* @g1, align 4
+ %add3 = add nsw i32 %2, 23
+ br label %if.end
+
+if.end:
+ %storemerge = phi i32 [ %add3, %if.else ], [ %add1, %if.then ]
+ store i32 %storemerge, i32* @g1, align 4
+ ret i32 %storemerge
+}
diff --git a/test/CodeGen/Mips/divrem.ll b/test/CodeGen/Mips/divrem.ll
index a983c46..b631c3b 100644
--- a/test/CodeGen/Mips/divrem.ll
+++ b/test/CodeGen/Mips/divrem.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=mips < %s | FileCheck %s -check-prefix=TRAP
+; RUN: llc -march=mips -verify-machineinstrs < %s |\
+; RUN: FileCheck %s -check-prefix=TRAP
; RUN: llc -march=mips -mno-check-zero-division < %s |\
; RUN: FileCheck %s -check-prefix=NOCHECK
@@ -11,6 +12,9 @@
; NOCHECK-NOT: teq
; NOCHECK: .end sdiv1
+@g0 = common global i32 0, align 4
+@g1 = common global i32 0, align 4
+
define i32 @sdiv1(i32 %a0, i32 %a1) nounwind readnone {
entry:
%div = sdiv i32 %a0, %a1
@@ -67,3 +71,11 @@ entry:
%div = udiv i32 %a0, %a1
ret i32 %div
}
+
+define i32 @killFlags() {
+entry:
+ %0 = load i32* @g0, align 4
+ %1 = load i32* @g1, align 4
+ %div = sdiv i32 %0, %1
+ ret i32 %div
+}
diff --git a/test/CodeGen/Mips/extins.ll b/test/CodeGen/Mips/extins.ll
index a164f70..efaeeea 100644
--- a/test/CodeGen/Mips/extins.ll
+++ b/test/CodeGen/Mips/extins.ll
@@ -1,8 +1,10 @@
-; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s
+; RUN: llc < %s -march=mips -mcpu=mips32r2 | FileCheck %s -check-prefix=32R2
+; RUN: llc < %s -march=mips -mcpu=mips16 | FileCheck %s -check-prefix=16
define i32 @ext0_5_9(i32 %s, i32 %pos, i32 %sz) nounwind readnone {
entry:
-; CHECK: ext ${{[0-9]+}}, $4, 5, 9
+; 32R2: ext ${{[0-9]+}}, $4, 5, 9
+; 16-NOT: ext ${{[0-9]+}}
%shr = lshr i32 %s, 5
%and = and i32 %shr, 511
ret i32 %and
@@ -10,7 +12,8 @@ entry:
define void @ins2_5_9(i32 %s, i32* nocapture %d) nounwind {
entry:
-; CHECK: ins ${{[0-9]+}}, $4, 5, 9
+; 32R2: ins ${{[0-9]+}}, $4, 5, 9
+; 16-NOT: ins ${{[0-9]+}}
%and = shl i32 %s, 5
%shl = and i32 %and, 16352
%tmp3 = load i32* %d, align 4
diff --git a/test/CodeGen/Mips/f16abs.ll b/test/CodeGen/Mips/f16abs.ll
new file mode 100644
index 0000000..928914f
--- /dev/null
+++ b/test/CodeGen/Mips/f16abs.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=static
+
+@y = global double -1.450000e+00, align 8
+@x = common global double 0.000000e+00, align 8
+
+@y1 = common global float 0.000000e+00, align 4
+@x1 = common global float 0.000000e+00, align 4
+
+
+
+; Function Attrs: nounwind optsize
+define i32 @main() #0 {
+entry:
+ %0 = load double* @y, align 8
+ %call = tail call double @fabs(double %0) #2
+ store double %call, double* @x, align 8
+; static-NOT: .ent __call_stub_fp_fabs
+; static-NOT: jal fabs
+ %1 = load float* @y1, align 4
+ %call2 = tail call float @fabsf(float %1) #2
+ store float %call2, float* @x1, align 4
+; static-NOT: .ent __call_stub_fp_fabsf
+; static-NOT: jal fabsf
+ ret i32 0
+}
+
+; Function Attrs: nounwind optsize readnone
+declare double @fabs(double) #1
+
+declare float @fabsf(float) #1
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { nounwind optsize readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #2 = { nounwind optsize readnone }
+
+
+
diff --git a/test/CodeGen/Mips/fixdfsf.ll b/test/CodeGen/Mips/fixdfsf.ll
new file mode 100644
index 0000000..b08eefd
--- /dev/null
+++ b/test/CodeGen/Mips/fixdfsf.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic < %s | FileCheck %s -check-prefix=pic1
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic < %s | FileCheck %s -check-prefix=pic2
+
+@x = common global double 0.000000e+00, align 8
+@y = common global i32 0, align 4
+
+; Function Attrs: nounwind optsize
+define void @foo() {
+entry:
+ %0 = load double* @x, align 8
+ %conv = fptoui double %0 to i32
+ store i32 %conv, i32* @y, align 4
+; pic1: lw ${{[0-9]+}}, %call16(__fixunsdfsi)(${{[0-9]+}})
+; pic2: lw ${{[0-9]+}}, %got(__mips16_call_stub_2)(${{[0-9]+}})
+ ret void
+}
+
+
diff --git a/test/CodeGen/Mips/fp16instrinsmc.ll b/test/CodeGen/Mips/fp16instrinsmc.ll
index 3c01d56..bb43d27 100644
--- a/test/CodeGen/Mips/fp16instrinsmc.ll
+++ b/test/CodeGen/Mips/fp16instrinsmc.ll
@@ -1,4 +1,5 @@
; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic < %s | FileCheck %s -check-prefix=pic
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static -mips32-function-mask=1010111 -mips-os16 < %s | FileCheck %s -check-prefix=fmask
@x = global float 1.500000e+00, align 4
@xn = global float -1.900000e+01, align 4
@@ -13,6 +14,14 @@
; Function Attrs: nounwind
define void @foo1() #0 {
+; fmask: .ent foo1
+; fmask: .set noreorder
+; fmask: .set nomacro
+; fmask: .set noat
+; fmask: .set at
+; fmask: .set macro
+; fmask: .set reorder
+; fmask: .end foo1
entry:
%0 = load float* @x, align 4
%1 = load float* @one, align 4
@@ -26,6 +35,9 @@ declare float @copysignf(float, float) #1
; Function Attrs: nounwind
define void @foo2() #0 {
+; fmask: .ent foo2
+; fmask: save {{.*}}
+; fmask: .end foo2
entry:
%0 = load float* @x, align 4
%1 = load float* @negone, align 4
@@ -37,6 +49,14 @@ entry:
; Function Attrs: nounwind
define void @foo3() #0 {
entry:
+; fmask: .ent foo3
+; fmask: .set noreorder
+; fmask: .set nomacro
+; fmask: .set noat
+; fmask: .set at
+; fmask: .set macro
+; fmask: .set reorder
+; fmask: .end foo3
%0 = load double* @xd, align 8
%1 = load float* @oned, align 4
%conv = fpext float %1 to double
@@ -51,6 +71,9 @@ declare double @copysign(double, double) #1
; Function Attrs: nounwind
define void @foo4() #0 {
entry:
+; fmask: .ent foo4
+; fmask: save {{.*}}
+; fmask: .end foo4
%0 = load double* @xd, align 8
%1 = load double* @negoned, align 8
%call = call double @copysign(double %0, double %1) #2
@@ -362,7 +385,7 @@ entry:
; Function Attrs: nounwind
declare double @exp2(double) #0
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
-attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }
diff --git a/test/CodeGen/Mips/fp16mix.ll b/test/CodeGen/Mips/fp16mix.ll
new file mode 100644
index 0000000..8d85099
--- /dev/null
+++ b/test/CodeGen/Mips/fp16mix.ll
@@ -0,0 +1,92 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static -mips32-function-mask=10 -mips-os16 < %s | FileCheck %s -check-prefix=fmask1
+
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static -mips32-function-mask=01 -mips-os16 < %s | FileCheck %s -check-prefix=fmask2
+
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static -mips32-function-mask=10. -mips-os16 < %s | FileCheck %s -check-prefix=fmask1nr
+
+; Function Attrs: nounwind optsize readnone
+define void @foo1() {
+entry:
+ ret void
+; fmask1: .ent foo1
+; fmask1: .set noreorder
+; fmask1: .set nomacro
+; fmask1: .set noat
+; fmask1: .set at
+; fmask1: .set macro
+; fmask1: .set reorder
+; fmask1: .end foo1
+; fmask2: .ent foo1
+; fmask2: save {{.*}}
+; fmask2: .end foo1
+; fmask1nr: .ent foo1
+; fmask1nr: .set noreorder
+; fmask1nr: .set nomacro
+; fmask1nr: .set noat
+; fmask1nr: .set at
+; fmask1nr: .set macro
+; fmask1nr: .set reorder
+; fmask1nr: .end foo1
+}
+
+; Function Attrs: nounwind optsize readnone
+define void @foo2() {
+entry:
+ ret void
+; fmask2: .ent foo2
+; fmask2: .set noreorder
+; fmask2: .set nomacro
+; fmask2: .set noat
+; fmask2: .set at
+; fmask2: .set macro
+; fmask2: .set reorder
+; fmask2: .end foo2
+; fmask1: .ent foo2
+; fmask1: save {{.*}}
+; fmask1: .end foo2
+; fmask1nr: .ent foo2
+; fmask1nr: save {{.*}}
+; fmask1nr: .end foo2
+}
+
+; Function Attrs: nounwind optsize readnone
+define void @foo3() {
+entry:
+ ret void
+; fmask1: .ent foo3
+; fmask1: .set noreorder
+; fmask1: .set nomacro
+; fmask1: .set noat
+; fmask1: .set at
+; fmask1: .set macro
+; fmask1: .set reorder
+; fmask1: .end foo3
+; fmask2: .ent foo3
+; fmask2: save {{.*}}
+; fmask2: .end foo3
+; fmask1r: .ent foo3
+; fmask1r: save {{.*}}
+; fmask1r: .end foo3
+}
+
+; Function Attrs: nounwind optsize readnone
+define void @foo4() {
+entry:
+ ret void
+; fmask2: .ent foo4
+; fmask2: .set noreorder
+; fmask2: .set nomacro
+; fmask2: .set noat
+; fmask2: .set at
+; fmask2: .set macro
+; fmask2: .set reorder
+; fmask2: .end foo4
+; fmask1: .ent foo4
+; fmask1: save {{.*}}
+; fmask1: .end foo4
+; fmask1nr: .ent foo4
+; fmask1nr: save {{.*}}
+; fmask1nr: .end foo4
+}
+
+
diff --git a/test/CodeGen/Mips/fpneeded.ll b/test/CodeGen/Mips/fpneeded.ll
index 623883a..dcdebb9 100644
--- a/test/CodeGen/Mips/fpneeded.ll
+++ b/test/CodeGen/Mips/fpneeded.ll
@@ -131,7 +131,7 @@ entry:
; 32: .set reorder
; 32: .end foo3
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
define void @vv() #0 {
entry:
diff --git a/test/CodeGen/Mips/fpnotneeded.ll b/test/CodeGen/Mips/fpnotneeded.ll
index dc2ec10..b4fab64 100644
--- a/test/CodeGen/Mips/fpnotneeded.ll
+++ b/test/CodeGen/Mips/fpnotneeded.ll
@@ -57,7 +57,7 @@ entry:
; 32: restore {{.+}}
; 32: .end foo
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
define float @fv() #0 {
diff --git a/test/CodeGen/Mips/fptr2.ll b/test/CodeGen/Mips/fptr2.ll
new file mode 100644
index 0000000..77028db
--- /dev/null
+++ b/test/CodeGen/Mips/fptr2.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=static < %s | FileCheck %s -check-prefix=static16
+
+; Function Attrs: nounwind
+define double @my_mul(double %a, double %b) #0 {
+entry:
+ %a.addr = alloca double, align 8
+ %b.addr = alloca double, align 8
+ store double %a, double* %a.addr, align 8
+ store double %b, double* %b.addr, align 8
+ %0 = load double* %a.addr, align 8
+ %1 = load double* %b.addr, align 8
+ %mul = fmul double %0, %1
+ ret double %mul
+}
+
+; static16: .ent __fn_stub_my_mul
+; static16: .set reorder
+; static16-NEXT: #NO_APP
+; static16: .end __fn_stub_my_mul
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
diff --git a/test/CodeGen/Mips/helloworld.ll b/test/CodeGen/Mips/helloworld.ll
index 83c88ae..058a041 100644
--- a/test/CodeGen/Mips/helloworld.ll
+++ b/test/CodeGen/Mips/helloworld.ll
@@ -1,11 +1,11 @@
-; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=C1
-; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=C2
-; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=PE
-; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=static -O3 < %s | FileCheck %s -check-prefix=ST1
-; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=static -O3 < %s | FileCheck %s -check-prefix=ST2
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=C1
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=C2
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=PE
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=static -O3 < %s | FileCheck %s -check-prefix=ST1
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=static -O3 < %s | FileCheck %s -check-prefix=ST2
;
-; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=SR
-; RUN: llc -march=mipsel -mcpu=mips32 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=SR32
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=SR
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=SR32
@.str = private unnamed_addr constant [13 x i8] c"hello world\0A\00", align 1
@@ -26,9 +26,11 @@ entry:
; SR32: .set nomacro
; SR32: .set noat
; SR: save $ra, $s0, $s1, $s2, [[FS:[0-9]+]]
-; PE: li $[[T1:[0-9]+]], %hi(_gp_disp)
-; PE: addiu $[[T2:[0-9]+]], $pc, %lo(_gp_disp)
-; PE: sll $[[T3:[0-9]+]], $[[T1]], 16
+; PE: .ent main
+; PE: .align 2
+; PE-NEXT: li $[[T1:[0-9]+]], %hi(_gp_disp)
+; PE-NEXT: addiu $[[T2:[0-9]+]], $pc, %lo(_gp_disp)
+; PE: sll $[[T3:[0-9]+]], $[[T1]], 16
; C1: lw ${{[0-9]+}}, %got($.str)(${{[0-9]+}})
; C2: lw ${{[0-9]+}}, %call16(printf)(${{[0-9]+}})
; C1: addiu ${{[0-9]+}}, %lo($.str)
diff --git a/test/CodeGen/Mips/hf16call32.ll b/test/CodeGen/Mips/hf16call32.ll
index 934cf06..461438e 100644
--- a/test/CodeGen/Mips/hf16call32.ll
+++ b/test/CodeGen/Mips/hf16call32.ll
@@ -1026,5 +1026,5 @@ declare { double, double } @dc_sf(float) #1
; stel: jr $18
; stel: .end __call_stub_fp_dc_sf
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/hf16call32_body.ll b/test/CodeGen/Mips/hf16call32_body.ll
index 793b771..34bae26 100644
--- a/test/CodeGen/Mips/hf16call32_body.ll
+++ b/test/CodeGen/Mips/hf16call32_body.ll
@@ -20,7 +20,7 @@ entry:
}
; stel: .section .mips16.fn.v_sf,"ax",@progbits
; stel: .ent __fn_stub_v_sf
-; stel: la $25, v_sf
+; stel: la $25,v_sf
; stel: mfc1 $4,$f12
; stel: jr $25
; stel: __fn_local_v_sf = v_sf
@@ -40,7 +40,7 @@ entry:
; stel: .section .mips16.fn.v_df,"ax",@progbits
; stel: .ent __fn_stub_v_df
-; stel: la $25, v_df
+; stel: la $25,v_df
; stel: mfc1 $4,$f12
; stel: mfc1 $5,$f13
; stel: jr $25
@@ -63,7 +63,7 @@ entry:
; stel: .section .mips16.fn.v_sf_sf,"ax",@progbits
; stel: .ent __fn_stub_v_sf_sf
-; stel: la $25, v_sf_sf
+; stel: la $25,v_sf_sf
; stel: mfc1 $4,$f12
; stel: mfc1 $5,$f14
; stel: jr $25
@@ -86,7 +86,7 @@ entry:
; stel: .section .mips16.fn.v_sf_df,"ax",@progbits
; stel: .ent __fn_stub_v_sf_df
-; stel: la $25, v_sf_df
+; stel: la $25,v_sf_df
; stel: mfc1 $4,$f12
; stel: mfc1 $6,$f14
; stel: mfc1 $7,$f15
@@ -110,7 +110,7 @@ entry:
; stel: .section .mips16.fn.v_df_sf,"ax",@progbits
; stel: .ent __fn_stub_v_df_sf
-; stel: la $25, v_df_sf
+; stel: la $25,v_df_sf
; stel: mfc1 $4,$f12
; stel: mfc1 $5,$f13
; stel: mfc1 $6,$f14
@@ -134,7 +134,7 @@ entry:
; stel: .section .mips16.fn.v_df_df,"ax",@progbits
; stel: .ent __fn_stub_v_df_df
-; stel: la $25, v_df_df
+; stel: la $25,v_df_df
; stel: mfc1 $4,$f12
; stel: mfc1 $5,$f13
; stel: mfc1 $6,$f14
@@ -164,7 +164,7 @@ entry:
; stel: .section .mips16.fn.sf_sf,"ax",@progbits
; stel: .ent __fn_stub_sf_sf
-; stel: la $25, sf_sf
+; stel: la $25,sf_sf
; stel: mfc1 $4,$f12
; stel: jr $25
; stel: __fn_local_sf_sf = sf_sf
@@ -184,7 +184,7 @@ entry:
; stel: .section .mips16.fn.sf_df,"ax",@progbits
; stel: .ent __fn_stub_sf_df
-; stel: la $25, sf_df
+; stel: la $25,sf_df
; stel: mfc1 $4,$f12
; stel: mfc1 $5,$f13
; stel: jr $25
@@ -208,7 +208,7 @@ entry:
; stel: .section .mips16.fn.sf_sf_sf,"ax",@progbits
; stel: .ent __fn_stub_sf_sf_sf
-; stel: la $25, sf_sf_sf
+; stel: la $25,sf_sf_sf
; stel: mfc1 $4,$f12
; stel: mfc1 $5,$f14
; stel: jr $25
@@ -232,7 +232,7 @@ entry:
; stel: .section .mips16.fn.sf_sf_df,"ax",@progbits
; stel: .ent __fn_stub_sf_sf_df
-; stel: la $25, sf_sf_df
+; stel: la $25,sf_sf_df
; stel: mfc1 $4,$f12
; stel: mfc1 $6,$f14
; stel: mfc1 $7,$f15
@@ -257,7 +257,7 @@ entry:
; stel: .section .mips16.fn.sf_df_sf,"ax",@progbits
; stel: .ent __fn_stub_sf_df_sf
-; stel: la $25, sf_df_sf
+; stel: la $25,sf_df_sf
; stel: mfc1 $4,$f12
; stel: mfc1 $5,$f13
; stel: mfc1 $6,$f14
@@ -282,7 +282,7 @@ entry:
; stel: .section .mips16.fn.sf_df_df,"ax",@progbits
; stel: .ent __fn_stub_sf_df_df
-; stel: la $25, sf_df_df
+; stel: la $25,sf_df_df
; stel: mfc1 $4,$f12
; stel: mfc1 $5,$f13
; stel: mfc1 $6,$f14
@@ -291,4 +291,4 @@ entry:
; stel: __fn_local_sf_df_df = sf_df_df
; stel: .end __fn_stub_sf_df_df
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/hf1_body.ll b/test/CodeGen/Mips/hf1_body.ll
new file mode 100644
index 0000000..b2cce92
--- /dev/null
+++ b/test/CodeGen/Mips/hf1_body.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=pic -soft-float -mips16-hard-float < %s | FileCheck %s -check-prefix=picfp16
+
+@x = external global float
+
+; Function Attrs: nounwind
+define void @v_sf(float %p) #0 {
+entry:
+ %p.addr = alloca float, align 4
+ store float %p, float* %p.addr, align 4
+ %0 = load float* %p.addr, align 4
+ store float %0, float* @x, align 4
+ ret void
+}
+; picfp16: .ent __fn_stub_v_sf
+; picfp16: .cpload $25
+; picfp16: .set reorder
+; picfp16: .reloc 0,R_MIPS_NONE,v_sf
+; picfp16: la $25,$__fn_local_v_sf
+; picfp16: mfc1 $4,$f12
+; picfp16: jr $25
+; picfp16: .end __fn_stub_v_sf
diff --git a/test/CodeGen/Mips/hfptrcall.ll b/test/CodeGen/Mips/hfptrcall.ll
index b1d36c0..25639da 100644
--- a/test/CodeGen/Mips/hfptrcall.ll
+++ b/test/CodeGen/Mips/hfptrcall.ll
@@ -118,8 +118,8 @@ entry:
declare i32 @printf(i8*, ...) #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="true" }
-attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="true" }
diff --git a/test/CodeGen/Mips/i32k.ll b/test/CodeGen/Mips/i32k.ll
index c6da8b1..f4dd1eb 100644
--- a/test/CodeGen/Mips/i32k.ll
+++ b/test/CodeGen/Mips/i32k.ll
@@ -1,16 +1,23 @@
-; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16a
-; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16b
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
@.str = private unnamed_addr constant [4 x i8] c"%i\0A\00", align 1
define i32 @main() nounwind {
entry:
%call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 1075344593) nounwind
-; 16a: li ${{[0-9]+}}, 29905
-; 16b: li ${{[0-9]+}}, 16408
+; 16: lw ${{[0-9]+}}, 1f
+; 16: b 2f
+; 16: .align 2
+; 16: 1: .word 1075344593
+; 16: 2:
+
%call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 -1075344593) nounwind
-; 16a: li ${{[0-9]+}}, 49127
-; 16b: li ${{[0-9]+}}, 35631
+
+; 16: lw ${{[0-9]+}}, 1f
+; 16: b 2f
+; 16: .align 2
+; 16: 1: .word -1075344593
+; 16: 2:
ret i32 0
}
diff --git a/test/CodeGen/Mips/i64arg.ll b/test/CodeGen/Mips/i64arg.ll
index 0b16424..5b2d135 100644
--- a/test/CodeGen/Mips/i64arg.ll
+++ b/test/CodeGen/Mips/i64arg.ll
@@ -2,18 +2,18 @@
define void @f1(i64 %ll1, float %f, i64 %ll, i32 %i, float %f2) nounwind {
entry:
-; CHECK: move $[[R1:[0-9]+]], $5
-; CHECK: move $[[R0:[0-9]+]], $4
-; CHECK: ori $6, ${{[0-9]+}}, 3855
-; CHECK: ori $7, ${{[0-9]+}}, 22136
-; CHECK: lw $25, %call16(ff1)
+; CHECK-DAG: lw $[[R2:[0-9]+]], 80($sp)
+; CHECK-DAG: lw $[[R3:[0-9]+]], 84($sp)
+; CHECK-DAG: move $[[R1:[0-9]+]], $5
+; CHECK-DAG: move $[[R0:[0-9]+]], $4
+; CHECK-DAG: ori $6, ${{[0-9]+}}, 3855
+; CHECK-DAG: ori $7, ${{[0-9]+}}, 22136
+; CHECK-DAG: lw $25, %call16(ff1)
; CHECK: jalr
tail call void @ff1(i32 %i, i64 1085102592623924856) nounwind
-; CHECK: lw $25, %call16(ff2)
-; CHECK: lw $[[R2:[0-9]+]], 80($sp)
-; CHECK: lw $[[R3:[0-9]+]], 84($sp)
-; CHECK: move $4, $[[R2]]
-; CHECK: move $5, $[[R3]]
+; CHECK-DAG: lw $25, %call16(ff2)
+; CHECK-DAG: move $4, $[[R2]]
+; CHECK-DAG: move $5, $[[R3]]
; CHECK: jalr $25
tail call void @ff2(i64 %ll, double 3.000000e+00) nounwind
%sub = add nsw i32 %i, -1
diff --git a/test/CodeGen/Mips/largeimmprinting.ll b/test/CodeGen/Mips/largeimmprinting.ll
index 1e96346..09fee3d 100644
--- a/test/CodeGen/Mips/largeimmprinting.ll
+++ b/test/CodeGen/Mips/largeimmprinting.ll
@@ -18,11 +18,11 @@ entry:
; 64: dsll $[[R0]], $[[R0]], 48
; 64: daddiu $[[R0]], $[[R0]], -1
; 64: dsll $[[R0]], $[[R0]], 16
-; 64: daddiu $[[R0]], $[[R0]], -48
+; 64: daddiu $[[R0]], $[[R0]], -32
; 64: daddu $sp, $sp, $[[R0]]
; 64: lui $[[R1:[0-9]+]], 1
; 64: daddu $[[R1]], $sp, $[[R1]]
-; 64: sd $ra, 40($[[R1]])
+; 64: sd $ra, 24($[[R1]])
%agg.tmp = alloca %struct.S1, align 1
%tmp = getelementptr inbounds %struct.S1* %agg.tmp, i32 0, i32 0, i32 0
diff --git a/test/CodeGen/Mips/lazy-binding.ll b/test/CodeGen/Mips/lazy-binding.ll
new file mode 100644
index 0000000..839155a
--- /dev/null
+++ b/test/CodeGen/Mips/lazy-binding.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=mipsel < %s | FileCheck %s
+
+; CHECK-LABEL: foo6:
+; CHECK: %while.body
+; CHECK: lw $25, %call16(foo2)(${{[0-9]+}})
+; CHECK: jalr $25
+; CHECK: %while.end
+
+define void @foo6(i32 %n) {
+entry:
+ %tobool1 = icmp eq i32 %n, 0
+ br i1 %tobool1, label %while.end, label %while.body
+
+while.body: ; preds = %entry, %while.body
+ %n.addr.02 = phi i32 [ %dec, %while.body ], [ %n, %entry ]
+ %dec = add nsw i32 %n.addr.02, -1
+ tail call void @foo2()
+ %tobool = icmp eq i32 %dec, 0
+ br i1 %tobool, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ ret void
+}
+
+declare void @foo2()
+
+; CHECK-LABEL: foo1:
+; CHECK: lw $25, %call16(foo2)(${{[0-9]+}})
+; CHECK: jalr $25
+; CHECK: lw $25, %call16(foo2)(${{[0-9]+}})
+; CHECK: jalr $25
+; CHECK: lw $25, %call16(foo2)(${{[0-9]+}})
+; CHECK: jalr $25
+
+define void @foo1() {
+entry:
+ tail call void @foo2()
+ tail call void @foo2()
+ tail call void @foo2()
+ ret void
+}
diff --git a/test/CodeGen/Mips/lit.local.cfg b/test/CodeGen/Mips/lit.local.cfg
index e157c54..1fa54b4 100644
--- a/test/CodeGen/Mips/lit.local.cfg
+++ b/test/CodeGen/Mips/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.test']
-
targets = set(config.root.targets_to_build.split())
if not 'Mips' in targets:
config.unsupported = True
diff --git a/test/CodeGen/Mips/longbranch.ll b/test/CodeGen/Mips/longbranch.ll
index 1a4f79c..af192d0 100644
--- a/test/CodeGen/Mips/longbranch.ll
+++ b/test/CodeGen/Mips/longbranch.ll
@@ -1,13 +1,17 @@
-; RUN: llc -march=mipsel -force-mips-long-branch < %s | FileCheck %s -check-prefix=O32
-; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 -force-mips-long-branch < %s | FileCheck %s -check-prefix=N64
+; RUN: llc -march=mipsel -force-mips-long-branch -disable-mips-delay-filler < %s | FileCheck %s -check-prefix=O32
+; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 -force-mips-long-branch -disable-mips-delay-filler < %s | FileCheck %s -check-prefix=N64
@g0 = external global i32
define void @foo1(i32 %s) nounwind {
entry:
+; O32: nop
+; O32: addiu $sp, $sp, -8
; O32: bal
; O32: lui $1, 0
; O32: addiu $1, $1, {{[0-9]+}}
+; N64: nop
+; N64: daddiu $sp, $sp, -16
; N64: lui $1, 0
; N64: daddiu $1, $1, 0
; N64: dsll $1, $1, 16
diff --git a/test/CodeGen/Mips/mips16_32_1.ll b/test/CodeGen/Mips/mips16_32_1.ll
index 6f4826e..e156641 100644
--- a/test/CodeGen/Mips/mips16_32_1.ll
+++ b/test/CodeGen/Mips/mips16_32_1.ll
@@ -11,4 +11,4 @@ entry:
; CHECK: save {{.+}}
; CHECK: restore {{.+}}
; CHECK: .end foo
-attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/mips16_32_10.ll b/test/CodeGen/Mips/mips16_32_10.ll
index 330dbfe..7c017b8 100644
--- a/test/CodeGen/Mips/mips16_32_10.ll
+++ b/test/CodeGen/Mips/mips16_32_10.ll
@@ -54,6 +54,6 @@ entry:
-attributes #0 = { nounwind "less-precise-fpmad"="false" "nomips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "nomips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "nomips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "nomips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/mips16_32_3.ll b/test/CodeGen/Mips/mips16_32_3.ll
index 8874a88..dd94ec1 100644
--- a/test/CodeGen/Mips/mips16_32_3.ll
+++ b/test/CodeGen/Mips/mips16_32_3.ll
@@ -65,6 +65,6 @@ entry:
; 32: .set reorder
; 32: .end main
-attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/mips16_32_4.ll b/test/CodeGen/Mips/mips16_32_4.ll
index cdaed6c..5e49071 100644
--- a/test/CodeGen/Mips/mips16_32_4.ll
+++ b/test/CodeGen/Mips/mips16_32_4.ll
@@ -60,6 +60,6 @@ entry:
; 32: .end main
-attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/mips16_32_5.ll b/test/CodeGen/Mips/mips16_32_5.ll
index 45e0bf4..17900a2 100644
--- a/test/CodeGen/Mips/mips16_32_5.ll
+++ b/test/CodeGen/Mips/mips16_32_5.ll
@@ -75,6 +75,6 @@ entry:
-attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "nomips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "nomips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/mips16_32_6.ll b/test/CodeGen/Mips/mips16_32_6.ll
index f4b8e7a..a77031a 100644
--- a/test/CodeGen/Mips/mips16_32_6.ll
+++ b/test/CodeGen/Mips/mips16_32_6.ll
@@ -81,6 +81,6 @@ entry:
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "nomips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "nomips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/mips16_32_7.ll b/test/CodeGen/Mips/mips16_32_7.ll
index f8726ea..895b5d4 100644
--- a/test/CodeGen/Mips/mips16_32_7.ll
+++ b/test/CodeGen/Mips/mips16_32_7.ll
@@ -71,6 +71,6 @@ entry:
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/mips16_32_8.ll b/test/CodeGen/Mips/mips16_32_8.ll
index e51f296..4152d68 100644
--- a/test/CodeGen/Mips/mips16_32_8.ll
+++ b/test/CodeGen/Mips/mips16_32_8.ll
@@ -68,7 +68,7 @@ entry:
; 32: .set reorder
; 32: .end main
-attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/mips16_32_9.ll b/test/CodeGen/Mips/mips16_32_9.ll
index f5ff368..c9b494f 100644
--- a/test/CodeGen/Mips/mips16_32_9.ll
+++ b/test/CodeGen/Mips/mips16_32_9.ll
@@ -46,6 +46,6 @@ entry:
-attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Mips/mips64instrs.ll b/test/CodeGen/Mips/mips64instrs.ll
index 7b06c2d..2894d69 100644
--- a/test/CodeGen/Mips/mips64instrs.ll
+++ b/test/CodeGen/Mips/mips64instrs.ll
@@ -1,4 +1,7 @@
-; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s
+; RUN: llc -march=mips64el -mcpu=mips64 -verify-machineinstrs < %s | FileCheck %s
+
+@gll0 = common global i64 0, align 8
+@gll1 = common global i64 0, align 8
define i64 @f0(i64 %a0, i64 %a1) nounwind readnone {
entry:
@@ -90,17 +93,21 @@ entry:
; CHECK: ddiv $zero, ${{[0-9]+}}, $[[R0:[0-9]+]]
; CHECK: teq $[[R0]], $zero, 7
; CHECK: mflo
- %div = sdiv i64 %a, %b
+ %0 = load i64* @gll0, align 8
+ %1 = load i64* @gll1, align 8
+ %div = sdiv i64 %0, %1
ret i64 %div
}
-define i64 @f15(i64 %a, i64 %b) nounwind readnone {
+define i64 @f15() nounwind readnone {
entry:
; CHECK-LABEL: f15:
; CHECK: ddivu $zero, ${{[0-9]+}}, $[[R0:[0-9]+]]
; CHECK: teq $[[R0]], $zero, 7
; CHECK: mflo
- %div = udiv i64 %a, %b
+ %0 = load i64* @gll0, align 8
+ %1 = load i64* @gll1, align 8
+ %div = udiv i64 %0, %1
ret i64 %div
}
@@ -148,4 +155,3 @@ entry:
%neg = xor i64 %or, -1
ret i64 %neg
}
-
diff --git a/test/CodeGen/Mips/mno-ldc1-sdc1.ll b/test/CodeGen/Mips/mno-ldc1-sdc1.ll
index be9d0b6..f4854f8 100644
--- a/test/CodeGen/Mips/mno-ldc1-sdc1.ll
+++ b/test/CodeGen/Mips/mno-ldc1-sdc1.ll
@@ -1,22 +1,31 @@
-; RUN: llc -march=mipsel -relocation-model=pic -mno-ldc1-sdc1 < %s | \
-; RUN: FileCheck %s -check-prefix=LE-PIC
+; RUN: llc -march=mipsel -relocation-model=pic -mno-ldc1-sdc1 -mcpu=mips32r2 \
+; RUN: < %s | FileCheck %s -check-prefix=LE-PIC
; RUN: llc -march=mipsel -relocation-model=static -mno-ldc1-sdc1 < %s | \
; RUN: FileCheck %s -check-prefix=LE-STATIC
; RUN: llc -march=mips -relocation-model=pic -mno-ldc1-sdc1 < %s | \
; RUN: FileCheck %s -check-prefix=BE-PIC
-; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=CHECK-LDC1-SDC1
+; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | \
+; RUN: FileCheck %s -check-prefix=CHECK-LDC1-SDC1
@g0 = common global double 0.000000e+00, align 8
; LE-PIC-LABEL: test_ldc1:
-; LE-PIC: lwc1 $f0, 0(${{[0-9]+}})
-; LE-PIC: lwc1 $f1, 4(${{[0-9]+}})
+; LE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; LE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; LE-PIC-DAG: mtc1 $[[R0]], $f0
+; LE-PIC-DAG: mtc1 $[[R1]], $f1
; LE-STATIC-LABEL: test_ldc1:
-; LE-STATIC: lwc1 $f0, %lo(g0)(${{[0-9]+}})
-; LE-STATIC: lwc1 $f1, %lo(g0+4)(${{[0-9]+}})
+; LE-STATIC-DAG: lui $[[R0:[0-9]+]], %hi(g0)
+; LE-STATIC-DAG: lw $[[R1:[0-9]+]], %lo(g0)($[[R0]])
+; LE-STATIC-DAG: addiu $[[R2:[0-9]+]], $[[R0]], %lo(g0)
+; LE-STATIC-DAG: lw $[[R3:[0-9]+]], 4($[[R2]])
+; LE-STATIC-DAG: mtc1 $[[R1]], $f0
+; LE-STATIC-DAG: mtc1 $[[R3]], $f1
; BE-PIC-LABEL: test_ldc1:
-; BE-PIC: lwc1 $f1, 0(${{[0-9]+}})
-; BE-PIC: lwc1 $f0, 4(${{[0-9]+}})
+; BE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; BE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; BE-PIC-DAG: mtc1 $[[R1]], $f0
+; BE-PIC-DAG: mtc1 $[[R0]], $f1
; CHECK-LDC1-SDC1-LABEL: test_ldc1:
; CHECK-LDC1-SDC1: ldc1 $f{{[0-9]+}}
@@ -27,14 +36,22 @@ entry:
}
; LE-PIC-LABEL: test_sdc1:
-; LE-PIC: swc1 $f12, 0(${{[0-9]+}})
-; LE-PIC: swc1 $f13, 4(${{[0-9]+}})
+; LE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; LE-PIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; LE-PIC-DAG: sw $[[R0]], 0(${{[0-9]+}})
+; LE-PIC-DAG: sw $[[R1]], 4(${{[0-9]+}})
; LE-STATIC-LABEL: test_sdc1:
-; LE-STATIC: swc1 $f12, %lo(g0)(${{[0-9]+}})
-; LE-STATIC: swc1 $f13, %lo(g0+4)(${{[0-9]+}})
+; LE-STATIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; LE-STATIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; LE-STATIC-DAG: lui $[[R2:[0-9]+]], %hi(g0)
+; LE-STATIC-DAG: sw $[[R0]], %lo(g0)($[[R2]])
+; LE-STATIC-DAG: addiu $[[R3:[0-9]+]], $[[R2]], %lo(g0)
+; LE-STATIC-DAG: sw $[[R1]], 4($[[R3]])
; BE-PIC-LABEL: test_sdc1:
-; BE-PIC: swc1 $f13, 0(${{[0-9]+}})
-; BE-PIC: swc1 $f12, 4(${{[0-9]+}})
+; BE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; BE-PIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; BE-PIC-DAG: sw $[[R1]], 0(${{[0-9]+}})
+; BE-PIC-DAG: sw $[[R0]], 4(${{[0-9]+}})
; CHECK-LDC1-SDC1-LABEL: test_sdc1:
; CHECK-LDC1-SDC1: sdc1 $f{{[0-9]+}}
@@ -43,3 +60,34 @@ entry:
store double %a, double* @g0, align 8
ret void
}
+
+
+; LE-PIC-LABEL: test_ldxc1:
+; LE-PIC-DAG: lw $[[R0:[0-9]+]], 0(${{[0-9]+}})
+; LE-PIC-DAG: lw $[[R1:[0-9]+]], 4(${{[0-9]+}})
+; LE-PIC-DAG: mtc1 $[[R0]], $f0
+; LE-PIC-DAG: mtc1 $[[R1]], $f1
+; CHECK-LDC1-SDC1-LABEL: test_ldxc1:
+; CHECK-LDC1-SDC1: ldxc1 $f{{[0-9]+}}
+
+define double @test_ldxc1(double* nocapture readonly %a, i32 %i) {
+entry:
+ %arrayidx = getelementptr inbounds double* %a, i32 %i
+ %0 = load double* %arrayidx, align 8
+ ret double %0
+}
+
+; LE-PIC-LABEL: test_sdxc1:
+; LE-PIC-DAG: mfc1 $[[R0:[0-9]+]], $f12
+; LE-PIC-DAG: mfc1 $[[R1:[0-9]+]], $f13
+; LE-PIC-DAG: sw $[[R0]], 0(${{[0-9]+}})
+; LE-PIC-DAG: sw $[[R1]], 4(${{[0-9]+}})
+; CHECK-LDC1-SDC1-LABEL: test_sdxc1:
+; CHECK-LDC1-SDC1: sdxc1 $f{{[0-9]+}}
+
+define void @test_sdxc1(double %b, double* nocapture %a, i32 %i) {
+entry:
+ %arrayidx = getelementptr inbounds double* %a, i32 %i
+ store double %b, double* %arrayidx, align 8
+ ret void
+}
diff --git a/test/CodeGen/Mips/msa/2r.ll b/test/CodeGen/Mips/msa/2r.ll
new file mode 100644
index 0000000..da35ad8
--- /dev/null
+++ b/test/CodeGen/Mips/msa/2r.ll
@@ -0,0 +1,257 @@
+; Test the MSA intrinsics that are encoded with the 2R instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_nloc_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_nloc_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_nloc_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_nloc_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.nloc.b(<16 x i8> %0)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_nloc_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.nloc.b(<16 x i8>) nounwind
+
+; CHECK: llvm_mips_nloc_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_nloc_b_ARG1)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: nloc.b [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_nloc_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_nloc_b_test
+;
+@llvm_mips_nloc_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_nloc_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_nloc_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_nloc_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.nloc.h(<8 x i16> %0)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_nloc_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.nloc.h(<8 x i16>) nounwind
+
+; CHECK: llvm_mips_nloc_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_nloc_h_ARG1)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: nloc.h [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_nloc_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_nloc_h_test
+;
+@llvm_mips_nloc_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_nloc_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_nloc_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_nloc_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.nloc.w(<4 x i32> %0)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_nloc_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.nloc.w(<4 x i32>) nounwind
+
+; CHECK: llvm_mips_nloc_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_nloc_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: nloc.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_nloc_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_nloc_w_test
+;
+@llvm_mips_nloc_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_nloc_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_nloc_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_nloc_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.nloc.d(<2 x i64> %0)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_nloc_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.nloc.d(<2 x i64>) nounwind
+
+; CHECK: llvm_mips_nloc_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_nloc_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: nloc.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_nloc_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_nloc_d_test
+;
+@llvm_mips_nlzc_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_nlzc_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_nlzc_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_nlzc_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.nlzc.b(<16 x i8> %0)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_nlzc_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.nlzc.b(<16 x i8>) nounwind
+
+; CHECK: llvm_mips_nlzc_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_nlzc_b_ARG1)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: nlzc.b [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_nlzc_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_nlzc_b_test
+;
+@llvm_mips_nlzc_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_nlzc_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_nlzc_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_nlzc_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.nlzc.h(<8 x i16> %0)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_nlzc_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.nlzc.h(<8 x i16>) nounwind
+
+; CHECK: llvm_mips_nlzc_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_nlzc_h_ARG1)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: nlzc.h [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_nlzc_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_nlzc_h_test
+;
+@llvm_mips_nlzc_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_nlzc_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_nlzc_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_nlzc_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.nlzc.w(<4 x i32> %0)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_nlzc_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.nlzc.w(<4 x i32>) nounwind
+
+; CHECK: llvm_mips_nlzc_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_nlzc_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: nlzc.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_nlzc_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_nlzc_w_test
+;
+@llvm_mips_nlzc_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_nlzc_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_nlzc_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_nlzc_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.nlzc.d(<2 x i64> %0)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_nlzc_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.nlzc.d(<2 x i64>) nounwind
+
+; CHECK: llvm_mips_nlzc_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_nlzc_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: nlzc.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_nlzc_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_nlzc_d_test
+;
+@llvm_mips_pcnt_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_pcnt_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_pcnt_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_pcnt_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.pcnt.b(<16 x i8> %0)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_pcnt_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.pcnt.b(<16 x i8>) nounwind
+
+; CHECK: llvm_mips_pcnt_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_pcnt_b_ARG1)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: pcnt.b [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_pcnt_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_pcnt_b_test
+;
+@llvm_mips_pcnt_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_pcnt_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_pcnt_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_pcnt_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.pcnt.h(<8 x i16> %0)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_pcnt_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.pcnt.h(<8 x i16>) nounwind
+
+; CHECK: llvm_mips_pcnt_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_pcnt_h_ARG1)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: pcnt.h [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_pcnt_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_pcnt_h_test
+;
+@llvm_mips_pcnt_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_pcnt_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_pcnt_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_pcnt_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.pcnt.w(<4 x i32> %0)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_pcnt_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.pcnt.w(<4 x i32>) nounwind
+
+; CHECK: llvm_mips_pcnt_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_pcnt_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: pcnt.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_pcnt_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_pcnt_w_test
+;
+@llvm_mips_pcnt_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_pcnt_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_pcnt_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_pcnt_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.pcnt.d(<2 x i64> %0)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_pcnt_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.pcnt.d(<2 x i64>) nounwind
+
+; CHECK: llvm_mips_pcnt_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_pcnt_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: pcnt.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_pcnt_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_pcnt_d_test
+;
diff --git a/test/CodeGen/Mips/msa/2r_vector_scalar.ll b/test/CodeGen/Mips/msa/2r_vector_scalar.ll
new file mode 100644
index 0000000..6f6e1b9
--- /dev/null
+++ b/test/CodeGen/Mips/msa/2r_vector_scalar.ll
@@ -0,0 +1,87 @@
+; Test the MSA intrinsics that are encoded with the 2R instruction format and
+; convert scalars to vectors.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_fill_b_ARG1 = global i32 23, align 16
+@llvm_mips_fill_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_fill_b_test() nounwind {
+entry:
+ %0 = load i32* @llvm_mips_fill_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.fill.b(i32 %0)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_fill_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.fill.b(i32) nounwind
+
+; CHECK: llvm_mips_fill_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]],
+; CHECK-DAG: fill.b [[R2:\$w[0-9]+]], [[R1]]
+; CHECK-DAG: st.b [[R2]],
+; CHECK: .size llvm_mips_fill_b_test
+;
+@llvm_mips_fill_h_ARG1 = global i32 23, align 16
+@llvm_mips_fill_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_fill_h_test() nounwind {
+entry:
+ %0 = load i32* @llvm_mips_fill_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.fill.h(i32 %0)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_fill_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.fill.h(i32) nounwind
+
+; CHECK: llvm_mips_fill_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]],
+; CHECK-DAG: fill.h [[R2:\$w[0-9]+]], [[R1]]
+; CHECK-DAG: st.h [[R2]],
+; CHECK: .size llvm_mips_fill_h_test
+;
+@llvm_mips_fill_w_ARG1 = global i32 23, align 16
+@llvm_mips_fill_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fill_w_test() nounwind {
+entry:
+ %0 = load i32* @llvm_mips_fill_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.fill.w(i32 %0)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_fill_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fill.w(i32) nounwind
+
+; CHECK: llvm_mips_fill_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]],
+; CHECK-DAG: fill.w [[R2:\$w[0-9]+]], [[R1]]
+; CHECK-DAG: st.w [[R2]],
+; CHECK: .size llvm_mips_fill_w_test
+;
+@llvm_mips_fill_d_ARG1 = global i64 23, align 16
+@llvm_mips_fill_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fill_d_test() nounwind {
+entry:
+ %0 = load i64* @llvm_mips_fill_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.fill.d(i64 %0)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_fill_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fill.d(i64) nounwind
+
+; CHECK: llvm_mips_fill_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], 0(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], 4(
+; CHECK-DAG: ldi.b [[R3:\$w[0-9]+]], 0
+; CHECK-DAG: insert.w [[R3]][0], [[R1]]
+; CHECK-DAG: insert.w [[R3]][1], [[R2]]
+; CHECK-DAG: insert.w [[R3]][2], [[R1]]
+; CHECK-DAG: insert.w [[R3]][3], [[R2]]
+; CHECK-DAG: st.w [[R3]],
+; CHECK: .size llvm_mips_fill_d_test
+;
diff --git a/test/CodeGen/Mips/msa/2rf.ll b/test/CodeGen/Mips/msa/2rf.ll
new file mode 100644
index 0000000..b361ef5
--- /dev/null
+++ b/test/CodeGen/Mips/msa/2rf.ll
@@ -0,0 +1,323 @@
+; Test the MSA intrinsics that are encoded with the 2RF instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_flog2_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_flog2_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_flog2_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_flog2_w_ARG1
+ %1 = tail call <4 x float> @llvm.mips.flog2.w(<4 x float> %0)
+ store <4 x float> %1, <4 x float>* @llvm_mips_flog2_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.mips.flog2.w(<4 x float>) nounwind
+
+; CHECK: llvm_mips_flog2_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_flog2_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: flog2.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_flog2_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_flog2_w_test
+;
+@llvm_mips_flog2_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_flog2_d_RES = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_flog2_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_flog2_d_ARG1
+ %1 = tail call <2 x double> @llvm.mips.flog2.d(<2 x double> %0)
+ store <2 x double> %1, <2 x double>* @llvm_mips_flog2_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.mips.flog2.d(<2 x double>) nounwind
+
+; CHECK: llvm_mips_flog2_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_flog2_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: flog2.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_flog2_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_flog2_d_test
+
+define void @flog2_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_flog2_w_ARG1
+ %1 = tail call <4 x float> @llvm.log2.v4f32(<4 x float> %0)
+ store <4 x float> %1, <4 x float>* @llvm_mips_flog2_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.log2.v4f32(<4 x float> %val)
+
+; CHECK: flog2_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_flog2_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: flog2.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_flog2_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size flog2_w_test
+
+define void @flog2_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_flog2_d_ARG1
+ %1 = tail call <2 x double> @llvm.log2.v2f64(<2 x double> %0)
+ store <2 x double> %1, <2 x double>* @llvm_mips_flog2_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.log2.v2f64(<2 x double> %val)
+
+; CHECK: flog2_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_flog2_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: flog2.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_flog2_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size flog2_d_test
+;
+@llvm_mips_frint_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_frint_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_frint_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_frint_w_ARG1
+ %1 = tail call <4 x float> @llvm.mips.frint.w(<4 x float> %0)
+ store <4 x float> %1, <4 x float>* @llvm_mips_frint_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.mips.frint.w(<4 x float>) nounwind
+
+; CHECK: llvm_mips_frint_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_frint_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: frint.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_frint_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_frint_w_test
+;
+@llvm_mips_frint_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_frint_d_RES = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_frint_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_frint_d_ARG1
+ %1 = tail call <2 x double> @llvm.mips.frint.d(<2 x double> %0)
+ store <2 x double> %1, <2 x double>* @llvm_mips_frint_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.mips.frint.d(<2 x double>) nounwind
+
+; CHECK: llvm_mips_frint_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_frint_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: frint.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_frint_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_frint_d_test
+
+define void @frint_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_frint_w_ARG1
+ %1 = tail call <4 x float> @llvm.rint.v4f32(<4 x float> %0)
+ store <4 x float> %1, <4 x float>* @llvm_mips_frint_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.rint.v4f32(<4 x float>) nounwind
+
+; CHECK: frint_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_frint_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: frint.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_frint_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size frint_w_test
+
+define void @frint_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_frint_d_ARG1
+ %1 = tail call <2 x double> @llvm.rint.v2f64(<2 x double> %0)
+ store <2 x double> %1, <2 x double>* @llvm_mips_frint_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.rint.v2f64(<2 x double>) nounwind
+
+; CHECK: frint_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_frint_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: frint.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_frint_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size frint_d_test
+;
+@llvm_mips_frcp_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_frcp_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_frcp_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_frcp_w_ARG1
+ %1 = tail call <4 x float> @llvm.mips.frcp.w(<4 x float> %0)
+ store <4 x float> %1, <4 x float>* @llvm_mips_frcp_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.mips.frcp.w(<4 x float>) nounwind
+
+; CHECK: llvm_mips_frcp_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_frcp_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: frcp.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_frcp_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_frcp_w_test
+;
+@llvm_mips_frcp_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_frcp_d_RES = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_frcp_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_frcp_d_ARG1
+ %1 = tail call <2 x double> @llvm.mips.frcp.d(<2 x double> %0)
+ store <2 x double> %1, <2 x double>* @llvm_mips_frcp_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.mips.frcp.d(<2 x double>) nounwind
+
+; CHECK: llvm_mips_frcp_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_frcp_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: frcp.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_frcp_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_frcp_d_test
+;
+@llvm_mips_frsqrt_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_frsqrt_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_frsqrt_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_frsqrt_w_ARG1
+ %1 = tail call <4 x float> @llvm.mips.frsqrt.w(<4 x float> %0)
+ store <4 x float> %1, <4 x float>* @llvm_mips_frsqrt_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.mips.frsqrt.w(<4 x float>) nounwind
+
+; CHECK: llvm_mips_frsqrt_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_frsqrt_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: frsqrt.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_frsqrt_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_frsqrt_w_test
+;
+@llvm_mips_frsqrt_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_frsqrt_d_RES = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_frsqrt_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_frsqrt_d_ARG1
+ %1 = tail call <2 x double> @llvm.mips.frsqrt.d(<2 x double> %0)
+ store <2 x double> %1, <2 x double>* @llvm_mips_frsqrt_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.mips.frsqrt.d(<2 x double>) nounwind
+
+; CHECK: llvm_mips_frsqrt_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_frsqrt_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: frsqrt.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_frsqrt_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_frsqrt_d_test
+;
+@llvm_mips_fsqrt_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fsqrt_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fsqrt_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fsqrt_w_ARG1
+ %1 = tail call <4 x float> @llvm.mips.fsqrt.w(<4 x float> %0)
+ store <4 x float> %1, <4 x float>* @llvm_mips_fsqrt_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.mips.fsqrt.w(<4 x float>) nounwind
+
+; CHECK: llvm_mips_fsqrt_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_fsqrt_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: fsqrt.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_fsqrt_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_fsqrt_w_test
+;
+@llvm_mips_fsqrt_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fsqrt_d_RES = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fsqrt_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fsqrt_d_ARG1
+ %1 = tail call <2 x double> @llvm.mips.fsqrt.d(<2 x double> %0)
+ store <2 x double> %1, <2 x double>* @llvm_mips_fsqrt_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.mips.fsqrt.d(<2 x double>) nounwind
+
+; CHECK: llvm_mips_fsqrt_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_fsqrt_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: fsqrt.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_fsqrt_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_fsqrt_d_test
+
+define void @fsqrt_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fsqrt_w_ARG1
+ %1 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %0)
+ store <4 x float> %1, <4 x float>* @llvm_mips_fsqrt_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) nounwind
+
+; CHECK: fsqrt_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_fsqrt_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: fsqrt.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_fsqrt_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size fsqrt_w_test
+
+define void @fsqrt_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fsqrt_d_ARG1
+ %1 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %0)
+ store <2 x double> %1, <2 x double>* @llvm_mips_fsqrt_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) nounwind
+
+; CHECK: fsqrt_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_fsqrt_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: fsqrt.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_fsqrt_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size fsqrt_d_test
+;
diff --git a/test/CodeGen/Mips/msa/2rf_exup.ll b/test/CodeGen/Mips/msa/2rf_exup.ll
new file mode 100644
index 0000000..8d7cc36
--- /dev/null
+++ b/test/CodeGen/Mips/msa/2rf_exup.ll
@@ -0,0 +1,82 @@
+; Test the MSA floating point conversion intrinsics (e.g. float->double) that
+; are encoded with the 2RF instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_fexupl_w_ARG1 = global <8 x half> <half 0.000000e+00, half 1.000000e+00, half 2.000000e+00, half 3.000000e+00, half 4.000000e+00, half 5.000000e+00, half 6.000000e+00, half 7.000000e+00>, align 16
+@llvm_mips_fexupl_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fexupl_w_test() nounwind {
+entry:
+ %0 = load <8 x half>* @llvm_mips_fexupl_w_ARG1
+ %1 = tail call <4 x float> @llvm.mips.fexupl.w(<8 x half> %0)
+ store <4 x float> %1, <4 x float>* @llvm_mips_fexupl_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.mips.fexupl.w(<8 x half>) nounwind
+
+; CHECK: llvm_mips_fexupl_w_test:
+; CHECK: ld.h
+; CHECK: fexupl.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fexupl_w_test
+;
+@llvm_mips_fexupl_d_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fexupl_d_RES = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fexupl_d_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fexupl_d_ARG1
+ %1 = tail call <2 x double> @llvm.mips.fexupl.d(<4 x float> %0)
+ store <2 x double> %1, <2 x double>* @llvm_mips_fexupl_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.mips.fexupl.d(<4 x float>) nounwind
+
+; CHECK: llvm_mips_fexupl_d_test:
+; CHECK: ld.w
+; CHECK: fexupl.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fexupl_d_test
+;
+@llvm_mips_fexupr_w_ARG1 = global <8 x half> <half 0.000000e+00, half 1.000000e+00, half 2.000000e+00, half 3.000000e+00, half 4.000000e+00, half 5.000000e+00, half 6.000000e+00, half 7.000000e+00>, align 16
+@llvm_mips_fexupr_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fexupr_w_test() nounwind {
+entry:
+ %0 = load <8 x half>* @llvm_mips_fexupr_w_ARG1
+ %1 = tail call <4 x float> @llvm.mips.fexupr.w(<8 x half> %0)
+ store <4 x float> %1, <4 x float>* @llvm_mips_fexupr_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.mips.fexupr.w(<8 x half>) nounwind
+
+; CHECK: llvm_mips_fexupr_w_test:
+; CHECK: ld.h
+; CHECK: fexupr.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fexupr_w_test
+;
+@llvm_mips_fexupr_d_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fexupr_d_RES = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fexupr_d_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fexupr_d_ARG1
+ %1 = tail call <2 x double> @llvm.mips.fexupr.d(<4 x float> %0)
+ store <2 x double> %1, <2 x double>* @llvm_mips_fexupr_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.mips.fexupr.d(<4 x float>) nounwind
+
+; CHECK: llvm_mips_fexupr_d_test:
+; CHECK: ld.w
+; CHECK: fexupr.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fexupr_d_test
+;
diff --git a/test/CodeGen/Mips/msa/2rf_float_int.ll b/test/CodeGen/Mips/msa/2rf_float_int.ll
new file mode 100644
index 0000000..3b5dfda
--- /dev/null
+++ b/test/CodeGen/Mips/msa/2rf_float_int.ll
@@ -0,0 +1,90 @@
+; Test the MSA integer to floating point conversion intrinsics that are encoded
+; with the 2RF instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_ffint_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ffint_s_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_ffint_s_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_ffint_s_w_ARG1
+ %1 = tail call <4 x float> @llvm.mips.ffint.s.w(<4 x i32> %0)
+ store <4 x float> %1, <4 x float>* @llvm_mips_ffint_s_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.mips.ffint.s.w(<4 x i32>) nounwind
+
+; CHECK: llvm_mips_ffint_s_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ffint_s_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ffint_s.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ffint_s_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ffint_s_w_test
+;
+@llvm_mips_ffint_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_ffint_s_d_RES = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_ffint_s_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_ffint_s_d_ARG1
+ %1 = tail call <2 x double> @llvm.mips.ffint.s.d(<2 x i64> %0)
+ store <2 x double> %1, <2 x double>* @llvm_mips_ffint_s_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.mips.ffint.s.d(<2 x i64>) nounwind
+
+; CHECK: llvm_mips_ffint_s_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ffint_s_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ffint_s.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ffint_s_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ffint_s_d_test
+;
+@llvm_mips_ffint_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ffint_u_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_ffint_u_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_ffint_u_w_ARG1
+ %1 = tail call <4 x float> @llvm.mips.ffint.u.w(<4 x i32> %0)
+ store <4 x float> %1, <4 x float>* @llvm_mips_ffint_u_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.mips.ffint.u.w(<4 x i32>) nounwind
+
+; CHECK: llvm_mips_ffint_u_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ffint_u_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ffint_u.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ffint_u_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ffint_u_w_test
+;
+@llvm_mips_ffint_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_ffint_u_d_RES = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_ffint_u_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_ffint_u_d_ARG1
+ %1 = tail call <2 x double> @llvm.mips.ffint.u.d(<2 x i64> %0)
+ store <2 x double> %1, <2 x double>* @llvm_mips_ffint_u_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.mips.ffint.u.d(<2 x i64>) nounwind
+
+; CHECK: llvm_mips_ffint_u_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ffint_u_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ffint_u.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ffint_u_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ffint_u_d_test
+;
diff --git a/test/CodeGen/Mips/msa/2rf_fq.ll b/test/CodeGen/Mips/msa/2rf_fq.ll
new file mode 100644
index 0000000..021dd93
--- /dev/null
+++ b/test/CodeGen/Mips/msa/2rf_fq.ll
@@ -0,0 +1,82 @@
+; Test the MSA fixed-point to floating point conversion intrinsics that are
+; encoded with the 2RF instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_ffql_w_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_ffql_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_ffql_w_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_ffql_w_ARG1
+ %1 = tail call <4 x float> @llvm.mips.ffql.w(<8 x i16> %0)
+ store <4 x float> %1, <4 x float>* @llvm_mips_ffql_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.mips.ffql.w(<8 x i16>) nounwind
+
+; CHECK: llvm_mips_ffql_w_test:
+; CHECK: ld.h
+; CHECK: ffql.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_ffql_w_test
+;
+@llvm_mips_ffql_d_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ffql_d_RES = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_ffql_d_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_ffql_d_ARG1
+ %1 = tail call <2 x double> @llvm.mips.ffql.d(<4 x i32> %0)
+ store <2 x double> %1, <2 x double>* @llvm_mips_ffql_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.mips.ffql.d(<4 x i32>) nounwind
+
+; CHECK: llvm_mips_ffql_d_test:
+; CHECK: ld.w
+; CHECK: ffql.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_ffql_d_test
+;
+@llvm_mips_ffqr_w_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_ffqr_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_ffqr_w_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_ffqr_w_ARG1
+ %1 = tail call <4 x float> @llvm.mips.ffqr.w(<8 x i16> %0)
+ store <4 x float> %1, <4 x float>* @llvm_mips_ffqr_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.mips.ffqr.w(<8 x i16>) nounwind
+
+; CHECK: llvm_mips_ffqr_w_test:
+; CHECK: ld.h
+; CHECK: ffqr.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_ffqr_w_test
+;
+@llvm_mips_ffqr_d_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ffqr_d_RES = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_ffqr_d_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_ffqr_d_ARG1
+ %1 = tail call <2 x double> @llvm.mips.ffqr.d(<4 x i32> %0)
+ store <2 x double> %1, <2 x double>* @llvm_mips_ffqr_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.mips.ffqr.d(<4 x i32>) nounwind
+
+; CHECK: llvm_mips_ffqr_d_test:
+; CHECK: ld.w
+; CHECK: ffqr.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_ffqr_d_test
+;
diff --git a/test/CodeGen/Mips/msa/2rf_int_float.ll b/test/CodeGen/Mips/msa/2rf_int_float.ll
new file mode 100644
index 0000000..4665ae0
--- /dev/null
+++ b/test/CodeGen/Mips/msa/2rf_int_float.ll
@@ -0,0 +1,217 @@
+; Test the MSA floating point to integer intrinsics that are encoded with the
+; 2RF instruction format. This includes conversions, but other instructions
+; such as fclass are also here.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_fclass_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fclass_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fclass_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fclass_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.fclass.w(<4 x float> %0)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_fclass_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fclass.w(<4 x float>) nounwind
+
+; CHECK: llvm_mips_fclass_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_fclass_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: fclass.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_fclass_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_fclass_w_test
+;
+@llvm_mips_fclass_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fclass_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fclass_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fclass_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.fclass.d(<2 x double> %0)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_fclass_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fclass.d(<2 x double>) nounwind
+
+; CHECK: llvm_mips_fclass_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_fclass_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: fclass.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_fclass_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_fclass_d_test
+;
+@llvm_mips_ftrunc_s_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_ftrunc_s_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ftrunc_s_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_ftrunc_s_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.ftrunc.s.w(<4 x float> %0)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_ftrunc_s_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.ftrunc.s.w(<4 x float>) nounwind
+
+; CHECK: llvm_mips_ftrunc_s_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ftrunc_s_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ftrunc_s.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ftrunc_s_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ftrunc_s_w_test
+;
+@llvm_mips_ftrunc_s_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_ftrunc_s_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ftrunc_s_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_ftrunc_s_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.ftrunc.s.d(<2 x double> %0)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_ftrunc_s_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.ftrunc.s.d(<2 x double>) nounwind
+
+; CHECK: llvm_mips_ftrunc_s_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ftrunc_s_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ftrunc_s.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ftrunc_s_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ftrunc_s_d_test
+;
+@llvm_mips_ftrunc_u_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_ftrunc_u_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ftrunc_u_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_ftrunc_u_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.ftrunc.u.w(<4 x float> %0)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_ftrunc_u_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.ftrunc.u.w(<4 x float>) nounwind
+
+; CHECK: llvm_mips_ftrunc_u_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ftrunc_u_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ftrunc_u.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ftrunc_u_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ftrunc_u_w_test
+;
+@llvm_mips_ftrunc_u_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_ftrunc_u_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ftrunc_u_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_ftrunc_u_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.ftrunc.u.d(<2 x double> %0)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_ftrunc_u_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.ftrunc.u.d(<2 x double>) nounwind
+
+; CHECK: llvm_mips_ftrunc_u_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ftrunc_u_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ftrunc_u.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ftrunc_u_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ftrunc_u_d_test
+;
+@llvm_mips_ftint_s_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_ftint_s_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ftint_s_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_ftint_s_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.ftint.s.w(<4 x float> %0)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_ftint_s_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.ftint.s.w(<4 x float>) nounwind
+
+; CHECK: llvm_mips_ftint_s_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ftint_s_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ftint_s.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ftint_s_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ftint_s_w_test
+;
+@llvm_mips_ftint_s_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_ftint_s_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ftint_s_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_ftint_s_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.ftint.s.d(<2 x double> %0)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_ftint_s_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.ftint.s.d(<2 x double>) nounwind
+
+; CHECK: llvm_mips_ftint_s_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ftint_s_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ftint_s.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ftint_s_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ftint_s_d_test
+;
+@llvm_mips_ftint_u_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_ftint_u_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ftint_u_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_ftint_u_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.ftint.u.w(<4 x float> %0)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_ftint_u_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.ftint.u.w(<4 x float>) nounwind
+
+; CHECK: llvm_mips_ftint_u_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ftint_u_w_ARG1)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ftint_u.w [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ftint_u_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ftint_u_w_test
+;
+@llvm_mips_ftint_u_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_ftint_u_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ftint_u_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_ftint_u_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.ftint.u.d(<2 x double> %0)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_ftint_u_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.ftint.u.d(<2 x double>) nounwind
+
+; CHECK: llvm_mips_ftint_u_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ftint_u_d_ARG1)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ftint_u.d [[WD:\$w[0-9]+]], [[WS]]
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ftint_u_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R2]])
+; CHECK: .size llvm_mips_ftint_u_d_test
+;
diff --git a/test/CodeGen/Mips/msa/2rf_tq.ll b/test/CodeGen/Mips/msa/2rf_tq.ll
new file mode 100644
index 0000000..6f3c508
--- /dev/null
+++ b/test/CodeGen/Mips/msa/2rf_tq.ll
@@ -0,0 +1,50 @@
+; Test the MSA floating-point to fixed-point conversion intrinsics that are
+; encoded with the 2RF instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_ftq_h_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_ftq_h_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_ftq_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_ftq_h_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_ftq_h_ARG1
+ %1 = load <4 x float>* @llvm_mips_ftq_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.ftq.h(<4 x float> %0, <4 x float> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_ftq_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.ftq.h(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_ftq_h_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ftq.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_ftq_h_test
+;
+@llvm_mips_ftq_w_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_ftq_w_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_ftq_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ftq_w_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_ftq_w_ARG1
+ %1 = load <2 x double>* @llvm_mips_ftq_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.ftq.w(<2 x double> %0, <2 x double> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_ftq_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.ftq.w(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_ftq_w_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: ftq.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_ftq_w_test
+;
diff --git a/test/CodeGen/Mips/msa/3r-a.ll b/test/CodeGen/Mips/msa/3r-a.ll
new file mode 100644
index 0000000..dab15b6
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r-a.ll
@@ -0,0 +1,1191 @@
+; Test the MSA intrinsics that are encoded with the 3R instruction format.
+; There are lots of these, so this covers those beginning with 'a'.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+; It should fail to compile without fp64.
+; RUN: not llc -march=mips -mattr=+msa < %s 2>&1 | \
+; RUN: FileCheck -check-prefix=FP32ERROR %s
+; FP32ERROR: LLVM ERROR: MSA requires a 64-bit FPU register file (FR=1 mode).
+
+@llvm_mips_add_a_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_add_a_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_add_a_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_add_a_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_add_a_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_add_a_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.add.a.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_add_a_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.add.a.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_add_a_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_add_a_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_add_a_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: add_a.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_add_a_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_add_a_b_test
+;
+@llvm_mips_add_a_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_add_a_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_add_a_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_add_a_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_add_a_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_add_a_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.add.a.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_add_a_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.add.a.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_add_a_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_add_a_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_add_a_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: add_a.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_add_a_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_add_a_h_test
+;
+@llvm_mips_add_a_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_add_a_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_add_a_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_add_a_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_add_a_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_add_a_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.add.a.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_add_a_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.add.a.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_add_a_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_add_a_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_add_a_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: add_a.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_add_a_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_add_a_w_test
+;
+@llvm_mips_add_a_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_add_a_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_add_a_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_add_a_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_add_a_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_add_a_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.add.a.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_add_a_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.add.a.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_add_a_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_add_a_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_add_a_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: add_a.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_add_a_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_add_a_d_test
+;
+@llvm_mips_adds_a_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_adds_a_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_adds_a_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_adds_a_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_adds_a_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_adds_a_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.adds.a.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_adds_a_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.adds.a.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_adds_a_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_a_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_a_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_a.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_a_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_a_b_test
+;
+@llvm_mips_adds_a_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_adds_a_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_adds_a_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_adds_a_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_adds_a_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_adds_a_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.adds.a.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_adds_a_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.adds.a.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_adds_a_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_a_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_a_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_a.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_a_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_a_h_test
+;
+@llvm_mips_adds_a_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_adds_a_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_adds_a_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_adds_a_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_adds_a_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_adds_a_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.adds.a.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_adds_a_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.adds.a.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_adds_a_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_a_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_a_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_a.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_a_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_a_w_test
+;
+@llvm_mips_adds_a_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_adds_a_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_adds_a_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_adds_a_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_adds_a_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_adds_a_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.adds.a.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_adds_a_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.adds.a.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_adds_a_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_a_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_a_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_a.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_a_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_a_d_test
+;
+@llvm_mips_adds_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_adds_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_adds_s_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_adds_s_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_adds_s_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_adds_s_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.adds.s.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_adds_s_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.adds.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_adds_s_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_s_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_s_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_s.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_s_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_s_b_test
+;
+@llvm_mips_adds_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_adds_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_adds_s_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_adds_s_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_adds_s_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_adds_s_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.adds.s.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_adds_s_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.adds.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_adds_s_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_s_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_s_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_s.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_s_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_s_h_test
+;
+@llvm_mips_adds_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_adds_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_adds_s_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_adds_s_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_adds_s_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_adds_s_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.adds.s.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_adds_s_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.adds.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_adds_s_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_s_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_s_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_s.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_s_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_s_w_test
+;
+@llvm_mips_adds_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_adds_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_adds_s_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_adds_s_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_adds_s_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_adds_s_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.adds.s.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_adds_s_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.adds.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_adds_s_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_s_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_s_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_s.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_s_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_s_d_test
+;
+@llvm_mips_adds_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_adds_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_adds_u_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_adds_u_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_adds_u_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_adds_u_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.adds.u.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_adds_u_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.adds.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_adds_u_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_u_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_u_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_u.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_u_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_u_b_test
+;
+@llvm_mips_adds_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_adds_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_adds_u_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_adds_u_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_adds_u_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_adds_u_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.adds.u.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_adds_u_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.adds.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_adds_u_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_u_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_u_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_u.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_u_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_u_h_test
+;
+@llvm_mips_adds_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_adds_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_adds_u_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_adds_u_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_adds_u_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_adds_u_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.adds.u.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_adds_u_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.adds.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_adds_u_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_u_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_u_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_u.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_u_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_u_w_test
+;
+@llvm_mips_adds_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_adds_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_adds_u_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_adds_u_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_adds_u_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_adds_u_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.adds.u.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_adds_u_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.adds.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_adds_u_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_adds_u_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_adds_u_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: adds_u.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_adds_u_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_adds_u_d_test
+;
+@llvm_mips_addv_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_addv_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_addv_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_addv_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_addv_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_addv_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_addv_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.addv.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_addv_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_addv_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_addv_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: addv.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_addv_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_addv_b_test
+;
+@llvm_mips_addv_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_addv_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_addv_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_addv_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_addv_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_addv_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_addv_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.addv.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_addv_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_addv_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_addv_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: addv.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_addv_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_addv_h_test
+;
+@llvm_mips_addv_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_addv_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_addv_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_addv_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_addv_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_addv_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_addv_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.addv.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_addv_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_addv_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_addv_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: addv.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_addv_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_addv_w_test
+;
+@llvm_mips_addv_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_addv_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_addv_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_addv_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_addv_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_addv_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_addv_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.addv.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_addv_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_addv_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_addv_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: addv.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_addv_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_addv_d_test
+;
+
+define void @addv_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_addv_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_addv_b_ARG2
+ %2 = add <16 x i8> %0, %1
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_addv_b_RES
+ ret void
+}
+
+; CHECK: addv_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_addv_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_addv_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: addv.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_addv_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size addv_b_test
+;
+
+define void @addv_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_addv_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_addv_h_ARG2
+ %2 = add <8 x i16> %0, %1
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_addv_h_RES
+ ret void
+}
+
+; CHECK: addv_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_addv_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_addv_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: addv.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_addv_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size addv_h_test
+;
+
+define void @addv_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_addv_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_addv_w_ARG2
+ %2 = add <4 x i32> %0, %1
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_addv_w_RES
+ ret void
+}
+
+; CHECK: addv_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_addv_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_addv_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: addv.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_addv_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size addv_w_test
+;
+
+define void @addv_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_addv_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_addv_d_ARG2
+ %2 = add <2 x i64> %0, %1
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_addv_d_RES
+ ret void
+}
+
+; CHECK: addv_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_addv_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_addv_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: addv.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_addv_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size addv_d_test
+;
+@llvm_mips_asub_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_asub_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_asub_s_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_asub_s_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_asub_s_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_asub_s_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.asub.s.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_asub_s_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.asub.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_asub_s_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_asub_s_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_asub_s_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: asub_s.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_asub_s_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_asub_s_b_test
+;
+@llvm_mips_asub_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_asub_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_asub_s_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_asub_s_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_asub_s_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_asub_s_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.asub.s.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_asub_s_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.asub.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_asub_s_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_asub_s_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_asub_s_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: asub_s.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_asub_s_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_asub_s_h_test
+;
+@llvm_mips_asub_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_asub_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_asub_s_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_asub_s_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_asub_s_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_asub_s_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.asub.s.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_asub_s_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.asub.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_asub_s_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_asub_s_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_asub_s_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: asub_s.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_asub_s_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_asub_s_w_test
+;
+@llvm_mips_asub_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_asub_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_asub_s_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_asub_s_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_asub_s_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_asub_s_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.asub.s.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_asub_s_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.asub.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_asub_s_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_asub_s_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_asub_s_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: asub_s.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_asub_s_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_asub_s_d_test
+;
+@llvm_mips_asub_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_asub_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_asub_u_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_asub_u_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_asub_u_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_asub_u_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.asub.u.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_asub_u_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.asub.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_asub_u_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_asub_u_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_asub_u_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: asub_u.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_asub_u_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_asub_u_b_test
+;
+@llvm_mips_asub_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_asub_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_asub_u_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_asub_u_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_asub_u_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_asub_u_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.asub.u.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_asub_u_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.asub.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_asub_u_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_asub_u_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_asub_u_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: asub_u.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_asub_u_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_asub_u_h_test
+;
+@llvm_mips_asub_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_asub_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_asub_u_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_asub_u_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_asub_u_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_asub_u_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.asub.u.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_asub_u_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.asub.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_asub_u_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_asub_u_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_asub_u_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: asub_u.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_asub_u_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_asub_u_w_test
+;
+@llvm_mips_asub_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_asub_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_asub_u_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_asub_u_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_asub_u_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_asub_u_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.asub.u.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_asub_u_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.asub.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_asub_u_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_asub_u_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_asub_u_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: asub_u.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_asub_u_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_asub_u_d_test
+;
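+; The ave intrinsics compute a per-element average of the two operands; the
+; aver intrinsics further down are the rounding variants. Both families come
+; in signed and unsigned forms for each element size.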
+@llvm_mips_ave_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_ave_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_ave_s_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_ave_s_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_ave_s_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_ave_s_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.ave.s.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_ave_s_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.ave.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_ave_s_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ave_s_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ave_s_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ave_s.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_ave_s_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_ave_s_b_test
+;
+@llvm_mips_ave_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_ave_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_ave_s_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_ave_s_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_ave_s_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_ave_s_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.ave.s.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_ave_s_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.ave.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_ave_s_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ave_s_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ave_s_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ave_s.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_ave_s_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_ave_s_h_test
+;
+@llvm_mips_ave_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ave_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_ave_s_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ave_s_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_ave_s_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_ave_s_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.ave.s.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_ave_s_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.ave.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_ave_s_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ave_s_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ave_s_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ave_s.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_ave_s_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_ave_s_w_test
+;
+@llvm_mips_ave_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_ave_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_ave_s_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ave_s_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_ave_s_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_ave_s_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.ave.s.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_ave_s_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.ave.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_ave_s_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ave_s_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ave_s_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ave_s.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_ave_s_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_ave_s_d_test
+;
+@llvm_mips_ave_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_ave_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_ave_u_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_ave_u_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_ave_u_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_ave_u_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.ave.u.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_ave_u_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.ave.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_ave_u_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ave_u_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ave_u_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ave_u.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_ave_u_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_ave_u_b_test
+;
+@llvm_mips_ave_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_ave_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_ave_u_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_ave_u_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_ave_u_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_ave_u_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.ave.u.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_ave_u_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.ave.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_ave_u_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ave_u_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ave_u_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ave_u.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_ave_u_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_ave_u_h_test
+;
+@llvm_mips_ave_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ave_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_ave_u_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ave_u_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_ave_u_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_ave_u_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.ave.u.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_ave_u_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.ave.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_ave_u_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ave_u_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ave_u_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ave_u.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_ave_u_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_ave_u_w_test
+;
+@llvm_mips_ave_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_ave_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_ave_u_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ave_u_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_ave_u_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_ave_u_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.ave.u.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_ave_u_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.ave.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_ave_u_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_ave_u_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_ave_u_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ave_u.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_ave_u_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_ave_u_d_test
+;
+@llvm_mips_aver_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_aver_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_aver_s_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_aver_s_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_aver_s_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_aver_s_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.aver.s.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_aver_s_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.aver.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_aver_s_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_aver_s_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_aver_s_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: aver_s.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_aver_s_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_aver_s_b_test
+;
+@llvm_mips_aver_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_aver_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_aver_s_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_aver_s_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_aver_s_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_aver_s_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.aver.s.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_aver_s_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.aver.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_aver_s_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_aver_s_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_aver_s_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: aver_s.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_aver_s_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_aver_s_h_test
+;
+@llvm_mips_aver_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_aver_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_aver_s_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_aver_s_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_aver_s_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_aver_s_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.aver.s.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_aver_s_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.aver.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_aver_s_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_aver_s_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_aver_s_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: aver_s.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_aver_s_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_aver_s_w_test
+;
+@llvm_mips_aver_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_aver_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_aver_s_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_aver_s_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_aver_s_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_aver_s_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.aver.s.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_aver_s_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.aver.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_aver_s_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_aver_s_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_aver_s_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: aver_s.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_aver_s_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_aver_s_d_test
+;
+@llvm_mips_aver_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_aver_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_aver_u_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_aver_u_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_aver_u_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_aver_u_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.aver.u.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_aver_u_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.aver.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_aver_u_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_aver_u_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_aver_u_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: aver_u.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_aver_u_b_RES)
+; CHECK-DAG: st.b [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_aver_u_b_test
+;
+@llvm_mips_aver_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_aver_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_aver_u_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_aver_u_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_aver_u_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_aver_u_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.aver.u.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_aver_u_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.aver.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_aver_u_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_aver_u_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_aver_u_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: aver_u.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_aver_u_h_RES)
+; CHECK-DAG: st.h [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_aver_u_h_test
+;
+@llvm_mips_aver_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_aver_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_aver_u_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_aver_u_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_aver_u_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_aver_u_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.aver.u.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_aver_u_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.aver.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_aver_u_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_aver_u_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_aver_u_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: aver_u.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_aver_u_w_RES)
+; CHECK-DAG: st.w [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_aver_u_w_test
+;
+@llvm_mips_aver_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_aver_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_aver_u_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_aver_u_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_aver_u_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_aver_u_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.aver.u.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_aver_u_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.aver.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_aver_u_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_aver_u_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_aver_u_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: aver_u.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_aver_u_d_RES)
+; CHECK-DAG: st.d [[WD]], 0([[R3]])
+; CHECK: .size llvm_mips_aver_u_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3r-b.ll b/test/CodeGen/Mips/msa/3r-b.ll
new file mode 100644
index 0000000..a05d19b
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r-b.ll
@@ -0,0 +1,494 @@
+; Test the MSA intrinsics that are encoded with the 3R instruction format.
+; There are lots of these, so this file covers the ones beginning with 'b'.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
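+; Each test loads its operands from globals, applies the intrinsic, and stores
+; the result back to a global; the CHECK lines verify that the matching
+; element-size load, the named instruction, and the store are all selected.
+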
+@llvm_mips_bclr_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bclr_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_bclr_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bclr_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_bclr_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_bclr_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.bclr.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_bclr_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.bclr.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_bclr_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: bclr.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_bclr_b_test
+;
+@llvm_mips_bclr_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bclr_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_bclr_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_bclr_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_bclr_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_bclr_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.bclr.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_bclr_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.bclr.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_bclr_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: bclr.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_bclr_h_test
+;
+@llvm_mips_bclr_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bclr_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_bclr_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_bclr_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_bclr_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_bclr_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.bclr.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_bclr_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.bclr.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_bclr_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: bclr.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_bclr_w_test
+;
+@llvm_mips_bclr_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bclr_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_bclr_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_bclr_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_bclr_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_bclr_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.bclr.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_bclr_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.bclr.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_bclr_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: bclr.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_bclr_d_test
+
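+; binsl/binsr insert bits from the second operand into the first (left-most
+; bits for binsl, right-most for binsr), so the destination register is also a
+; source. The CHECK-DAG lines below therefore expect the register loaded from
+; ARG1 ([[R4]]) to be reused both as the result of the insert and as the value
+; that is stored.
+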
+@llvm_mips_binsl_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_binsl_b_ARG2 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_binsl_b_ARG3 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_binsl_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_binsl_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_binsl_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_binsl_b_ARG2
+ %2 = load <16 x i8>* @llvm_mips_binsl_b_ARG3
+ %3 = tail call <16 x i8> @llvm.mips.binsl.b(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2)
+ store <16 x i8> %3, <16 x i8>* @llvm_mips_binsl_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.binsl.b(<16 x i8>, <16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_binsl_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsl_b_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsl_b_ARG2)(
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_binsl_b_ARG3)(
+; CHECK-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; CHECK-DAG: binsl.b [[R4]], [[R5]], [[R6]]
+; CHECK-DAG: st.b [[R4]], 0(
+; CHECK: .size llvm_mips_binsl_b_test
+
+@llvm_mips_binsl_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_binsl_h_ARG2 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_binsl_h_ARG3 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_binsl_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_binsl_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_binsl_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_binsl_h_ARG2
+ %2 = load <8 x i16>* @llvm_mips_binsl_h_ARG3
+ %3 = tail call <8 x i16> @llvm.mips.binsl.h(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
+ store <8 x i16> %3, <8 x i16>* @llvm_mips_binsl_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.binsl.h(<8 x i16>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_binsl_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsl_h_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsl_h_ARG2)(
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_binsl_h_ARG3)(
+; CHECK-DAG: ld.h [[R4:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[R5:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ld.h [[R6:\$w[0-9]+]], 0([[R3]])
+; CHECK-DAG: binsl.h [[R4]], [[R5]], [[R6]]
+; CHECK-DAG: st.h [[R4]], 0(
+; CHECK: .size llvm_mips_binsl_h_test
+
+@llvm_mips_binsl_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_binsl_w_ARG2 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_binsl_w_ARG3 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_binsl_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_binsl_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_binsl_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_binsl_w_ARG2
+ %2 = load <4 x i32>* @llvm_mips_binsl_w_ARG3
+ %3 = tail call <4 x i32> @llvm.mips.binsl.w(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
+ store <4 x i32> %3, <4 x i32>* @llvm_mips_binsl_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.binsl.w(<4 x i32>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_binsl_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsl_w_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsl_w_ARG2)(
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_binsl_w_ARG3)(
+; CHECK-DAG: ld.w [[R4:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[R5:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ld.w [[R6:\$w[0-9]+]], 0([[R3]])
+; CHECK-DAG: binsl.w [[R4]], [[R5]], [[R6]]
+; CHECK-DAG: st.w [[R4]], 0(
+; CHECK: .size llvm_mips_binsl_w_test
+
+@llvm_mips_binsl_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_binsl_d_ARG2 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_binsl_d_ARG3 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_binsl_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_binsl_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_binsl_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_binsl_d_ARG2
+ %2 = load <2 x i64>* @llvm_mips_binsl_d_ARG3
+ %3 = tail call <2 x i64> @llvm.mips.binsl.d(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2)
+ store <2 x i64> %3, <2 x i64>* @llvm_mips_binsl_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.binsl.d(<2 x i64>, <2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_binsl_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsl_d_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsl_d_ARG2)(
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_binsl_d_ARG3)(
+; CHECK-DAG: ld.d [[R4:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[R5:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ld.d [[R6:\$w[0-9]+]], 0([[R3]])
+; CHECK-DAG: binsl.d [[R4]], [[R5]], [[R6]]
+; CHECK-DAG: st.d [[R4]], 0(
+; CHECK: .size llvm_mips_binsl_d_test
+
+@llvm_mips_binsr_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_binsr_b_ARG2 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_binsr_b_ARG3 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_binsr_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_binsr_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_binsr_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_binsr_b_ARG2
+ %2 = load <16 x i8>* @llvm_mips_binsr_b_ARG3
+ %3 = tail call <16 x i8> @llvm.mips.binsr.b(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2)
+ store <16 x i8> %3, <16 x i8>* @llvm_mips_binsr_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.binsr.b(<16 x i8>, <16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_binsr_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsr_b_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsr_b_ARG2)(
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_binsr_b_ARG3)(
+; CHECK-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; CHECK-DAG: binsr.b [[R4]], [[R5]], [[R6]]
+; CHECK-DAG: st.b [[R4]], 0(
+; CHECK: .size llvm_mips_binsr_b_test
+
+@llvm_mips_binsr_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_binsr_h_ARG2 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_binsr_h_ARG3 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_binsr_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_binsr_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_binsr_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_binsr_h_ARG2
+ %2 = load <8 x i16>* @llvm_mips_binsr_h_ARG3
+ %3 = tail call <8 x i16> @llvm.mips.binsr.h(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
+ store <8 x i16> %3, <8 x i16>* @llvm_mips_binsr_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.binsr.h(<8 x i16>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_binsr_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsr_h_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsr_h_ARG2)(
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_binsr_h_ARG3)(
+; CHECK-DAG: ld.h [[R4:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[R5:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ld.h [[R6:\$w[0-9]+]], 0([[R3]])
+; CHECK-DAG: binsr.h [[R4]], [[R5]], [[R6]]
+; CHECK-DAG: st.h [[R4]], 0(
+; CHECK: .size llvm_mips_binsr_h_test
+
+@llvm_mips_binsr_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_binsr_w_ARG2 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_binsr_w_ARG3 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_binsr_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_binsr_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_binsr_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_binsr_w_ARG2
+ %2 = load <4 x i32>* @llvm_mips_binsr_w_ARG3
+ %3 = tail call <4 x i32> @llvm.mips.binsr.w(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
+ store <4 x i32> %3, <4 x i32>* @llvm_mips_binsr_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.binsr.w(<4 x i32>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_binsr_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsr_w_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsr_w_ARG2)(
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_binsr_w_ARG3)(
+; CHECK-DAG: ld.w [[R4:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[R5:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ld.w [[R6:\$w[0-9]+]], 0([[R3]])
+; CHECK-DAG: binsr.w [[R4]], [[R5]], [[R6]]
+; CHECK-DAG: st.w [[R4]], 0(
+; CHECK: .size llvm_mips_binsr_w_test
+
+@llvm_mips_binsr_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_binsr_d_ARG2 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_binsr_d_ARG3 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_binsr_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_binsr_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_binsr_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_binsr_d_ARG2
+ %2 = load <2 x i64>* @llvm_mips_binsr_d_ARG3
+ %3 = tail call <2 x i64> @llvm.mips.binsr.d(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2)
+ store <2 x i64> %3, <2 x i64>* @llvm_mips_binsr_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.binsr.d(<2 x i64>, <2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_binsr_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsr_d_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsr_d_ARG2)(
+; CHECK-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_binsr_d_ARG3)(
+; CHECK-DAG: ld.d [[R4:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[R5:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: ld.d [[R6:\$w[0-9]+]], 0([[R3]])
+; CHECK-DAG: binsr.d [[R4]], [[R5]], [[R6]]
+; CHECK-DAG: st.d [[R4]], 0(
+; CHECK: .size llvm_mips_binsr_d_test
+
+@llvm_mips_bneg_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bneg_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_bneg_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bneg_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_bneg_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_bneg_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.bneg.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_bneg_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.bneg.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_bneg_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: bneg.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_bneg_b_test
+;
+@llvm_mips_bneg_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bneg_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_bneg_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_bneg_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_bneg_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_bneg_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.bneg.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_bneg_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.bneg.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_bneg_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: bneg.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_bneg_h_test
+;
+@llvm_mips_bneg_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bneg_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_bneg_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_bneg_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_bneg_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_bneg_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.bneg.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_bneg_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.bneg.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_bneg_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: bneg.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_bneg_w_test
+;
+@llvm_mips_bneg_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bneg_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_bneg_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_bneg_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_bneg_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_bneg_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.bneg.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_bneg_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.bneg.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_bneg_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: bneg.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_bneg_d_test
+;
+@llvm_mips_bset_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bset_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_bset_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bset_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_bset_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_bset_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.bset.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_bset_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.bset.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_bset_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: bset.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_bset_b_test
+;
+@llvm_mips_bset_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bset_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_bset_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_bset_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_bset_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_bset_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.bset.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_bset_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.bset.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_bset_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: bset.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_bset_h_test
+;
+@llvm_mips_bset_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bset_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_bset_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_bset_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_bset_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_bset_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.bset.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_bset_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.bset.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_bset_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: bset.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_bset_w_test
+;
+@llvm_mips_bset_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bset_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_bset_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_bset_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_bset_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_bset_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.bset.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_bset_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.bset.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_bset_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: bset.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_bset_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3r-c.ll b/test/CodeGen/Mips/msa/3r-c.ll
new file mode 100644
index 0000000..6ec92c2
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r-c.ll
@@ -0,0 +1,446 @@
+; Test the MSA intrinsics that are encoded with the 3R instruction format.
+; There are lots of these, so this file covers the ones beginning with 'c'.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
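+; The compare intrinsics (ceq, cle_s/cle_u, clt_s/clt_u) produce an all-ones
+; element where the comparison holds and zero elsewhere; these tests only
+; check that the correct instruction is selected for each element size.
+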
+@llvm_mips_ceq_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_ceq_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_ceq_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_ceq_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_ceq_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_ceq_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.ceq.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_ceq_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.ceq.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_ceq_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ceq.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_ceq_b_test
+;
+@llvm_mips_ceq_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_ceq_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_ceq_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_ceq_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_ceq_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_ceq_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.ceq.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_ceq_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.ceq.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_ceq_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ceq.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_ceq_h_test
+;
+@llvm_mips_ceq_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ceq_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_ceq_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ceq_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_ceq_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_ceq_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.ceq.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_ceq_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.ceq.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_ceq_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ceq.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_ceq_w_test
+;
+@llvm_mips_ceq_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_ceq_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_ceq_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ceq_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_ceq_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_ceq_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.ceq.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_ceq_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.ceq.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_ceq_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: ceq.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_ceq_d_test
+;
+@llvm_mips_cle_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_cle_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_cle_s_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_cle_s_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_cle_s_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_cle_s_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.cle.s.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_cle_s_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.cle.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_cle_s_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: cle_s.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_cle_s_b_test
+;
+@llvm_mips_cle_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_cle_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_cle_s_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_cle_s_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_cle_s_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_cle_s_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.cle.s.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_cle_s_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.cle.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_cle_s_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: cle_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_cle_s_h_test
+;
+@llvm_mips_cle_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_cle_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_cle_s_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_cle_s_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_cle_s_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_cle_s_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.cle.s.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_cle_s_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.cle.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_cle_s_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: cle_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_cle_s_w_test
+;
+@llvm_mips_cle_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_cle_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_cle_s_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_cle_s_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_cle_s_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_cle_s_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.cle.s.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_cle_s_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.cle.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_cle_s_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: cle_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_cle_s_d_test
+;
+@llvm_mips_cle_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_cle_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_cle_u_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_cle_u_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_cle_u_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_cle_u_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.cle.u.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_cle_u_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.cle.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_cle_u_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: cle_u.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_cle_u_b_test
+;
+@llvm_mips_cle_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_cle_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_cle_u_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_cle_u_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_cle_u_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_cle_u_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.cle.u.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_cle_u_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.cle.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_cle_u_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: cle_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_cle_u_h_test
+;
+@llvm_mips_cle_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_cle_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_cle_u_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_cle_u_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_cle_u_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_cle_u_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.cle.u.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_cle_u_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.cle.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_cle_u_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: cle_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_cle_u_w_test
+;
+@llvm_mips_cle_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_cle_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_cle_u_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_cle_u_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_cle_u_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_cle_u_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.cle.u.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_cle_u_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.cle.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_cle_u_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: cle_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_cle_u_d_test
+;
+@llvm_mips_clt_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_clt_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_clt_s_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_clt_s_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_clt_s_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_clt_s_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.clt.s.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_clt_s_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.clt.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_clt_s_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: clt_s.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_clt_s_b_test
+;
+@llvm_mips_clt_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_clt_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_clt_s_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_clt_s_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_clt_s_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_clt_s_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.clt.s.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_clt_s_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.clt.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_clt_s_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: clt_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_clt_s_h_test
+;
+@llvm_mips_clt_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_clt_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_clt_s_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_clt_s_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_clt_s_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_clt_s_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.clt.s.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_clt_s_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.clt.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_clt_s_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: clt_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_clt_s_w_test
+;
+@llvm_mips_clt_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_clt_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_clt_s_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_clt_s_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_clt_s_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_clt_s_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.clt.s.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_clt_s_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.clt.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_clt_s_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: clt_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_clt_s_d_test
+;
+@llvm_mips_clt_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_clt_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_clt_u_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_clt_u_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_clt_u_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_clt_u_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.clt.u.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_clt_u_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.clt.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_clt_u_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: clt_u.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_clt_u_b_test
+;
+@llvm_mips_clt_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_clt_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_clt_u_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_clt_u_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_clt_u_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_clt_u_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.clt.u.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_clt_u_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.clt.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_clt_u_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: clt_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_clt_u_h_test
+;
+@llvm_mips_clt_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_clt_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_clt_u_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_clt_u_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_clt_u_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_clt_u_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.clt.u.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_clt_u_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.clt.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_clt_u_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: clt_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_clt_u_w_test
+;
+@llvm_mips_clt_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_clt_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_clt_u_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_clt_u_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_clt_u_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_clt_u_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.clt.u.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_clt_u_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.clt.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_clt_u_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: clt_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_clt_u_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3r-d.ll b/test/CodeGen/Mips/msa/3r-d.ll
new file mode 100644
index 0000000..0099554
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r-d.ll
@@ -0,0 +1,478 @@
+; Test the MSA intrinsics that are encoded with the 3R instruction format.
+; There are lots of these, so this file covers the ones beginning with 'd'.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
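+; A rough orientation for the 'd' intrinsics below (informal summary only):
+; div_s/div_u are element-wise signed/unsigned division, and dotp_s/dotp_u are
+; widening dot-products of adjacent element pairs. This file also checks that
+; plain vector sdiv/udiv (see div_s_b_test and friends further down) select the
+; same div_s/div_u instructions as the intrinsics.
+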
+@llvm_mips_div_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_div_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_div_s_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_div_s_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_div_s_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_div_s_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.div.s.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_div_s_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.div.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_div_s_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: div_s.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_div_s_b_test
+;
+@llvm_mips_div_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_div_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_div_s_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_div_s_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_div_s_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_div_s_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.div.s.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_div_s_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.div.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_div_s_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: div_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_div_s_h_test
+;
+@llvm_mips_div_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_div_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_div_s_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_div_s_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_div_s_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_div_s_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.div.s.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_div_s_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.div.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_div_s_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: div_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_div_s_w_test
+;
+@llvm_mips_div_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_div_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_div_s_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_div_s_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_div_s_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_div_s_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.div.s.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_div_s_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.div.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_div_s_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: div_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_div_s_d_test
+;
+
+define void @div_s_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_div_s_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_div_s_b_ARG2
+ %2 = sdiv <16 x i8> %0, %1
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_div_s_b_RES
+ ret void
+}
+
+; CHECK: div_s_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: div_s.b
+; CHECK: st.b
+; CHECK: .size div_s_b_test
+
+define void @div_s_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_div_s_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_div_s_h_ARG2
+ %2 = sdiv <8 x i16> %0, %1
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_div_s_h_RES
+ ret void
+}
+
+; CHECK: div_s_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: div_s.h
+; CHECK: st.h
+; CHECK: .size div_s_h_test
+
+define void @div_s_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_div_s_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_div_s_w_ARG2
+ %2 = sdiv <4 x i32> %0, %1
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_div_s_w_RES
+ ret void
+}
+
+; CHECK: div_s_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: div_s.w
+; CHECK: st.w
+; CHECK: .size div_s_w_test
+
+define void @div_s_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_div_s_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_div_s_d_ARG2
+ %2 = sdiv <2 x i64> %0, %1
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_div_s_d_RES
+ ret void
+}
+
+; CHECK: div_s_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: div_s.d
+; CHECK: st.d
+; CHECK: .size div_s_d_test
+;
+@llvm_mips_div_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_div_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_div_u_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_div_u_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_div_u_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_div_u_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.div.u.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_div_u_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.div.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_div_u_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: div_u.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_div_u_b_test
+;
+@llvm_mips_div_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_div_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_div_u_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_div_u_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_div_u_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_div_u_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.div.u.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_div_u_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.div.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_div_u_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: div_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_div_u_h_test
+;
+@llvm_mips_div_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_div_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_div_u_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_div_u_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_div_u_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_div_u_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.div.u.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_div_u_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.div.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_div_u_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: div_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_div_u_w_test
+;
+@llvm_mips_div_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_div_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_div_u_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_div_u_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_div_u_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_div_u_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.div.u.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_div_u_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.div.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_div_u_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: div_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_div_u_d_test
+;
+
+define void @div_u_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_div_u_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_div_u_b_ARG2
+ %2 = udiv <16 x i8> %0, %1
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_div_u_b_RES
+ ret void
+}
+
+; CHECK: div_u_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: div_u.b
+; CHECK: st.b
+; CHECK: .size div_u_b_test
+
+define void @div_u_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_div_u_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_div_u_h_ARG2
+ %2 = udiv <8 x i16> %0, %1
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_div_u_h_RES
+ ret void
+}
+
+; CHECK: div_u_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: div_u.h
+; CHECK: st.h
+; CHECK: .size div_u_h_test
+
+define void @div_u_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_div_u_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_div_u_w_ARG2
+ %2 = udiv <4 x i32> %0, %1
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_div_u_w_RES
+ ret void
+}
+
+; CHECK: div_u_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: div_u.w
+; CHECK: st.w
+; CHECK: .size div_u_w_test
+
+define void @div_u_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_div_u_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_div_u_d_ARG2
+ %2 = udiv <2 x i64> %0, %1
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_div_u_d_RES
+ ret void
+}
+
+; CHECK: div_u_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: div_u.d
+; CHECK: st.d
+; CHECK: .size div_u_d_test
+;
+@llvm_mips_dotp_s_h_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3,
+ i8 4, i8 5, i8 6, i8 7,
+ i8 8, i8 9, i8 10, i8 11,
+ i8 12, i8 13, i8 14, i8 15>,
+ align 16
+@llvm_mips_dotp_s_h_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19,
+ i8 20, i8 21, i8 22, i8 23,
+ i8 24, i8 25, i8 26, i8 27,
+ i8 28, i8 29, i8 30, i8 31>,
+ align 16
+@llvm_mips_dotp_s_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0,
+ i16 0, i16 0, i16 0, i16 0>,
+ align 16
+
+define void @llvm_mips_dotp_s_h_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_dotp_s_h_ARG1
+ %1 = load <16 x i8>* @llvm_mips_dotp_s_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.dotp.s.h(<16 x i8> %0, <16 x i8> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_dotp_s_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.dotp.s.h(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_dotp_s_h_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: dotp_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_dotp_s_h_test
+;
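+; Worked example for the vectors above, assuming the usual pairwise dot-product
+; semantics: halfword lane i of the result is
+; ARG1[2i]*ARG2[2i] + ARG1[2i+1]*ARG2[2i+1], so lane 0 would be 0*16 + 1*17 = 17
+; and lane 1 would be 2*18 + 3*19 = 93. The CHECK lines only verify instruction
+; selection, not these values.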
+@llvm_mips_dotp_s_w_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3,
+ i16 4, i16 5, i16 6, i16 7>,
+ align 16
+@llvm_mips_dotp_s_w_ARG2 = global <8 x i16> <i16 4, i16 5, i16 6, i16 7,
+ i16 8, i16 9, i16 10, i16 11>,
+ align 16
+@llvm_mips_dotp_s_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>,
+ align 16
+
+define void @llvm_mips_dotp_s_w_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_dotp_s_w_ARG1
+ %1 = load <8 x i16>* @llvm_mips_dotp_s_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.dotp.s.w(<8 x i16> %0, <8 x i16> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_dotp_s_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.dotp.s.w(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_dotp_s_w_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: dotp_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_dotp_s_w_test
+;
+@llvm_mips_dotp_s_d_ARG1 = global <4 x i32> <i32 0, i32 1, i32 0, i32 1>,
+ align 16
+@llvm_mips_dotp_s_d_ARG2 = global <4 x i32> <i32 2, i32 3, i32 2, i32 3>,
+ align 16
+@llvm_mips_dotp_s_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_dotp_s_d_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_dotp_s_d_ARG1
+ %1 = load <4 x i32>* @llvm_mips_dotp_s_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.dotp.s.d(<4 x i32> %0, <4 x i32> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_dotp_s_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.dotp.s.d(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_dotp_s_d_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: dotp_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_dotp_s_d_test
+;
+@llvm_mips_dotp_u_h_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3,
+ i8 4, i8 5, i8 6, i8 7,
+ i8 8, i8 9, i8 10, i8 11,
+ i8 12, i8 13, i8 14, i8 15>,
+ align 16
+@llvm_mips_dotp_u_h_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19,
+ i8 20, i8 21, i8 22, i8 23,
+ i8 24, i8 25, i8 26, i8 27,
+ i8 28, i8 29, i8 30, i8 31>,
+ align 16
+@llvm_mips_dotp_u_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0,
+ i16 0, i16 0, i16 0, i16 0>,
+ align 16
+
+define void @llvm_mips_dotp_u_h_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_dotp_u_h_ARG1
+ %1 = load <16 x i8>* @llvm_mips_dotp_u_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.dotp.u.h(<16 x i8> %0, <16 x i8> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_dotp_u_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.dotp.u.h(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_dotp_u_h_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: dotp_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_dotp_u_h_test
+;
+@llvm_mips_dotp_u_w_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3,
+ i16 4, i16 5, i16 6, i16 7>,
+ align 16
+@llvm_mips_dotp_u_w_ARG2 = global <8 x i16> <i16 4, i16 5, i16 6, i16 7,
+ i16 8, i16 9, i16 10, i16 11>,
+ align 16
+@llvm_mips_dotp_u_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>,
+ align 16
+
+define void @llvm_mips_dotp_u_w_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_dotp_u_w_ARG1
+ %1 = load <8 x i16>* @llvm_mips_dotp_u_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.dotp.u.w(<8 x i16> %0, <8 x i16> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_dotp_u_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.dotp.u.w(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_dotp_u_w_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: dotp_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_dotp_u_w_test
+;
+@llvm_mips_dotp_u_d_ARG1 = global <4 x i32> <i32 0, i32 1, i32 0, i32 1>,
+ align 16
+@llvm_mips_dotp_u_d_ARG2 = global <4 x i32> <i32 2, i32 3, i32 2, i32 3>,
+ align 16
+@llvm_mips_dotp_u_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_dotp_u_d_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_dotp_u_d_ARG1
+ %1 = load <4 x i32>* @llvm_mips_dotp_u_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.dotp.u.d(<4 x i32> %0, <4 x i32> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_dotp_u_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.dotp.u.d(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_dotp_u_d_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: dotp_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_dotp_u_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3r-i.ll b/test/CodeGen/Mips/msa/3r-i.ll
new file mode 100644
index 0000000..2ef3047
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r-i.ll
@@ -0,0 +1,358 @@
+; Test the MSA intrinsics that are encoded with the 3R instruction format.
+; There are lots of these, so this file covers the ones beginning with 'i'.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
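+; Rough summary of the interleave family tested below: ilvev/ilvod interleave
+; the even-/odd-indexed elements of the two sources, while ilvl/ilvr interleave
+; their left (upper) and right (lower) halves respectively. The tests only
+; verify that each intrinsic selects the matching ilv* instruction.
+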
+@llvm_mips_ilvev_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_ilvev_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_ilvev_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_ilvev_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_ilvev_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_ilvev_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.ilvev.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_ilvev_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.ilvev.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_ilvev_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ilvev.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_ilvev_b_test
+;
+@llvm_mips_ilvev_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_ilvev_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_ilvev_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_ilvev_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_ilvev_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_ilvev_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.ilvev.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_ilvev_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.ilvev.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_ilvev_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ilvev.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_ilvev_h_test
+;
+@llvm_mips_ilvev_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ilvev_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_ilvev_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ilvev_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_ilvev_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_ilvev_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.ilvev.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_ilvev_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.ilvev.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_ilvev_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ilvev.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_ilvev_w_test
+;
+@llvm_mips_ilvev_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_ilvev_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_ilvev_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ilvev_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_ilvev_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_ilvev_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.ilvev.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_ilvev_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.ilvev.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_ilvev_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: ilvev.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_ilvev_d_test
+;
+@llvm_mips_ilvl_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_ilvl_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_ilvl_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_ilvl_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_ilvl_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_ilvl_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.ilvl.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_ilvl_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.ilvl.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_ilvl_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ilvl.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_ilvl_b_test
+;
+@llvm_mips_ilvl_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_ilvl_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_ilvl_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_ilvl_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_ilvl_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_ilvl_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.ilvl.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_ilvl_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.ilvl.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_ilvl_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ilvl.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_ilvl_h_test
+;
+@llvm_mips_ilvl_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ilvl_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_ilvl_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ilvl_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_ilvl_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_ilvl_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.ilvl.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_ilvl_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.ilvl.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_ilvl_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ilvl.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_ilvl_w_test
+;
+@llvm_mips_ilvl_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_ilvl_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_ilvl_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ilvl_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_ilvl_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_ilvl_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.ilvl.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_ilvl_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.ilvl.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_ilvl_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: ilvl.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_ilvl_d_test
+;
+@llvm_mips_ilvod_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_ilvod_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_ilvod_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_ilvod_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_ilvod_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_ilvod_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.ilvod.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_ilvod_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.ilvod.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_ilvod_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ilvod.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_ilvod_b_test
+;
+@llvm_mips_ilvod_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_ilvod_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_ilvod_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_ilvod_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_ilvod_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_ilvod_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.ilvod.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_ilvod_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.ilvod.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_ilvod_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ilvod.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_ilvod_h_test
+;
+@llvm_mips_ilvod_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ilvod_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_ilvod_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ilvod_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_ilvod_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_ilvod_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.ilvod.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_ilvod_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.ilvod.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_ilvod_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ilvod.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_ilvod_w_test
+;
+@llvm_mips_ilvod_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_ilvod_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_ilvod_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ilvod_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_ilvod_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_ilvod_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.ilvod.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_ilvod_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.ilvod.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_ilvod_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: ilvod.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_ilvod_d_test
+;
+@llvm_mips_ilvr_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_ilvr_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_ilvr_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_ilvr_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_ilvr_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_ilvr_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.ilvr.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_ilvr_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.ilvr.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_ilvr_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ilvr.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_ilvr_b_test
+;
+@llvm_mips_ilvr_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_ilvr_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_ilvr_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_ilvr_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_ilvr_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_ilvr_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.ilvr.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_ilvr_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.ilvr.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_ilvr_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ilvr.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_ilvr_h_test
+;
+@llvm_mips_ilvr_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ilvr_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_ilvr_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ilvr_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_ilvr_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_ilvr_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.ilvr.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_ilvr_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.ilvr.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_ilvr_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ilvr.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_ilvr_w_test
+;
+@llvm_mips_ilvr_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_ilvr_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_ilvr_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ilvr_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_ilvr_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_ilvr_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.ilvr.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_ilvr_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.ilvr.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_ilvr_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: ilvr.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_ilvr_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3r-m.ll b/test/CodeGen/Mips/msa/3r-m.ll
new file mode 100644
index 0000000..ddfd720
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r-m.ll
@@ -0,0 +1,862 @@
+; Test the MSA intrinsics that are encoded with the 3R instruction format.
+; There are lots of these, so this file covers the ones beginning with 'm'.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
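+; Covered below, among others: the max/min families (max_a/min_a compare by
+; absolute value, max_s/min_s are signed, max_u/min_u unsigned), mod_s/mod_u
+; (element-wise remainder), and mulv (element-wise multiply). As in the other
+; files in this directory, only instruction selection is checked.
+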
+@llvm_mips_max_a_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_max_a_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_max_a_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_max_a_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_max_a_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_max_a_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.max.a.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_max_a_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.max.a.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_max_a_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: max_a.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_max_a_b_test
+;
+@llvm_mips_max_a_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_max_a_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_max_a_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_max_a_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_max_a_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_max_a_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.max.a.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_max_a_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.max.a.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_max_a_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: max_a.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_max_a_h_test
+;
+@llvm_mips_max_a_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_max_a_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_max_a_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_max_a_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_max_a_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_max_a_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.max.a.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_max_a_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.max.a.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_max_a_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: max_a.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_max_a_w_test
+;
+@llvm_mips_max_a_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_max_a_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_max_a_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_max_a_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_max_a_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_max_a_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.max.a.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_max_a_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.max.a.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_max_a_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: max_a.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_max_a_d_test
+;
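+; Illustration, assuming max_a keeps whichever source element has the larger
+; absolute value: for ARG1 = <0, 1> and ARG2 = <2, 3> above the result would be
+; <2, 3>. The numeric result is not checked, only the selected instruction.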
+@llvm_mips_max_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_max_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_max_s_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_max_s_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_max_s_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_max_s_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.max.s.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_max_s_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.max.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_max_s_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: max_s.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_max_s_b_test
+;
+@llvm_mips_max_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_max_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_max_s_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_max_s_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_max_s_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_max_s_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.max.s.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_max_s_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.max.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_max_s_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: max_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_max_s_h_test
+;
+@llvm_mips_max_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_max_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_max_s_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_max_s_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_max_s_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_max_s_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.max.s.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_max_s_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.max.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_max_s_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: max_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_max_s_w_test
+;
+@llvm_mips_max_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_max_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_max_s_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_max_s_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_max_s_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_max_s_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.max.s.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_max_s_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.max.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_max_s_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: max_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_max_s_d_test
+;
+@llvm_mips_max_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_max_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_max_u_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_max_u_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_max_u_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_max_u_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.max.u.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_max_u_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.max.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_max_u_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: max_u.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_max_u_b_test
+;
+@llvm_mips_max_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_max_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_max_u_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_max_u_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_max_u_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_max_u_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.max.u.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_max_u_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.max.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_max_u_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: max_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_max_u_h_test
+;
+@llvm_mips_max_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_max_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_max_u_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_max_u_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_max_u_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_max_u_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.max.u.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_max_u_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.max.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_max_u_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: max_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_max_u_w_test
+;
+@llvm_mips_max_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_max_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_max_u_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_max_u_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_max_u_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_max_u_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.max.u.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_max_u_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.max.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_max_u_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: max_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_max_u_d_test
+;
+@llvm_mips_min_a_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_min_a_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_min_a_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_min_a_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_min_a_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_min_a_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.min.a.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_min_a_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.min.a.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_min_a_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: min_a.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_min_a_b_test
+;
+@llvm_mips_min_a_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_min_a_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_min_a_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_min_a_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_min_a_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_min_a_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.min.a.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_min_a_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.min.a.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_min_a_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: min_a.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_min_a_h_test
+;
+@llvm_mips_min_a_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_min_a_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_min_a_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_min_a_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_min_a_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_min_a_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.min.a.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_min_a_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.min.a.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_min_a_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: min_a.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_min_a_w_test
+;
+@llvm_mips_min_a_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_min_a_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_min_a_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_min_a_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_min_a_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_min_a_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.min.a.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_min_a_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.min.a.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_min_a_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: min_a.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_min_a_d_test
+;
+@llvm_mips_min_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_min_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_min_s_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_min_s_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_min_s_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_min_s_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.min.s.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_min_s_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.min.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_min_s_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: min_s.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_min_s_b_test
+;
+@llvm_mips_min_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_min_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_min_s_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_min_s_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_min_s_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_min_s_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.min.s.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_min_s_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.min.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_min_s_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: min_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_min_s_h_test
+;
+@llvm_mips_min_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_min_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_min_s_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_min_s_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_min_s_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_min_s_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.min.s.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_min_s_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.min.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_min_s_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: min_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_min_s_w_test
+;
+@llvm_mips_min_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_min_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_min_s_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_min_s_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_min_s_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_min_s_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.min.s.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_min_s_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.min.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_min_s_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: min_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_min_s_d_test
+;
+@llvm_mips_min_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_min_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_min_u_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_min_u_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_min_u_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_min_u_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.min.u.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_min_u_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.min.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_min_u_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: min_u.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_min_u_b_test
+;
+@llvm_mips_min_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_min_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_min_u_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_min_u_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_min_u_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_min_u_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.min.u.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_min_u_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.min.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_min_u_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: min_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_min_u_h_test
+;
+@llvm_mips_min_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_min_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_min_u_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_min_u_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_min_u_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_min_u_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.min.u.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_min_u_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.min.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_min_u_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: min_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_min_u_w_test
+;
+@llvm_mips_min_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_min_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_min_u_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_min_u_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_min_u_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_min_u_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.min.u.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_min_u_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.min.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_min_u_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: min_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_min_u_d_test
+;
+@llvm_mips_mod_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_mod_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_mod_s_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_mod_s_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_mod_s_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_mod_s_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.mod.s.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_mod_s_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.mod.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_mod_s_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: mod_s.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_mod_s_b_test
+;
+@llvm_mips_mod_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_mod_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_mod_s_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_mod_s_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_mod_s_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_mod_s_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.mod.s.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_mod_s_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.mod.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_mod_s_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: mod_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_mod_s_h_test
+;
+@llvm_mips_mod_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_mod_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_mod_s_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_mod_s_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_mod_s_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_mod_s_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.mod.s.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_mod_s_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.mod.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_mod_s_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: mod_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_mod_s_w_test
+;
+@llvm_mips_mod_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_mod_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_mod_s_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_mod_s_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_mod_s_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_mod_s_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.mod.s.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_mod_s_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.mod.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_mod_s_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: mod_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_mod_s_d_test
+;
+@llvm_mips_mod_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_mod_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_mod_u_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_mod_u_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_mod_u_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_mod_u_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.mod.u.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_mod_u_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.mod.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_mod_u_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: mod_u.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_mod_u_b_test
+;
+@llvm_mips_mod_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_mod_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_mod_u_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_mod_u_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_mod_u_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_mod_u_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.mod.u.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_mod_u_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.mod.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_mod_u_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: mod_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_mod_u_h_test
+;
+@llvm_mips_mod_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_mod_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_mod_u_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_mod_u_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_mod_u_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_mod_u_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.mod.u.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_mod_u_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.mod.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_mod_u_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: mod_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_mod_u_w_test
+;
+@llvm_mips_mod_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_mod_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_mod_u_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_mod_u_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_mod_u_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_mod_u_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.mod.u.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_mod_u_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.mod.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_mod_u_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: mod_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_mod_u_d_test
+;
+@llvm_mips_mulv_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_mulv_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_mulv_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_mulv_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_mulv_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_mulv_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.mulv.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_mulv_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.mulv.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_mulv_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: mulv.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_mulv_b_test
+;
+@llvm_mips_mulv_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_mulv_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_mulv_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_mulv_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_mulv_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_mulv_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.mulv.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_mulv_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.mulv.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_mulv_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: mulv.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_mulv_h_test
+;
+@llvm_mips_mulv_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_mulv_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_mulv_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_mulv_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_mulv_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_mulv_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.mulv.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_mulv_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.mulv.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_mulv_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: mulv.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_mulv_w_test
+;
+@llvm_mips_mulv_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_mulv_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_mulv_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_mulv_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_mulv_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_mulv_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.mulv.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_mulv_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.mulv.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_mulv_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: mulv.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_mulv_d_test
+
+define void @mulv_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_mulv_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_mulv_b_ARG2
+ %2 = mul <16 x i8> %0, %1
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_mulv_b_RES
+ ret void
+}
+
+; CHECK: mulv_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: mulv.b
+; CHECK: st.b
+; CHECK: .size mulv_b_test
+
+define void @mulv_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_mulv_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_mulv_h_ARG2
+ %2 = mul <8 x i16> %0, %1
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_mulv_h_RES
+ ret void
+}
+
+; CHECK: mulv_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: mulv.h
+; CHECK: st.h
+; CHECK: .size mulv_h_test
+
+define void @mulv_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_mulv_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_mulv_w_ARG2
+ %2 = mul <4 x i32> %0, %1
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_mulv_w_RES
+ ret void
+}
+
+; CHECK: mulv_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: mulv.w
+; CHECK: st.w
+; CHECK: .size mulv_w_test
+
+define void @mulv_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_mulv_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_mulv_d_ARG2
+ %2 = mul <2 x i64> %0, %1
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_mulv_d_RES
+ ret void
+}
+
+; CHECK: mulv_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: mulv.d
+; CHECK: st.d
+; CHECK: .size mulv_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3r-p.ll b/test/CodeGen/Mips/msa/3r-p.ll
new file mode 100644
index 0000000..852023b
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r-p.ll
@@ -0,0 +1,182 @@
+; Test the MSA intrinsics that are encoded with the 3R instruction format.
+; There are lots of these, so this file covers the ones beginning with 'p'.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
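+;
+; Aside for readers of this patch (an explanatory note, not generated test
+; content): in the RUN lines above, %s is lit's placeholder for this file.
+; Assuming llc and FileCheck from the same build are on PATH, a rough
+; by-hand equivalent of the first RUN line would be:
+;   llc -march=mips -mattr=+msa,+fp64 < 3r-p.ll | FileCheck 3r-p.ll
+; or simply point llvm-lit at the file from the build directory.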
+
+@llvm_mips_pckev_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_pckev_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_pckev_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_pckev_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_pckev_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_pckev_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.pckev.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_pckev_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.pckev.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_pckev_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: pckev.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_pckev_b_test
+;
+@llvm_mips_pckev_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_pckev_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_pckev_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_pckev_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_pckev_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_pckev_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.pckev.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_pckev_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.pckev.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_pckev_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: pckev.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_pckev_h_test
+;
+@llvm_mips_pckev_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_pckev_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_pckev_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_pckev_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_pckev_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_pckev_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.pckev.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_pckev_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.pckev.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_pckev_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: pckev.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_pckev_w_test
+;
+@llvm_mips_pckev_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_pckev_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_pckev_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_pckev_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_pckev_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_pckev_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.pckev.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_pckev_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.pckev.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_pckev_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: pckev.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_pckev_d_test
+;
+@llvm_mips_pckod_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_pckod_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_pckod_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_pckod_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_pckod_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_pckod_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.pckod.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_pckod_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.pckod.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_pckod_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: pckod.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_pckod_b_test
+;
+@llvm_mips_pckod_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_pckod_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_pckod_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_pckod_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_pckod_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_pckod_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.pckod.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_pckod_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.pckod.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_pckod_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: pckod.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_pckod_h_test
+;
+@llvm_mips_pckod_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_pckod_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_pckod_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_pckod_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_pckod_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_pckod_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.pckod.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_pckod_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.pckod.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_pckod_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: pckod.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_pckod_w_test
+;
+@llvm_mips_pckod_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_pckod_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_pckod_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_pckod_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_pckod_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_pckod_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.pckod.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_pckod_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.pckod.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_pckod_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: pckod.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_pckod_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3r-s.ll b/test/CodeGen/Mips/msa/3r-s.ll
new file mode 100644
index 0000000..30cf265
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r-s.ll
@@ -0,0 +1,1353 @@
+; Test the MSA intrinsics that are encoded with the 3R instruction format.
+; There are lots of these, so this file covers the ones beginning with 's'.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
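+;
+; Note on the FileCheck idiom used throughout this file (an explanatory
+; aside, not generated output): a pattern of the form [[R1:\$[0-9]+]]
+; captures whichever GPR llc happened to pick, later uses of [[R1]] must
+; match that same register, and the DAG-suffixed directives that sit
+; between plain directives may match in any order, so the two %got loads
+; in each test are accepted whichever way round they are emitted.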
+
+@llvm_mips_sld_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_sld_b_ARG2 = global i32 10, align 16
+@llvm_mips_sld_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_sld_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_sld_b_ARG1
+ %1 = load i32* @llvm_mips_sld_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.sld.b(<16 x i8> %0, i32 %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_sld_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.sld.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_sld_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sld_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sld_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: lw [[RT:\$[0-9]+]], 0([[R2]])
+; CHECK-DAG: sld.b [[WD:\$w[0-9]+]], [[WS]]{{\[}}[[RT]]{{\]}}
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size llvm_mips_sld_b_test
+;
+@llvm_mips_sld_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_sld_h_ARG2 = global i32 10, align 16
+@llvm_mips_sld_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_sld_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_sld_h_ARG1
+ %1 = load i32* @llvm_mips_sld_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.sld.h(<8 x i16> %0, i32 %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_sld_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.sld.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_sld_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sld_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sld_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: lw [[RT:\$[0-9]+]], 0([[R2]])
+; CHECK-DAG: sld.h [[WD:\$w[0-9]+]], [[WS]]{{\[}}[[RT]]{{\]}}
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size llvm_mips_sld_h_test
+;
+@llvm_mips_sld_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_sld_w_ARG2 = global i32 10, align 16
+@llvm_mips_sld_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_sld_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_sld_w_ARG1
+ %1 = load i32* @llvm_mips_sld_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.sld.w(<4 x i32> %0, i32 %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_sld_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.sld.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_sld_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sld_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sld_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: lw [[RT:\$[0-9]+]], 0([[R2]])
+; CHECK-DAG: sld.w [[WD:\$w[0-9]+]], [[WS]]{{\[}}[[RT]]{{\]}}
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size llvm_mips_sld_w_test
+;
+@llvm_mips_sld_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_sld_d_ARG2 = global i32 10, align 16
+@llvm_mips_sld_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_sld_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_sld_d_ARG1
+ %1 = load i32* @llvm_mips_sld_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.sld.d(<2 x i64> %0, i32 %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_sld_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.sld.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_sld_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sld_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sld_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: lw [[RT:\$[0-9]+]], 0([[R2]])
+; CHECK-DAG: sld.d [[WD:\$w[0-9]+]], [[WS]]{{\[}}[[RT]]{{\]}}
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size llvm_mips_sld_d_test
+;
+@llvm_mips_sll_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_sll_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_sll_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_sll_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_sll_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_sll_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.sll.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_sll_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.sll.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_sll_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sll_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sll_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sll.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size llvm_mips_sll_b_test
+;
+@llvm_mips_sll_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_sll_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_sll_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_sll_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_sll_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_sll_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.sll.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_sll_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.sll.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_sll_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sll_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sll_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sll.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size llvm_mips_sll_h_test
+;
+@llvm_mips_sll_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_sll_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_sll_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_sll_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_sll_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_sll_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.sll.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_sll_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.sll.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_sll_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sll_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sll_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sll.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size llvm_mips_sll_w_test
+;
+@llvm_mips_sll_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_sll_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_sll_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_sll_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_sll_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_sll_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.sll.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_sll_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.sll.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_sll_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sll_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sll_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sll.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size llvm_mips_sll_d_test
+
+define void @sll_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_sll_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_sll_b_ARG2
+ %2 = shl <16 x i8> %0, %1
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_sll_b_RES
+ ret void
+}
+
+; CHECK: sll_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sll_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sll_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sll.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size sll_b_test
+
+define void @sll_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_sll_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_sll_h_ARG2
+ %2 = shl <8 x i16> %0, %1
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_sll_h_RES
+ ret void
+}
+
+; CHECK: sll_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sll_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sll_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sll.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size sll_h_test
+
+define void @sll_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_sll_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_sll_w_ARG2
+ %2 = shl <4 x i32> %0, %1
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_sll_w_RES
+ ret void
+}
+
+; CHECK: sll_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sll_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sll_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sll.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size sll_w_test
+
+define void @sll_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_sll_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_sll_d_ARG2
+ %2 = shl <2 x i64> %0, %1
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_sll_d_RES
+ ret void
+}
+
+; CHECK: sll_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sll_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sll_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sll.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size sll_d_test
+;
+@llvm_mips_sra_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_sra_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_sra_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_sra_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_sra_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_sra_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.sra.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_sra_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.sra.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_sra_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sra_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sra_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sra.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size llvm_mips_sra_b_test
+;
+@llvm_mips_sra_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_sra_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_sra_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_sra_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_sra_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_sra_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.sra.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_sra_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.sra.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_sra_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sra_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sra_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sra.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size llvm_mips_sra_h_test
+;
+@llvm_mips_sra_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_sra_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_sra_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_sra_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_sra_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_sra_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.sra.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_sra_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.sra.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_sra_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sra_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sra_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sra.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size llvm_mips_sra_w_test
+;
+@llvm_mips_sra_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_sra_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_sra_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_sra_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_sra_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_sra_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.sra.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_sra_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.sra.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_sra_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sra_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sra_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sra.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size llvm_mips_sra_d_test
+;
+
+define void @sra_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_sra_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_sra_b_ARG2
+ %2 = ashr <16 x i8> %0, %1
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_sra_b_RES
+ ret void
+}
+
+; CHECK: sra_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sra_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sra_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sra.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size sra_b_test
+
+define void @sra_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_sra_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_sra_h_ARG2
+ %2 = ashr <8 x i16> %0, %1
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_sra_h_RES
+ ret void
+}
+
+; CHECK: sra_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sra_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sra_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sra.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size sra_h_test
+
+define void @sra_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_sra_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_sra_w_ARG2
+ %2 = ashr <4 x i32> %0, %1
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_sra_w_RES
+ ret void
+}
+
+; CHECK: sra_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sra_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sra_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sra.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size sra_w_test
+
+define void @sra_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_sra_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_sra_d_ARG2
+ %2 = ashr <2 x i64> %0, %1
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_sra_d_RES
+ ret void
+}
+
+; CHECK: sra_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_sra_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_sra_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: sra.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size sra_d_test
+
+@llvm_mips_srar_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_srar_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_srar_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_srar_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_srar_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_srar_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.srar.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_srar_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.srar.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_srar_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srar_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srar_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srar.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size llvm_mips_srar_b_test
+;
+@llvm_mips_srar_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_srar_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_srar_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_srar_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_srar_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_srar_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.srar.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_srar_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.srar.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_srar_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srar_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srar_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srar.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size llvm_mips_srar_h_test
+;
+@llvm_mips_srar_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_srar_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_srar_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_srar_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_srar_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_srar_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.srar.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_srar_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.srar.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_srar_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srar_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srar_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srar.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size llvm_mips_srar_w_test
+;
+@llvm_mips_srar_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_srar_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_srar_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_srar_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_srar_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_srar_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.srar.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_srar_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.srar.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_srar_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srar_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srar_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srar.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size llvm_mips_srar_d_test
+;
+@llvm_mips_srl_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_srl_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_srl_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_srl_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_srl_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_srl_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.srl.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_srl_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.srl.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_srl_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srl_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srl_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srl.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size llvm_mips_srl_b_test
+;
+@llvm_mips_srl_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_srl_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_srl_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_srl_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_srl_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_srl_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.srl.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_srl_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.srl.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_srl_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srl_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srl_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srl.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size llvm_mips_srl_h_test
+;
+@llvm_mips_srl_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_srl_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_srl_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_srl_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_srl_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_srl_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.srl.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_srl_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.srl.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_srl_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srl_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srl_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srl.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size llvm_mips_srl_w_test
+;
+@llvm_mips_srl_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_srl_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_srl_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_srl_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_srl_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_srl_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.srl.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_srl_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.srl.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_srl_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srl_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srl_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srl.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size llvm_mips_srl_d_test
+;
+@llvm_mips_srlr_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_srlr_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_srlr_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_srlr_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_srlr_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_srlr_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.srlr.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_srlr_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.srlr.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_srlr_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srlr_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srlr_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srlr.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size llvm_mips_srlr_b_test
+;
+@llvm_mips_srlr_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_srlr_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_srlr_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_srlr_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_srlr_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_srlr_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.srlr.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_srlr_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.srlr.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_srlr_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srlr_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srlr_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srlr.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size llvm_mips_srlr_h_test
+;
+@llvm_mips_srlr_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_srlr_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_srlr_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_srlr_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_srlr_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_srlr_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.srlr.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_srlr_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.srlr.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_srlr_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srlr_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srlr_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srlr.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size llvm_mips_srlr_w_test
+;
+@llvm_mips_srlr_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_srlr_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_srlr_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_srlr_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_srlr_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_srlr_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.srlr.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_srlr_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.srlr.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_srlr_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srlr_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srlr_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srlr.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size llvm_mips_srlr_d_test
+;
+
+define void @srl_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_srl_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_srl_b_ARG2
+ %2 = lshr <16 x i8> %0, %1
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_srl_b_RES
+ ret void
+}
+
+; CHECK: srl_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srl_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srl_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srl.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size srl_b_test
+
+define void @srl_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_srl_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_srl_h_ARG2
+ %2 = lshr <8 x i16> %0, %1
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_srl_h_RES
+ ret void
+}
+
+; CHECK: srl_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srl_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srl_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srl.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size srl_h_test
+
+define void @srl_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_srl_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_srl_w_ARG2
+ %2 = lshr <4 x i32> %0, %1
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_srl_w_RES
+ ret void
+}
+
+; CHECK: srl_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srl_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srl_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srl.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size srl_w_test
+
+define void @srl_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_srl_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_srl_d_ARG2
+ %2 = lshr <2 x i64> %0, %1
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_srl_d_RES
+ ret void
+}
+
+; CHECK: srl_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_srl_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_srl_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: srl.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size srl_d_test
+
+@llvm_mips_subs_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_subs_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_subs_s_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_subs_s_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_subs_s_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_subs_s_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.subs.s.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_subs_s_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.subs.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_subs_s_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subs_s_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subs_s_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subs_s.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size llvm_mips_subs_s_b_test
+;
+@llvm_mips_subs_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_subs_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_subs_s_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_subs_s_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_subs_s_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_subs_s_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.subs.s.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_subs_s_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.subs.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_subs_s_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subs_s_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subs_s_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subs_s.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size llvm_mips_subs_s_h_test
+;
+@llvm_mips_subs_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_subs_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_subs_s_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_subs_s_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_subs_s_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_subs_s_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.subs.s.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_subs_s_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.subs.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_subs_s_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subs_s_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subs_s_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subs_s.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size llvm_mips_subs_s_w_test
+;
+@llvm_mips_subs_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_subs_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_subs_s_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_subs_s_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_subs_s_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_subs_s_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.subs.s.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_subs_s_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.subs.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_subs_s_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subs_s_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subs_s_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subs_s.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size llvm_mips_subs_s_d_test
+;
+@llvm_mips_subs_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_subs_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_subs_u_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_subs_u_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_subs_u_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_subs_u_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.subs.u.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_subs_u_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.subs.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_subs_u_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subs_u_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subs_u_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subs_u.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size llvm_mips_subs_u_b_test
+;
+@llvm_mips_subs_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_subs_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_subs_u_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_subs_u_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_subs_u_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_subs_u_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.subs.u.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_subs_u_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.subs.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_subs_u_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subs_u_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subs_u_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subs_u.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size llvm_mips_subs_u_h_test
+;
+@llvm_mips_subs_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_subs_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_subs_u_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_subs_u_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_subs_u_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_subs_u_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.subs.u.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_subs_u_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.subs.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_subs_u_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subs_u_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subs_u_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subs_u.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size llvm_mips_subs_u_w_test
+;
+@llvm_mips_subs_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_subs_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_subs_u_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_subs_u_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_subs_u_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_subs_u_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.subs.u.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_subs_u_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.subs.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_subs_u_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subs_u_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subs_u_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subs_u.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size llvm_mips_subs_u_d_test
+;
+@llvm_mips_subsus_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_subsus_u_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_subsus_u_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_subsus_u_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_subsus_u_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_subsus_u_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.subsus.u.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_subsus_u_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.subsus.u.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_subsus_u_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subsus_u_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subsus_u_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subsus_u.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size llvm_mips_subsus_u_b_test
+;
+@llvm_mips_subsus_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_subsus_u_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_subsus_u_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_subsus_u_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_subsus_u_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_subsus_u_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.subsus.u.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_subsus_u_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.subsus.u.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_subsus_u_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subsus_u_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subsus_u_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subsus_u.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size llvm_mips_subsus_u_h_test
+;
+@llvm_mips_subsus_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_subsus_u_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_subsus_u_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_subsus_u_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_subsus_u_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_subsus_u_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.subsus.u.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_subsus_u_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.subsus.u.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_subsus_u_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subsus_u_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subsus_u_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subsus_u.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size llvm_mips_subsus_u_w_test
+;
+@llvm_mips_subsus_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_subsus_u_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_subsus_u_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_subsus_u_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_subsus_u_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_subsus_u_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.subsus.u.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_subsus_u_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.subsus.u.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_subsus_u_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subsus_u_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subsus_u_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subsus_u.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size llvm_mips_subsus_u_d_test
+;
+@llvm_mips_subsuu_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_subsuu_s_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_subsuu_s_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_subsuu_s_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_subsuu_s_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_subsuu_s_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.subsuu.s.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_subsuu_s_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.subsuu.s.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_subsuu_s_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subsuu_s_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subsuu_s_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subsuu_s.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size llvm_mips_subsuu_s_b_test
+;
+@llvm_mips_subsuu_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_subsuu_s_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_subsuu_s_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_subsuu_s_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_subsuu_s_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_subsuu_s_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.subsuu.s.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_subsuu_s_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.subsuu.s.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_subsuu_s_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subsuu_s_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subsuu_s_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subsuu_s.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size llvm_mips_subsuu_s_h_test
+;
+@llvm_mips_subsuu_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_subsuu_s_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_subsuu_s_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_subsuu_s_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_subsuu_s_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_subsuu_s_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.subsuu.s.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_subsuu_s_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.subsuu.s.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_subsuu_s_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subsuu_s_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subsuu_s_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subsuu_s.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size llvm_mips_subsuu_s_w_test
+;
+@llvm_mips_subsuu_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_subsuu_s_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_subsuu_s_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_subsuu_s_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_subsuu_s_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_subsuu_s_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.subsuu.s.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_subsuu_s_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.subsuu.s.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_subsuu_s_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subsuu_s_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subsuu_s_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subsuu_s.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size llvm_mips_subsuu_s_d_test
+;
+@llvm_mips_subv_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_subv_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_subv_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_subv_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_subv_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_subv_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.subv.b(<16 x i8> %0, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_subv_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.subv.b(<16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_subv_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subv_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subv_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subv.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size llvm_mips_subv_b_test
+;
+@llvm_mips_subv_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_subv_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_subv_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_subv_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_subv_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_subv_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.subv.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_subv_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.subv.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_subv_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subv_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subv_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subv.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size llvm_mips_subv_h_test
+;
+@llvm_mips_subv_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_subv_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_subv_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_subv_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_subv_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_subv_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.subv.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_subv_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.subv.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_subv_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subv_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subv_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subv.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size llvm_mips_subv_w_test
+;
+@llvm_mips_subv_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_subv_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_subv_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_subv_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_subv_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_subv_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.subv.d(<2 x i64> %0, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_subv_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.subv.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_subv_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subv_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subv_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subv.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size llvm_mips_subv_d_test
+;
+
+define void @subv_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_subv_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_subv_b_ARG2
+ %2 = sub <16 x i8> %0, %1
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_subv_b_RES
+ ret void
+}
+
+; CHECK: subv_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subv_b_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subv_b_ARG2)
+; CHECK-DAG: ld.b [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subv.b [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.b [[WD]]
+; CHECK: .size subv_b_test
+
+define void @subv_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_subv_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_subv_h_ARG2
+ %2 = sub <8 x i16> %0, %1
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_subv_h_RES
+ ret void
+}
+
+; CHECK: subv_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subv_h_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subv_h_ARG2)
+; CHECK-DAG: ld.h [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subv.h [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.h [[WD]]
+; CHECK: .size subv_h_test
+
+define void @subv_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_subv_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_subv_w_ARG2
+ %2 = sub <4 x i32> %0, %1
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_subv_w_RES
+ ret void
+}
+
+; CHECK: subv_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subv_w_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subv_w_ARG2)
+; CHECK-DAG: ld.w [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subv.w [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.w [[WD]]
+; CHECK: .size subv_w_test
+
+define void @subv_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_subv_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_subv_d_ARG2
+ %2 = sub <2 x i64> %0, %1
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_subv_d_RES
+ ret void
+}
+
+; CHECK: subv_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_subv_d_ARG1)
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_subv_d_ARG2)
+; CHECK-DAG: ld.d [[WS:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[WT:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: subv.d [[WD:\$w[0-9]+]], [[WS]], [[WT]]
+; CHECK-DAG: st.d [[WD]]
+; CHECK: .size subv_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3r-v.ll b/test/CodeGen/Mips/msa/3r-v.ll
new file mode 100644
index 0000000..c9693f9
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r-v.ll
@@ -0,0 +1,105 @@
+; Test the MSA intrinsics that are encoded with the 3R instruction format.
+; There are lots of these, so this file covers those beginning with 'v'.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_vshf_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_vshf_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_vshf_b_ARG3 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_vshf_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_vshf_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_vshf_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_vshf_b_ARG2
+ %2 = load <16 x i8>* @llvm_mips_vshf_b_ARG3
+ %3 = tail call <16 x i8> @llvm.mips.vshf.b(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2)
+ store <16 x i8> %3, <16 x i8>* @llvm_mips_vshf_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.vshf.b(<16 x i8>, <16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_vshf_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: vshf.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_vshf_b_test
+;
+@llvm_mips_vshf_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_vshf_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_vshf_h_ARG3 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_vshf_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_vshf_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_vshf_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_vshf_h_ARG2
+ %2 = load <8 x i16>* @llvm_mips_vshf_h_ARG3
+ %3 = tail call <8 x i16> @llvm.mips.vshf.h(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
+ store <8 x i16> %3, <8 x i16>* @llvm_mips_vshf_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.vshf.h(<8 x i16>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_vshf_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: vshf.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_vshf_h_test
+;
+@llvm_mips_vshf_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_vshf_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_vshf_w_ARG3 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_vshf_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_vshf_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_vshf_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_vshf_w_ARG2
+ %2 = load <4 x i32>* @llvm_mips_vshf_w_ARG3
+ %3 = tail call <4 x i32> @llvm.mips.vshf.w(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
+ store <4 x i32> %3, <4 x i32>* @llvm_mips_vshf_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.vshf.w(<4 x i32>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_vshf_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: vshf.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_vshf_w_test
+;
+@llvm_mips_vshf_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_vshf_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_vshf_d_ARG3 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_vshf_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_vshf_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_vshf_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_vshf_d_ARG2
+ %2 = load <2 x i64>* @llvm_mips_vshf_d_ARG3
+ %3 = tail call <2 x i64> @llvm.mips.vshf.d(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2)
+ store <2 x i64> %3, <2 x i64>* @llvm_mips_vshf_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.vshf.d(<2 x i64>, <2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_vshf_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: vshf.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_vshf_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3r_4r.ll b/test/CodeGen/Mips/msa/3r_4r.ll
new file mode 100644
index 0000000..b7fd728
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r_4r.ll
@@ -0,0 +1,206 @@
+; Test the MSA intrinsics that are encoded with the 3R instruction format and
+; use the result as a third operand.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_maddv_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_maddv_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_maddv_b_ARG3 = global <16 x i8> <i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39, i8 40, i8 41, i8 42, i8 43, i8 44, i8 45, i8 46, i8 47>, align 16
+@llvm_mips_maddv_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_maddv_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_maddv_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_maddv_b_ARG2
+ %2 = load <16 x i8>* @llvm_mips_maddv_b_ARG3
+ %3 = tail call <16 x i8> @llvm.mips.maddv.b(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2)
+ store <16 x i8> %3, <16 x i8>* @llvm_mips_maddv_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.maddv.b(<16 x i8>, <16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_maddv_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: maddv.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_maddv_b_test
+;
+@llvm_mips_maddv_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_maddv_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_maddv_h_ARG3 = global <8 x i16> <i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23>, align 16
+@llvm_mips_maddv_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_maddv_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_maddv_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_maddv_h_ARG2
+ %2 = load <8 x i16>* @llvm_mips_maddv_h_ARG3
+ %3 = tail call <8 x i16> @llvm.mips.maddv.h(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
+ store <8 x i16> %3, <8 x i16>* @llvm_mips_maddv_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.maddv.h(<8 x i16>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_maddv_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: maddv.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_maddv_h_test
+;
+@llvm_mips_maddv_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_maddv_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_maddv_w_ARG3 = global <4 x i32> <i32 8, i32 9, i32 10, i32 11>, align 16
+@llvm_mips_maddv_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_maddv_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_maddv_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_maddv_w_ARG2
+ %2 = load <4 x i32>* @llvm_mips_maddv_w_ARG3
+ %3 = tail call <4 x i32> @llvm.mips.maddv.w(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
+ store <4 x i32> %3, <4 x i32>* @llvm_mips_maddv_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.maddv.w(<4 x i32>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_maddv_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: maddv.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_maddv_w_test
+;
+@llvm_mips_maddv_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_maddv_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_maddv_d_ARG3 = global <2 x i64> <i64 4, i64 5>, align 16
+@llvm_mips_maddv_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_maddv_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_maddv_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_maddv_d_ARG2
+ %2 = load <2 x i64>* @llvm_mips_maddv_d_ARG3
+ %3 = tail call <2 x i64> @llvm.mips.maddv.d(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2)
+ store <2 x i64> %3, <2 x i64>* @llvm_mips_maddv_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.maddv.d(<2 x i64>, <2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_maddv_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: maddv.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_maddv_d_test
+;
+@llvm_mips_msubv_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_msubv_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_msubv_b_ARG3 = global <16 x i8> <i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39, i8 40, i8 41, i8 42, i8 43, i8 44, i8 45, i8 46, i8 47>, align 16
+@llvm_mips_msubv_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_msubv_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_msubv_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_msubv_b_ARG2
+ %2 = load <16 x i8>* @llvm_mips_msubv_b_ARG3
+ %3 = tail call <16 x i8> @llvm.mips.msubv.b(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2)
+ store <16 x i8> %3, <16 x i8>* @llvm_mips_msubv_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.msubv.b(<16 x i8>, <16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_msubv_b_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: msubv.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_msubv_b_test
+;
+@llvm_mips_msubv_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_msubv_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_msubv_h_ARG3 = global <8 x i16> <i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23>, align 16
+@llvm_mips_msubv_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_msubv_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_msubv_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_msubv_h_ARG2
+ %2 = load <8 x i16>* @llvm_mips_msubv_h_ARG3
+ %3 = tail call <8 x i16> @llvm.mips.msubv.h(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
+ store <8 x i16> %3, <8 x i16>* @llvm_mips_msubv_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.msubv.h(<8 x i16>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_msubv_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: msubv.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_msubv_h_test
+;
+@llvm_mips_msubv_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_msubv_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_msubv_w_ARG3 = global <4 x i32> <i32 8, i32 9, i32 10, i32 11>, align 16
+@llvm_mips_msubv_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_msubv_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_msubv_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_msubv_w_ARG2
+ %2 = load <4 x i32>* @llvm_mips_msubv_w_ARG3
+ %3 = tail call <4 x i32> @llvm.mips.msubv.w(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
+ store <4 x i32> %3, <4 x i32>* @llvm_mips_msubv_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.msubv.w(<4 x i32>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_msubv_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: msubv.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_msubv_w_test
+;
+@llvm_mips_msubv_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_msubv_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_msubv_d_ARG3 = global <2 x i64> <i64 4, i64 5>, align 16
+@llvm_mips_msubv_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_msubv_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_msubv_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_msubv_d_ARG2
+ %2 = load <2 x i64>* @llvm_mips_msubv_d_ARG3
+ %3 = tail call <2 x i64> @llvm.mips.msubv.d(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2)
+ store <2 x i64> %3, <2 x i64>* @llvm_mips_msubv_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.msubv.d(<2 x i64>, <2 x i64>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_msubv_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: msubv.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_msubv_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3r_4r_widen.ll b/test/CodeGen/Mips/msa/3r_4r_widen.ll
new file mode 100644
index 0000000..7063e45
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r_4r_widen.ll
@@ -0,0 +1,307 @@
+; Test the MSA intrinsics that are encoded with the 3R instruction format,
+; use the result as a third operand, and produce wider elements than the
+; operands had.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_dpadd_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_dpadd_s_h_ARG2 = global <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>, align 16
+@llvm_mips_dpadd_s_h_ARG3 = global <16 x i8> <i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39>, align 16
+@llvm_mips_dpadd_s_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_dpadd_s_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_dpadd_s_h_ARG1
+ %1 = load <16 x i8>* @llvm_mips_dpadd_s_h_ARG2
+ %2 = load <16 x i8>* @llvm_mips_dpadd_s_h_ARG3
+ %3 = tail call <8 x i16> @llvm.mips.dpadd.s.h(<8 x i16> %0, <16 x i8> %1, <16 x i8> %2)
+ store <8 x i16> %3, <8 x i16>* @llvm_mips_dpadd_s_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.dpadd.s.h(<8 x i16>, <16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_dpadd_s_h_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ld.h
+; CHECK: dpadd_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_dpadd_s_h_test
+;
+@llvm_mips_dpadd_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_dpadd_s_w_ARG2 = global <8 x i16> <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>, align 16
+@llvm_mips_dpadd_s_w_ARG3 = global <8 x i16> <i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>, align 16
+@llvm_mips_dpadd_s_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_dpadd_s_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_dpadd_s_w_ARG1
+ %1 = load <8 x i16>* @llvm_mips_dpadd_s_w_ARG2
+ %2 = load <8 x i16>* @llvm_mips_dpadd_s_w_ARG3
+ %3 = tail call <4 x i32> @llvm.mips.dpadd.s.w(<4 x i32> %0, <8 x i16> %1, <8 x i16> %2)
+ store <4 x i32> %3, <4 x i32>* @llvm_mips_dpadd_s_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.dpadd.s.w(<4 x i32>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_dpadd_s_w_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ld.w
+; CHECK: dpadd_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_dpadd_s_w_test
+;
+@llvm_mips_dpadd_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_dpadd_s_d_ARG2 = global <4 x i32> <i32 2, i32 3, i32 4, i32 5>, align 16
+@llvm_mips_dpadd_s_d_ARG3 = global <4 x i32> <i32 6, i32 7, i32 8, i32 9>, align 16
+@llvm_mips_dpadd_s_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_dpadd_s_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_dpadd_s_d_ARG1
+ %1 = load <4 x i32>* @llvm_mips_dpadd_s_d_ARG2
+ %2 = load <4 x i32>* @llvm_mips_dpadd_s_d_ARG3
+ %3 = tail call <2 x i64> @llvm.mips.dpadd.s.d(<2 x i64> %0, <4 x i32> %1, <4 x i32> %2)
+ store <2 x i64> %3, <2 x i64>* @llvm_mips_dpadd_s_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.dpadd.s.d(<2 x i64>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_dpadd_s_d_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.d
+; CHECK: dpadd_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_dpadd_s_d_test
+;
+@llvm_mips_dpadd_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_dpadd_u_h_ARG2 = global <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>, align 16
+@llvm_mips_dpadd_u_h_ARG3 = global <16 x i8> <i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39>, align 16
+@llvm_mips_dpadd_u_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_dpadd_u_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_dpadd_u_h_ARG1
+ %1 = load <16 x i8>* @llvm_mips_dpadd_u_h_ARG2
+ %2 = load <16 x i8>* @llvm_mips_dpadd_u_h_ARG3
+ %3 = tail call <8 x i16> @llvm.mips.dpadd.u.h(<8 x i16> %0, <16 x i8> %1, <16 x i8> %2)
+ store <8 x i16> %3, <8 x i16>* @llvm_mips_dpadd_u_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.dpadd.u.h(<8 x i16>, <16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_dpadd_u_h_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ld.h
+; CHECK: dpadd_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_dpadd_u_h_test
+;
+@llvm_mips_dpadd_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_dpadd_u_w_ARG2 = global <8 x i16> <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>, align 16
+@llvm_mips_dpadd_u_w_ARG3 = global <8 x i16> <i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>, align 16
+@llvm_mips_dpadd_u_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_dpadd_u_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_dpadd_u_w_ARG1
+ %1 = load <8 x i16>* @llvm_mips_dpadd_u_w_ARG2
+ %2 = load <8 x i16>* @llvm_mips_dpadd_u_w_ARG3
+ %3 = tail call <4 x i32> @llvm.mips.dpadd.u.w(<4 x i32> %0, <8 x i16> %1, <8 x i16> %2)
+ store <4 x i32> %3, <4 x i32>* @llvm_mips_dpadd_u_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.dpadd.u.w(<4 x i32>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_dpadd_u_w_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ld.w
+; CHECK: dpadd_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_dpadd_u_w_test
+;
+@llvm_mips_dpadd_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_dpadd_u_d_ARG2 = global <4 x i32> <i32 2, i32 3, i32 4, i32 5>, align 16
+@llvm_mips_dpadd_u_d_ARG3 = global <4 x i32> <i32 6, i32 7, i32 8, i32 9>, align 16
+@llvm_mips_dpadd_u_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_dpadd_u_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_dpadd_u_d_ARG1
+ %1 = load <4 x i32>* @llvm_mips_dpadd_u_d_ARG2
+ %2 = load <4 x i32>* @llvm_mips_dpadd_u_d_ARG3
+ %3 = tail call <2 x i64> @llvm.mips.dpadd.u.d(<2 x i64> %0, <4 x i32> %1, <4 x i32> %2)
+ store <2 x i64> %3, <2 x i64>* @llvm_mips_dpadd_u_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.dpadd.u.d(<2 x i64>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_dpadd_u_d_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.d
+; CHECK: dpadd_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_dpadd_u_d_test
+;
+@llvm_mips_dpsub_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_dpsub_s_h_ARG2 = global <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>, align 16
+@llvm_mips_dpsub_s_h_ARG3 = global <16 x i8> <i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39>, align 16
+@llvm_mips_dpsub_s_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_dpsub_s_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_dpsub_s_h_ARG1
+ %1 = load <16 x i8>* @llvm_mips_dpsub_s_h_ARG2
+ %2 = load <16 x i8>* @llvm_mips_dpsub_s_h_ARG3
+ %3 = tail call <8 x i16> @llvm.mips.dpsub.s.h(<8 x i16> %0, <16 x i8> %1, <16 x i8> %2)
+ store <8 x i16> %3, <8 x i16>* @llvm_mips_dpsub_s_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.dpsub.s.h(<8 x i16>, <16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_dpsub_s_h_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ld.h
+; CHECK: dpsub_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_dpsub_s_h_test
+;
+@llvm_mips_dpsub_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_dpsub_s_w_ARG2 = global <8 x i16> <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>, align 16
+@llvm_mips_dpsub_s_w_ARG3 = global <8 x i16> <i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>, align 16
+@llvm_mips_dpsub_s_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_dpsub_s_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_dpsub_s_w_ARG1
+ %1 = load <8 x i16>* @llvm_mips_dpsub_s_w_ARG2
+ %2 = load <8 x i16>* @llvm_mips_dpsub_s_w_ARG3
+ %3 = tail call <4 x i32> @llvm.mips.dpsub.s.w(<4 x i32> %0, <8 x i16> %1, <8 x i16> %2)
+ store <4 x i32> %3, <4 x i32>* @llvm_mips_dpsub_s_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.dpsub.s.w(<4 x i32>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_dpsub_s_w_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ld.w
+; CHECK: dpsub_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_dpsub_s_w_test
+;
+@llvm_mips_dpsub_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_dpsub_s_d_ARG2 = global <4 x i32> <i32 2, i32 3, i32 4, i32 5>, align 16
+@llvm_mips_dpsub_s_d_ARG3 = global <4 x i32> <i32 6, i32 7, i32 8, i32 9>, align 16
+@llvm_mips_dpsub_s_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_dpsub_s_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_dpsub_s_d_ARG1
+ %1 = load <4 x i32>* @llvm_mips_dpsub_s_d_ARG2
+ %2 = load <4 x i32>* @llvm_mips_dpsub_s_d_ARG3
+ %3 = tail call <2 x i64> @llvm.mips.dpsub.s.d(<2 x i64> %0, <4 x i32> %1, <4 x i32> %2)
+ store <2 x i64> %3, <2 x i64>* @llvm_mips_dpsub_s_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.dpsub.s.d(<2 x i64>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_dpsub_s_d_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.d
+; CHECK: dpsub_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_dpsub_s_d_test
+;
+@llvm_mips_dpsub_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_dpsub_u_h_ARG2 = global <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>, align 16
+@llvm_mips_dpsub_u_h_ARG3 = global <16 x i8> <i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39>, align 16
+@llvm_mips_dpsub_u_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_dpsub_u_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_dpsub_u_h_ARG1
+ %1 = load <16 x i8>* @llvm_mips_dpsub_u_h_ARG2
+ %2 = load <16 x i8>* @llvm_mips_dpsub_u_h_ARG3
+ %3 = tail call <8 x i16> @llvm.mips.dpsub.u.h(<8 x i16> %0, <16 x i8> %1, <16 x i8> %2)
+ store <8 x i16> %3, <8 x i16>* @llvm_mips_dpsub_u_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.dpsub.u.h(<8 x i16>, <16 x i8>, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_dpsub_u_h_test:
+; CHECK: ld.b
+; CHECK: ld.b
+; CHECK: ld.h
+; CHECK: dpsub_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_dpsub_u_h_test
+;
+@llvm_mips_dpsub_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_dpsub_u_w_ARG2 = global <8 x i16> <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>, align 16
+@llvm_mips_dpsub_u_w_ARG3 = global <8 x i16> <i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>, align 16
+@llvm_mips_dpsub_u_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_dpsub_u_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_dpsub_u_w_ARG1
+ %1 = load <8 x i16>* @llvm_mips_dpsub_u_w_ARG2
+ %2 = load <8 x i16>* @llvm_mips_dpsub_u_w_ARG3
+ %3 = tail call <4 x i32> @llvm.mips.dpsub.u.w(<4 x i32> %0, <8 x i16> %1, <8 x i16> %2)
+ store <4 x i32> %3, <4 x i32>* @llvm_mips_dpsub_u_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.dpsub.u.w(<4 x i32>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_dpsub_u_w_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ld.w
+; CHECK: dpsub_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_dpsub_u_w_test
+;
+@llvm_mips_dpsub_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_dpsub_u_d_ARG2 = global <4 x i32> <i32 2, i32 3, i32 4, i32 5>, align 16
+@llvm_mips_dpsub_u_d_ARG3 = global <4 x i32> <i32 6, i32 7, i32 8, i32 9>, align 16
+@llvm_mips_dpsub_u_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_dpsub_u_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_dpsub_u_d_ARG1
+ %1 = load <4 x i32>* @llvm_mips_dpsub_u_d_ARG2
+ %2 = load <4 x i32>* @llvm_mips_dpsub_u_d_ARG3
+ %3 = tail call <2 x i64> @llvm.mips.dpsub.u.d(<2 x i64> %0, <4 x i32> %1, <4 x i32> %2)
+ store <2 x i64> %3, <2 x i64>* @llvm_mips_dpsub_u_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.dpsub.u.d(<2 x i64>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_dpsub_u_d_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.d
+; CHECK: dpsub_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_dpsub_u_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3r_splat.ll b/test/CodeGen/Mips/msa/3r_splat.ll
new file mode 100644
index 0000000..6b0cb26
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3r_splat.ll
@@ -0,0 +1,94 @@
+; Test the MSA splat intrinsics that are encoded with the 3R instruction
+; format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | \
+; RUN: FileCheck -check-prefix=MIPS32 %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | \
+; RUN: FileCheck -check-prefix=MIPS32 %s
+
+@llvm_mips_splat_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_splat_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_splat_b_test(i32 %a) nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_splat_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.splat.b(<16 x i8> %0, i32 %a)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_splat_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.splat.b(<16 x i8>, i32) nounwind
+
+; MIPS32: llvm_mips_splat_b_test:
+; MIPS32-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_splat_b_ARG1)(
+; MIPS32-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_splat_b_RES)(
+; MIPS32-DAG: ld.b [[R3:\$w[0-9]+]], 0([[R1]])
+; MIPS32-DAG: splat.b [[R4:\$w[0-9]+]], [[R3]][$4]
+; MIPS32-DAG: st.b [[R4]], 0([[R2]])
+; MIPS32: .size llvm_mips_splat_b_test
+
+@llvm_mips_splat_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_splat_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_splat_h_test(i32 %a) nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_splat_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.splat.h(<8 x i16> %0, i32 %a)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_splat_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.splat.h(<8 x i16>, i32) nounwind
+
+; MIPS32: llvm_mips_splat_h_test:
+; MIPS32-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_splat_h_ARG1)(
+; MIPS32-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_splat_h_RES)(
+; MIPS32-DAG: ld.h [[R3:\$w[0-9]+]], 0([[R1]])
+; MIPS32-DAG: splat.h [[R4:\$w[0-9]+]], [[R3]][$4]
+; MIPS32-DAG: st.h [[R4]], 0([[R2]])
+; MIPS32: .size llvm_mips_splat_h_test
+
+@llvm_mips_splat_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_splat_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_splat_w_test(i32 %a) nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_splat_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.splat.w(<4 x i32> %0, i32 %a)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_splat_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.splat.w(<4 x i32>, i32) nounwind
+
+; MIPS32: llvm_mips_splat_w_test:
+; MIPS32-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_splat_w_ARG1)(
+; MIPS32-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_splat_w_RES)(
+; MIPS32-DAG: ld.w [[R3:\$w[0-9]+]], 0([[R1]])
+; MIPS32-DAG: splat.w [[R4:\$w[0-9]+]], [[R3]][$4]
+; MIPS32-DAG: st.w [[R4]], 0([[R2]])
+; MIPS32: .size llvm_mips_splat_w_test
+
+@llvm_mips_splat_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_splat_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_splat_d_test(i32 %a) nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_splat_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.splat.d(<2 x i64> %0, i32 %a)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_splat_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.splat.d(<2 x i64>, i32) nounwind
+
+; MIPS32: llvm_mips_splat_d_test:
+; FIXME: This test is currently disabled for MIPS32 because the indices are
+; difficult to match. This is because 64-bit values cannot be stored in
+; GPR32.
+; MIPS64-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_splat_d_ARG1)(
+; MIPS64-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_splat_d_RES)(
+; MIPS64-DAG: ld.d [[R3:\$w[0-9]+]], 0([[R1]])
+; MIPS64-DAG: splat.d [[R4:\$w[0-9]+]], [[R3]][$4]
+; MIPS64-DAG: st.d [[R4]], 0([[R2]])
+; MIPS32: .size llvm_mips_splat_d_test
diff --git a/test/CodeGen/Mips/msa/3rf.ll b/test/CodeGen/Mips/msa/3rf.ll
new file mode 100644
index 0000000..ae665af
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3rf.ll
@@ -0,0 +1,485 @@
+; Test the MSA intrinsics that are encoded with the 3RF instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_fadd_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fadd_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fadd_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fadd_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fadd_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fadd_w_ARG2
+ %2 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %0, <4 x float> %1)
+ store <4 x float> %2, <4 x float>* @llvm_mips_fadd_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.mips.fadd.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fadd_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fadd.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fadd_w_test
+;
+@llvm_mips_fadd_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fadd_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fadd_d_RES = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fadd_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fadd_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fadd_d_ARG2
+ %2 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %0, <2 x double> %1)
+ store <2 x double> %2, <2 x double>* @llvm_mips_fadd_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.mips.fadd.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fadd_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fadd.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fadd_d_test
+
+define void @fadd_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fadd_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fadd_w_ARG2
+ %2 = fadd <4 x float> %0, %1
+ store <4 x float> %2, <4 x float>* @llvm_mips_fadd_w_RES
+ ret void
+}
+
+; CHECK: fadd_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fadd.w
+; CHECK: st.w
+; CHECK: .size fadd_w_test
+
+define void @fadd_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fadd_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fadd_d_ARG2
+ %2 = fadd <2 x double> %0, %1
+ store <2 x double> %2, <2 x double>* @llvm_mips_fadd_d_RES
+ ret void
+}
+
+; CHECK: fadd_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fadd.d
+; CHECK: st.d
+; CHECK: .size fadd_d_test
+;
+@llvm_mips_fdiv_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fdiv_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fdiv_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fdiv_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fdiv_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fdiv_w_ARG2
+ %2 = tail call <4 x float> @llvm.mips.fdiv.w(<4 x float> %0, <4 x float> %1)
+ store <4 x float> %2, <4 x float>* @llvm_mips_fdiv_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.mips.fdiv.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fdiv_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fdiv.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fdiv_w_test
+;
+@llvm_mips_fdiv_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fdiv_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fdiv_d_RES = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fdiv_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fdiv_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fdiv_d_ARG2
+ %2 = tail call <2 x double> @llvm.mips.fdiv.d(<2 x double> %0, <2 x double> %1)
+ store <2 x double> %2, <2 x double>* @llvm_mips_fdiv_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.mips.fdiv.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fdiv_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fdiv.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fdiv_d_test
+
+define void @fdiv_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fdiv_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fdiv_w_ARG2
+ %2 = fdiv <4 x float> %0, %1
+ store <4 x float> %2, <4 x float>* @llvm_mips_fdiv_w_RES
+ ret void
+}
+
+; CHECK: fdiv_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fdiv.w
+; CHECK: st.w
+; CHECK: .size fdiv_w_test
+
+define void @fdiv_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fdiv_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fdiv_d_ARG2
+ %2 = fdiv <2 x double> %0, %1
+ store <2 x double> %2, <2 x double>* @llvm_mips_fdiv_d_RES
+ ret void
+}
+
+; CHECK: fdiv_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fdiv.d
+; CHECK: st.d
+; CHECK: .size fdiv_d_test
+;
+@llvm_mips_fmin_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fmin_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fmin_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fmin_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fmin_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fmin_w_ARG2
+ %2 = tail call <4 x float> @llvm.mips.fmin.w(<4 x float> %0, <4 x float> %1)
+ store <4 x float> %2, <4 x float>* @llvm_mips_fmin_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.mips.fmin.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fmin_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fmin.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fmin_w_test
+;
+@llvm_mips_fmin_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fmin_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fmin_d_RES = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fmin_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fmin_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fmin_d_ARG2
+ %2 = tail call <2 x double> @llvm.mips.fmin.d(<2 x double> %0, <2 x double> %1)
+ store <2 x double> %2, <2 x double>* @llvm_mips_fmin_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.mips.fmin.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fmin_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fmin.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fmin_d_test
+;
+@llvm_mips_fmin_a_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fmin_a_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fmin_a_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fmin_a_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fmin_a_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fmin_a_w_ARG2
+ %2 = tail call <4 x float> @llvm.mips.fmin.a.w(<4 x float> %0, <4 x float> %1)
+ store <4 x float> %2, <4 x float>* @llvm_mips_fmin_a_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.mips.fmin.a.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fmin_a_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fmin_a.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fmin_a_w_test
+;
+@llvm_mips_fmin_a_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fmin_a_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fmin_a_d_RES = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fmin_a_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fmin_a_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fmin_a_d_ARG2
+ %2 = tail call <2 x double> @llvm.mips.fmin.a.d(<2 x double> %0, <2 x double> %1)
+ store <2 x double> %2, <2 x double>* @llvm_mips_fmin_a_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.mips.fmin.a.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fmin_a_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fmin_a.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fmin_a_d_test
+;
+@llvm_mips_fmax_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fmax_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fmax_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fmax_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fmax_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fmax_w_ARG2
+ %2 = tail call <4 x float> @llvm.mips.fmax.w(<4 x float> %0, <4 x float> %1)
+ store <4 x float> %2, <4 x float>* @llvm_mips_fmax_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.mips.fmax.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fmax_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fmax.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fmax_w_test
+;
+@llvm_mips_fmax_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fmax_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fmax_d_RES = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fmax_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fmax_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fmax_d_ARG2
+ %2 = tail call <2 x double> @llvm.mips.fmax.d(<2 x double> %0, <2 x double> %1)
+ store <2 x double> %2, <2 x double>* @llvm_mips_fmax_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.mips.fmax.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fmax_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fmax.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fmax_d_test
+;
+@llvm_mips_fmax_a_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fmax_a_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fmax_a_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fmax_a_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fmax_a_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fmax_a_w_ARG2
+ %2 = tail call <4 x float> @llvm.mips.fmax.a.w(<4 x float> %0, <4 x float> %1)
+ store <4 x float> %2, <4 x float>* @llvm_mips_fmax_a_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.mips.fmax.a.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fmax_a_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fmax_a.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fmax_a_w_test
+;
+@llvm_mips_fmax_a_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fmax_a_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fmax_a_d_RES = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fmax_a_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fmax_a_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fmax_a_d_ARG2
+ %2 = tail call <2 x double> @llvm.mips.fmax.a.d(<2 x double> %0, <2 x double> %1)
+ store <2 x double> %2, <2 x double>* @llvm_mips_fmax_a_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.mips.fmax.a.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fmax_a_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fmax_a.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fmax_a_d_test
+;
+@llvm_mips_fmul_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fmul_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fmul_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fmul_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fmul_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fmul_w_ARG2
+ %2 = tail call <4 x float> @llvm.mips.fmul.w(<4 x float> %0, <4 x float> %1)
+ store <4 x float> %2, <4 x float>* @llvm_mips_fmul_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.mips.fmul.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fmul_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fmul.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fmul_w_test
+;
+@llvm_mips_fmul_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fmul_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fmul_d_RES = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fmul_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fmul_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fmul_d_ARG2
+ %2 = tail call <2 x double> @llvm.mips.fmul.d(<2 x double> %0, <2 x double> %1)
+ store <2 x double> %2, <2 x double>* @llvm_mips_fmul_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.mips.fmul.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fmul_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fmul.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fmul_d_test
+
+define void @fmul_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fmul_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fmul_w_ARG2
+ %2 = fmul <4 x float> %0, %1
+ store <4 x float> %2, <4 x float>* @llvm_mips_fmul_w_RES
+ ret void
+}
+
+; CHECK: fmul_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fmul.w
+; CHECK: st.w
+; CHECK: .size fmul_w_test
+
+define void @fmul_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fmul_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fmul_d_ARG2
+ %2 = fmul <2 x double> %0, %1
+ store <2 x double> %2, <2 x double>* @llvm_mips_fmul_d_RES
+ ret void
+}
+
+; CHECK: fmul_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fmul.d
+; CHECK: st.d
+; CHECK: .size fmul_d_test
+;
+@llvm_mips_fsub_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fsub_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fsub_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fsub_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fsub_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fsub_w_ARG2
+ %2 = tail call <4 x float> @llvm.mips.fsub.w(<4 x float> %0, <4 x float> %1)
+ store <4 x float> %2, <4 x float>* @llvm_mips_fsub_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.mips.fsub.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fsub_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fsub.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fsub_w_test
+;
+@llvm_mips_fsub_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fsub_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fsub_d_RES = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fsub_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fsub_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fsub_d_ARG2
+ %2 = tail call <2 x double> @llvm.mips.fsub.d(<2 x double> %0, <2 x double> %1)
+ store <2 x double> %2, <2 x double>* @llvm_mips_fsub_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.mips.fsub.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fsub_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fsub.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fsub_d_test
+;
+
+define void @fsub_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fsub_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fsub_w_ARG2
+ %2 = fsub <4 x float> %0, %1
+ store <4 x float> %2, <4 x float>* @llvm_mips_fsub_w_RES
+ ret void
+}
+
+; CHECK: fsub_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fsub.w
+; CHECK: st.w
+; CHECK: .size fsub_w_test
+
+define void @fsub_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fsub_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fsub_d_ARG2
+ %2 = fsub <2 x double> %0, %1
+ store <2 x double> %2, <2 x double>* @llvm_mips_fsub_d_RES
+ ret void
+}
+
+; CHECK: fsub_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fsub.d
+; CHECK: st.d
+; CHECK: .size fsub_d_test
diff --git a/test/CodeGen/Mips/msa/3rf_4rf.ll b/test/CodeGen/Mips/msa/3rf_4rf.ll
new file mode 100644
index 0000000..67ef7fd
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3rf_4rf.ll
@@ -0,0 +1,106 @@
+; Test the MSA intrinsics that are encoded with the 3RF instruction format and
+; also take the accumulated result (the destination register) as a third operand.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_fmadd_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fmadd_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fmadd_w_ARG3 = global <4 x float> <float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01>, align 16
+@llvm_mips_fmadd_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fmadd_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fmadd_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fmadd_w_ARG2
+ %2 = load <4 x float>* @llvm_mips_fmadd_w_ARG3
+ %3 = tail call <4 x float> @llvm.mips.fmadd.w(<4 x float> %0, <4 x float> %1, <4 x float> %2)
+ store <4 x float> %3, <4 x float>* @llvm_mips_fmadd_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.mips.fmadd.w(<4 x float>, <4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fmadd_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fmadd.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fmadd_w_test
+;
+@llvm_mips_fmadd_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fmadd_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fmadd_d_ARG3 = global <2 x double> <double 4.000000e+00, double 5.000000e+00>, align 16
+@llvm_mips_fmadd_d_RES = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fmadd_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fmadd_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fmadd_d_ARG2
+ %2 = load <2 x double>* @llvm_mips_fmadd_d_ARG3
+ %3 = tail call <2 x double> @llvm.mips.fmadd.d(<2 x double> %0, <2 x double> %1, <2 x double> %2)
+ store <2 x double> %3, <2 x double>* @llvm_mips_fmadd_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.mips.fmadd.d(<2 x double>, <2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fmadd_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fmadd.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fmadd_d_test
+;
+@llvm_mips_fmsub_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fmsub_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fmsub_w_ARG3 = global <4 x float> <float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01>, align 16
+@llvm_mips_fmsub_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fmsub_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fmsub_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fmsub_w_ARG2
+ %2 = load <4 x float>* @llvm_mips_fmsub_w_ARG3
+ %3 = tail call <4 x float> @llvm.mips.fmsub.w(<4 x float> %0, <4 x float> %1, <4 x float> %2)
+ store <4 x float> %3, <4 x float>* @llvm_mips_fmsub_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.mips.fmsub.w(<4 x float>, <4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fmsub_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fmsub.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fmsub_w_test
+;
+@llvm_mips_fmsub_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fmsub_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fmsub_d_ARG3 = global <2 x double> <double 4.000000e+00, double 5.000000e+00>, align 16
+@llvm_mips_fmsub_d_RES = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fmsub_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fmsub_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fmsub_d_ARG2
+ %2 = load <2 x double>* @llvm_mips_fmsub_d_ARG3
+ %3 = tail call <2 x double> @llvm.mips.fmsub.d(<2 x double> %0, <2 x double> %1, <2 x double> %2)
+ store <2 x double> %3, <2 x double>* @llvm_mips_fmsub_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.mips.fmsub.d(<2 x double>, <2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fmsub_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fmsub.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fmsub_d_test
+;
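The fmadd/fmsub tests above only verify instruction selection with FileCheck. As a reference for what the intrinsics compute, here is a minimal scalar model in C (an editor's sketch, not part of the patch; names such as msa_fmadd_w_model are invented for illustration, and the sketch ignores the fused-rounding behaviour of the hardware instruction):

#include <stddef.h>

/* Rough per-element model of the four-register MSA float multiply-add/subtract:
   llvm.mips.fmadd.w(acc, a, b) behaves roughly like acc + a*b,
   llvm.mips.fmsub.w(acc, a, b) behaves roughly like acc - a*b. */
static void msa_fmadd_w_model(float *dst, const float *acc,
                              const float *a, const float *b, size_t n) {
  for (size_t i = 0; i < n; ++i)
    dst[i] = acc[i] + a[i] * b[i];
}

static void msa_fmsub_w_model(float *dst, const float *acc,
                              const float *a, const float *b, size_t n) {
  for (size_t i = 0; i < n; ++i)
    dst[i] = acc[i] - a[i] * b[i];
}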
diff --git a/test/CodeGen/Mips/msa/3rf_4rf_q.ll b/test/CodeGen/Mips/msa/3rf_4rf_q.ll
new file mode 100644
index 0000000..de28be0
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3rf_4rf_q.ll
@@ -0,0 +1,206 @@
+; Test the MSA fixed-point intrinsics that are encoded with the 3RF instruction
+; format and also take the accumulated result (the destination) as a third operand.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_madd_q_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_madd_q_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_madd_q_h_ARG3 = global <8 x i16> <i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23>, align 16
+@llvm_mips_madd_q_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_madd_q_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_madd_q_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_madd_q_h_ARG2
+ %2 = load <8 x i16>* @llvm_mips_madd_q_h_ARG3
+ %3 = tail call <8 x i16> @llvm.mips.madd.q.h(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
+ store <8 x i16> %3, <8 x i16>* @llvm_mips_madd_q_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.madd.q.h(<8 x i16>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_madd_q_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: madd_q.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_madd_q_h_test
+;
+@llvm_mips_madd_q_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_madd_q_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_madd_q_w_ARG3 = global <4 x i32> <i32 8, i32 9, i32 10, i32 11>, align 16
+@llvm_mips_madd_q_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_madd_q_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_madd_q_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_madd_q_w_ARG2
+ %2 = load <4 x i32>* @llvm_mips_madd_q_w_ARG3
+ %3 = tail call <4 x i32> @llvm.mips.madd.q.w(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
+ store <4 x i32> %3, <4 x i32>* @llvm_mips_madd_q_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.madd.q.w(<4 x i32>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_madd_q_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: madd_q.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_madd_q_w_test
+;
+@llvm_mips_maddr_q_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_maddr_q_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_maddr_q_h_ARG3 = global <8 x i16> <i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23>, align 16
+@llvm_mips_maddr_q_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_maddr_q_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_maddr_q_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_maddr_q_h_ARG2
+ %2 = load <8 x i16>* @llvm_mips_maddr_q_h_ARG3
+ %3 = tail call <8 x i16> @llvm.mips.maddr.q.h(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
+ store <8 x i16> %3, <8 x i16>* @llvm_mips_maddr_q_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.maddr.q.h(<8 x i16>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_maddr_q_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: maddr_q.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_maddr_q_h_test
+;
+@llvm_mips_maddr_q_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_maddr_q_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_maddr_q_w_ARG3 = global <4 x i32> <i32 8, i32 9, i32 10, i32 11>, align 16
+@llvm_mips_maddr_q_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_maddr_q_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_maddr_q_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_maddr_q_w_ARG2
+ %2 = load <4 x i32>* @llvm_mips_maddr_q_w_ARG3
+ %3 = tail call <4 x i32> @llvm.mips.maddr.q.w(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
+ store <4 x i32> %3, <4 x i32>* @llvm_mips_maddr_q_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.maddr.q.w(<4 x i32>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_maddr_q_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: maddr_q.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_maddr_q_w_test
+;
+@llvm_mips_msub_q_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_msub_q_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_msub_q_h_ARG3 = global <8 x i16> <i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23>, align 16
+@llvm_mips_msub_q_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_msub_q_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_msub_q_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_msub_q_h_ARG2
+ %2 = load <8 x i16>* @llvm_mips_msub_q_h_ARG3
+ %3 = tail call <8 x i16> @llvm.mips.msub.q.h(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
+ store <8 x i16> %3, <8 x i16>* @llvm_mips_msub_q_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.msub.q.h(<8 x i16>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_msub_q_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: msub_q.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_msub_q_h_test
+;
+@llvm_mips_msub_q_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_msub_q_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_msub_q_w_ARG3 = global <4 x i32> <i32 8, i32 9, i32 10, i32 11>, align 16
+@llvm_mips_msub_q_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_msub_q_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_msub_q_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_msub_q_w_ARG2
+ %2 = load <4 x i32>* @llvm_mips_msub_q_w_ARG3
+ %3 = tail call <4 x i32> @llvm.mips.msub.q.w(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
+ store <4 x i32> %3, <4 x i32>* @llvm_mips_msub_q_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.msub.q.w(<4 x i32>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_msub_q_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: msub_q.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_msub_q_w_test
+;
+@llvm_mips_msubr_q_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_msubr_q_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_msubr_q_h_ARG3 = global <8 x i16> <i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23>, align 16
+@llvm_mips_msubr_q_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_msubr_q_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_msubr_q_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_msubr_q_h_ARG2
+ %2 = load <8 x i16>* @llvm_mips_msubr_q_h_ARG3
+ %3 = tail call <8 x i16> @llvm.mips.msubr.q.h(<8 x i16> %0, <8 x i16> %1, <8 x i16> %2)
+ store <8 x i16> %3, <8 x i16>* @llvm_mips_msubr_q_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.msubr.q.h(<8 x i16>, <8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_msubr_q_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: msubr_q.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_msubr_q_h_test
+;
+@llvm_mips_msubr_q_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_msubr_q_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_msubr_q_w_ARG3 = global <4 x i32> <i32 8, i32 9, i32 10, i32 11>, align 16
+@llvm_mips_msubr_q_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_msubr_q_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_msubr_q_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_msubr_q_w_ARG2
+ %2 = load <4 x i32>* @llvm_mips_msubr_q_w_ARG3
+ %3 = tail call <4 x i32> @llvm.mips.msubr.q.w(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2)
+ store <4 x i32> %3, <4 x i32>* @llvm_mips_msubr_q_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.msubr.q.w(<4 x i32>, <4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_msubr_q_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: msubr_q.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_msubr_q_w_test
+;
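The madd_q/maddr_q/msub_q/msubr_q intrinsics above operate on Q15 (halfword) and Q31 (word) fixed-point elements. A rough scalar model of madd_q.h in C, as an editor's sketch rather than part of the patch (the helper name is invented; the model truncates where the maddr_q variant would round, and it assumes an arithmetic right shift):

#include <stdint.h>

/* Approximate Q15 model: promote the accumulator to Q30, add the Q15*Q15
   product, then truncate back to Q15 and saturate.  msub_q subtracts the
   product instead; the *r variants round before the truncation. */
static int16_t msa_madd_q_h_model(int16_t acc, int16_t a, int16_t b) {
  int64_t sum = ((int64_t)acc << 15) + (int64_t)a * b;  /* accumulate in Q30 */
  sum >>= 15;                                           /* back to Q15 (truncating) */
  if (sum > INT16_MAX) sum = INT16_MAX;                 /* saturate to the Q15 range */
  if (sum < INT16_MIN) sum = INT16_MIN;
  return (int16_t)sum;
}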
diff --git a/test/CodeGen/Mips/msa/3rf_exdo.ll b/test/CodeGen/Mips/msa/3rf_exdo.ll
new file mode 100644
index 0000000..8a7f268
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3rf_exdo.ll
@@ -0,0 +1,50 @@
+; Test the MSA floating-point conversion intrinsics that are encoded with the
+; 3RF instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_fexdo_h_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fexdo_h_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fexdo_h_RES = global <8 x half> <half 0.000000e+00, half 0.000000e+00, half 0.000000e+00, half 0.000000e+00, half 0.000000e+00, half 0.000000e+00, half 0.000000e+00, half 0.000000e+00>, align 16
+
+define void @llvm_mips_fexdo_h_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fexdo_h_ARG1
+ %1 = load <4 x float>* @llvm_mips_fexdo_h_ARG2
+ %2 = tail call <8 x half> @llvm.mips.fexdo.h(<4 x float> %0, <4 x float> %1)
+ store <8 x half> %2, <8 x half>* @llvm_mips_fexdo_h_RES
+ ret void
+}
+
+declare <8 x half> @llvm.mips.fexdo.h(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fexdo_h_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fexdo.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_fexdo_h_test
+;
+@llvm_mips_fexdo_w_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fexdo_w_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fexdo_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fexdo_w_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fexdo_w_ARG1
+ %1 = load <2 x double>* @llvm_mips_fexdo_w_ARG2
+ %2 = tail call <4 x float> @llvm.mips.fexdo.w(<2 x double> %0, <2 x double> %1)
+ store <4 x float> %2, <4 x float>* @llvm_mips_fexdo_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.mips.fexdo.w(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fexdo_w_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fexdo.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fexdo_w_test
+;
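fexdo narrows each element of the two source vectors to the next smaller floating-point format and packs both results into a single destination vector, which is why the fexdo.w test loads with ld.d but stores with st.w. A minimal scalar sketch in C (editor's illustration, not part of the patch; which source feeds which half of the destination depends on the instruction definition and endianness, so the sketch simply fixes one layout):

/* Rough model of fexdo.w: two 2 x double inputs are narrowed to float and
   packed into one 4 x float result (layout chosen arbitrarily here). */
static void msa_fexdo_w_model(float dst[4], const double lo[2], const double hi[2]) {
  for (int i = 0; i < 2; ++i) {
    dst[i]     = (float)lo[i];
    dst[i + 2] = (float)hi[i];
  }
}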
diff --git a/test/CodeGen/Mips/msa/3rf_float_int.ll b/test/CodeGen/Mips/msa/3rf_float_int.ll
new file mode 100644
index 0000000..7b01e17
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3rf_float_int.ll
@@ -0,0 +1,50 @@
+; Test the MSA intrinsics that are encoded with the 3RF instruction format and
+; take an integer as an operand.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_fexp2_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fexp2_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_fexp2_w_RES = global <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, align 16
+
+define void @llvm_mips_fexp2_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fexp2_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_fexp2_w_ARG2
+ %2 = tail call <4 x float> @llvm.mips.fexp2.w(<4 x float> %0, <4 x i32> %1)
+ store <4 x float> %2, <4 x float>* @llvm_mips_fexp2_w_RES
+ ret void
+}
+
+declare <4 x float> @llvm.mips.fexp2.w(<4 x float>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_fexp2_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fexp2.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fexp2_w_test
+;
+@llvm_mips_fexp2_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fexp2_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_fexp2_d_RES = global <2 x double> <double 0.000000e+00, double 0.000000e+00>, align 16
+
+define void @llvm_mips_fexp2_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fexp2_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_fexp2_d_ARG2
+ %2 = tail call <2 x double> @llvm.mips.fexp2.d(<2 x double> %0, <2 x i64> %1)
+ store <2 x double> %2, <2 x double>* @llvm_mips_fexp2_d_RES
+ ret void
+}
+
+declare <2 x double> @llvm.mips.fexp2.d(<2 x double>, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_fexp2_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fexp2.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fexp2_d_test
+;
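fexp2 scales each floating-point element by two raised to the corresponding signed integer element, which is what ldexp does per lane. A small C model as an editor's sketch (the helper name is invented, not part of the patch):

#include <math.h>
#include <stdint.h>

/* Rough model of fexp2.w: dst[i] = a[i] * 2^n[i] for each of the four lanes. */
static void msa_fexp2_w_model(float dst[4], const float a[4], const int32_t n[4]) {
  for (int i = 0; i < 4; ++i)
    dst[i] = ldexpf(a[i], n[i]);
}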
diff --git a/test/CodeGen/Mips/msa/3rf_int_float.ll b/test/CodeGen/Mips/msa/3rf_int_float.ll
new file mode 100644
index 0000000..5624771
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3rf_int_float.ll
@@ -0,0 +1,974 @@
+; Test the MSA intrinsics that are encoded with the 3RF instruction format and
+; produce an integer vector of per-element comparison masks as a result.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_fcaf_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fcaf_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fcaf_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fcaf_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fcaf_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fcaf_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.fcaf.w(<4 x float> %0, <4 x float> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_fcaf_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fcaf.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fcaf_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fcaf.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fcaf_w_test
+;
+@llvm_mips_fcaf_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fcaf_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fcaf_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fcaf_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fcaf_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fcaf_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.fcaf.d(<2 x double> %0, <2 x double> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_fcaf_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fcaf.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fcaf_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fcaf.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fcaf_d_test
+;
+@llvm_mips_fceq_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fceq_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fceq_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fceq_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fceq_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fceq_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.fceq.w(<4 x float> %0, <4 x float> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_fceq_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fceq.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fceq_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fceq.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fceq_w_test
+;
+@llvm_mips_fceq_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fceq_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fceq_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fceq_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fceq_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fceq_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.fceq.d(<2 x double> %0, <2 x double> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_fceq_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fceq.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fceq_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fceq.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fceq_d_test
+;
+@llvm_mips_fcle_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fcle_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fcle_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fcle_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fcle_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fcle_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.fcle.w(<4 x float> %0, <4 x float> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_fcle_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fcle.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fcle_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fcle.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fcle_w_test
+;
+@llvm_mips_fcle_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fcle_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fcle_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fcle_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fcle_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fcle_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.fcle.d(<2 x double> %0, <2 x double> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_fcle_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fcle.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fcle_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fcle.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fcle_d_test
+;
+@llvm_mips_fclt_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fclt_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fclt_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fclt_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fclt_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fclt_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.fclt.w(<4 x float> %0, <4 x float> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_fclt_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fclt.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fclt_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fclt.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fclt_w_test
+;
+@llvm_mips_fclt_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fclt_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fclt_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fclt_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fclt_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fclt_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.fclt.d(<2 x double> %0, <2 x double> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_fclt_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fclt.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fclt_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fclt.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fclt_d_test
+;
+@llvm_mips_fcor_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fcor_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fcor_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fcor_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fcor_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fcor_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.fcor.w(<4 x float> %0, <4 x float> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_fcor_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fcor.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fcor_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fcor.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fcor_w_test
+;
+@llvm_mips_fcor_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fcor_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fcor_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fcor_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fcor_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fcor_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.fcor.d(<2 x double> %0, <2 x double> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_fcor_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fcor.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fcor_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fcor.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fcor_d_test
+;
+@llvm_mips_fcne_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fcne_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fcne_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fcne_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fcne_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fcne_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.fcne.w(<4 x float> %0, <4 x float> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_fcne_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fcne.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fcne_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fcne.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fcne_w_test
+;
+@llvm_mips_fcne_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fcne_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fcne_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fcne_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fcne_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fcne_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.fcne.d(<2 x double> %0, <2 x double> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_fcne_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fcne.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fcne_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fcne.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fcne_d_test
+;
+@llvm_mips_fcueq_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fcueq_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fcueq_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fcueq_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fcueq_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fcueq_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.fcueq.w(<4 x float> %0, <4 x float> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_fcueq_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fcueq.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fcueq_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fcueq.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fcueq_w_test
+;
+@llvm_mips_fcueq_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fcueq_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fcueq_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fcueq_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fcueq_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fcueq_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.fcueq.d(<2 x double> %0, <2 x double> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_fcueq_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fcueq.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fcueq_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fcueq.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fcueq_d_test
+;
+@llvm_mips_fcult_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fcult_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fcult_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fcult_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fcult_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fcult_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.fcult.w(<4 x float> %0, <4 x float> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_fcult_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fcult.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fcult_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fcult.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fcult_w_test
+;
+@llvm_mips_fcult_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fcult_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fcult_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fcult_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fcult_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fcult_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.fcult.d(<2 x double> %0, <2 x double> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_fcult_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fcult.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fcult_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fcult.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fcult_d_test
+;
+@llvm_mips_fcule_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fcule_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fcule_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fcule_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fcule_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fcule_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.fcule.w(<4 x float> %0, <4 x float> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_fcule_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fcule.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fcule_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fcule.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fcule_w_test
+;
+@llvm_mips_fcule_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fcule_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fcule_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fcule_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fcule_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fcule_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.fcule.d(<2 x double> %0, <2 x double> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_fcule_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fcule.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fcule_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fcule.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fcule_d_test
+;
+@llvm_mips_fcun_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fcun_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fcun_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fcun_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fcun_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fcun_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.fcun.w(<4 x float> %0, <4 x float> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_fcun_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fcun.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fcun_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fcun.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fcun_w_test
+;
+@llvm_mips_fcun_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fcun_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fcun_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fcun_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fcun_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fcun_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.fcun.d(<2 x double> %0, <2 x double> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_fcun_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fcun.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fcun_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fcun.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fcun_d_test
+;
+@llvm_mips_fcune_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fcune_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fcune_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fcune_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fcune_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fcune_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.fcune.w(<4 x float> %0, <4 x float> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_fcune_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fcune.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fcune_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fcune.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fcune_w_test
+;
+@llvm_mips_fcune_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fcune_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fcune_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fcune_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fcune_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fcune_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.fcune.d(<2 x double> %0, <2 x double> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_fcune_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fcune.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fcune_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fcune.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fcune_d_test
+;
+@llvm_mips_fsaf_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fsaf_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fsaf_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fsaf_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fsaf_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fsaf_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.fsaf.w(<4 x float> %0, <4 x float> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_fsaf_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fsaf.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fsaf_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fsaf.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fsaf_w_test
+;
+@llvm_mips_fsaf_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fsaf_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fsaf_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fsaf_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fsaf_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fsaf_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.fsaf.d(<2 x double> %0, <2 x double> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_fsaf_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fsaf.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fsaf_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fsaf.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fsaf_d_test
+;
+@llvm_mips_fseq_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fseq_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fseq_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fseq_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fseq_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fseq_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.fseq.w(<4 x float> %0, <4 x float> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_fseq_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fseq.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fseq_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fseq.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fseq_w_test
+;
+@llvm_mips_fseq_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fseq_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fseq_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fseq_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fseq_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fseq_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.fseq.d(<2 x double> %0, <2 x double> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_fseq_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fseq.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fseq_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fseq.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fseq_d_test
+;
+@llvm_mips_fsle_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fsle_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fsle_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fsle_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fsle_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fsle_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.fsle.w(<4 x float> %0, <4 x float> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_fsle_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fsle.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fsle_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fsle.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fsle_w_test
+;
+@llvm_mips_fsle_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fsle_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fsle_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fsle_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fsle_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fsle_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.fsle.d(<2 x double> %0, <2 x double> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_fsle_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fsle.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fsle_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fsle.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fsle_d_test
+;
+@llvm_mips_fslt_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fslt_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fslt_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fslt_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fslt_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fslt_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.fslt.w(<4 x float> %0, <4 x float> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_fslt_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fslt.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fslt_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fslt.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fslt_w_test
+;
+@llvm_mips_fslt_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fslt_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fslt_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fslt_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fslt_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fslt_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.fslt.d(<2 x double> %0, <2 x double> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_fslt_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fslt.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fslt_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fslt.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fslt_d_test
+;
+@llvm_mips_fsor_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fsor_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fsor_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fsor_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fsor_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fsor_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.fsor.w(<4 x float> %0, <4 x float> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_fsor_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fsor.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fsor_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fsor.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fsor_w_test
+;
+@llvm_mips_fsor_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fsor_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fsor_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fsor_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fsor_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fsor_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.fsor.d(<2 x double> %0, <2 x double> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_fsor_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fsor.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fsor_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fsor.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fsor_d_test
+;
+@llvm_mips_fsne_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fsne_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fsne_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fsne_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fsne_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fsne_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.fsne.w(<4 x float> %0, <4 x float> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_fsne_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fsne.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fsne_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fsne.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fsne_w_test
+;
+@llvm_mips_fsne_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fsne_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fsne_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fsne_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fsne_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fsne_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.fsne.d(<2 x double> %0, <2 x double> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_fsne_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fsne.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fsne_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fsne.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fsne_d_test
+;
+@llvm_mips_fsueq_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fsueq_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fsueq_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fsueq_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fsueq_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fsueq_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.fsueq.w(<4 x float> %0, <4 x float> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_fsueq_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fsueq.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fsueq_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fsueq.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fsueq_w_test
+;
+@llvm_mips_fsueq_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fsueq_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fsueq_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fsueq_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fsueq_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fsueq_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.fsueq.d(<2 x double> %0, <2 x double> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_fsueq_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fsueq.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fsueq_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fsueq.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fsueq_d_test
+;
+@llvm_mips_fsult_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fsult_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fsult_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fsult_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fsult_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fsult_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.fsult.w(<4 x float> %0, <4 x float> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_fsult_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fsult.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fsult_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fsult.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fsult_w_test
+;
+@llvm_mips_fsult_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fsult_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fsult_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fsult_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fsult_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fsult_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.fsult.d(<2 x double> %0, <2 x double> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_fsult_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fsult.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fsult_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fsult.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fsult_d_test
+;
+@llvm_mips_fsule_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fsule_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fsule_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fsule_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fsule_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fsule_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.fsule.w(<4 x float> %0, <4 x float> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_fsule_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fsule.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fsule_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fsule.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fsule_w_test
+;
+@llvm_mips_fsule_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fsule_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fsule_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fsule_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fsule_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fsule_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.fsule.d(<2 x double> %0, <2 x double> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_fsule_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fsule.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fsule_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fsule.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fsule_d_test
+;
+@llvm_mips_fsun_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fsun_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fsun_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fsun_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fsun_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fsun_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.fsun.w(<4 x float> %0, <4 x float> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_fsun_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fsun.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fsun_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fsun.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fsun_w_test
+;
+@llvm_mips_fsun_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fsun_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fsun_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fsun_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fsun_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fsun_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.fsun.d(<2 x double> %0, <2 x double> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_fsun_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fsun.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fsun_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fsun.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fsun_d_test
+;
+@llvm_mips_fsune_w_ARG1 = global <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, align 16
+@llvm_mips_fsune_w_ARG2 = global <4 x float> <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>, align 16
+@llvm_mips_fsune_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_fsune_w_test() nounwind {
+entry:
+ %0 = load <4 x float>* @llvm_mips_fsune_w_ARG1
+ %1 = load <4 x float>* @llvm_mips_fsune_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.fsune.w(<4 x float> %0, <4 x float> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_fsune_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.fsune.w(<4 x float>, <4 x float>) nounwind
+
+; CHECK: llvm_mips_fsune_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: fsune.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_fsune_w_test
+;
+@llvm_mips_fsune_d_ARG1 = global <2 x double> <double 0.000000e+00, double 1.000000e+00>, align 16
+@llvm_mips_fsune_d_ARG2 = global <2 x double> <double 2.000000e+00, double 3.000000e+00>, align 16
+@llvm_mips_fsune_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_fsune_d_test() nounwind {
+entry:
+ %0 = load <2 x double>* @llvm_mips_fsune_d_ARG1
+ %1 = load <2 x double>* @llvm_mips_fsune_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.fsune.d(<2 x double> %0, <2 x double> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_fsune_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.fsune.d(<2 x double>, <2 x double>) nounwind
+
+; CHECK: llvm_mips_fsune_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: fsune.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_fsune_d_test
+;
diff --git a/test/CodeGen/Mips/msa/3rf_q.ll b/test/CodeGen/Mips/msa/3rf_q.ll
new file mode 100644
index 0000000..f7000ee
--- /dev/null
+++ b/test/CodeGen/Mips/msa/3rf_q.ll
@@ -0,0 +1,94 @@
+; Test the MSA fixed-point intrinsics that are encoded with the 3RF instruction
+; format.
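+;
+; mul_q.df multiplies two Q-format fractional vectors (roughly (a * b) >> 15
+; for .h and (a * b) >> 31 for .w), and mulr_q.df is the rounding variant.
+; The tests below only check that each intrinsic selects the expected
+; instruction, not the numerical result.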
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_mul_q_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_mul_q_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_mul_q_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_mul_q_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_mul_q_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_mul_q_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.mul.q.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_mul_q_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.mul.q.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_mul_q_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: mul_q.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_mul_q_h_test
+;
+@llvm_mips_mul_q_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_mul_q_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_mul_q_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_mul_q_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_mul_q_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_mul_q_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.mul.q.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_mul_q_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.mul.q.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_mul_q_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: mul_q.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_mul_q_w_test
+;
+@llvm_mips_mulr_q_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_mulr_q_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_mulr_q_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_mulr_q_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_mulr_q_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_mulr_q_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.mulr.q.h(<8 x i16> %0, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_mulr_q_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.mulr.q.h(<8 x i16>, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_mulr_q_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: mulr_q.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_mulr_q_h_test
+;
+@llvm_mips_mulr_q_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_mulr_q_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_mulr_q_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_mulr_q_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_mulr_q_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_mulr_q_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.mulr.q.w(<4 x i32> %0, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_mulr_q_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.mulr.q.w(<4 x i32>, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_mulr_q_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: mulr_q.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_mulr_q_w_test
+;
diff --git a/test/CodeGen/Mips/msa/arithmetic.ll b/test/CodeGen/Mips/msa/arithmetic.ll
new file mode 100644
index 0000000..09ee502
--- /dev/null
+++ b/test/CodeGen/Mips/msa/arithmetic.ll
@@ -0,0 +1,726 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+define void @add_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: add_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = add <16 x i8> %1, %2
+ ; CHECK-DAG: addv.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size add_v16i8
+}
+
+define void @add_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: add_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = add <8 x i16> %1, %2
+ ; CHECK-DAG: addv.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size add_v8i16
+}
+
+define void @add_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: add_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = add <4 x i32> %1, %2
+ ; CHECK-DAG: addv.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size add_v4i32
+}
+
+define void @add_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: add_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = add <2 x i64> %1, %2
+ ; CHECK-DAG: addv.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size add_v2i64
+}
+
+define void @add_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: add_v16i8_i:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = add <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ; CHECK-DAG: addvi.b [[R3:\$w[0-9]+]], [[R1]], 1
+ store <16 x i8> %2, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size add_v16i8_i
+}
+
+define void @add_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: add_v8i16_i:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = add <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1,
+ i16 1, i16 1, i16 1, i16 1>
+ ; CHECK-DAG: addvi.h [[R3:\$w[0-9]+]], [[R1]], 1
+ store <8 x i16> %2, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size add_v8i16_i
+}
+
+define void @add_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: add_v4i32_i:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = add <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK-DAG: addvi.w [[R3:\$w[0-9]+]], [[R1]], 1
+ store <4 x i32> %2, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size add_v4i32_i
+}
+
+define void @add_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: add_v2i64_i:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = add <2 x i64> %1, <i64 1, i64 1>
+ ; CHECK-DAG: addvi.d [[R3:\$w[0-9]+]], [[R1]], 1
+ store <2 x i64> %2, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size add_v2i64_i
+}
+
+define void @sub_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: sub_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = sub <16 x i8> %1, %2
+ ; CHECK-DAG: subv.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size sub_v16i8
+}
+
+define void @sub_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: sub_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = sub <8 x i16> %1, %2
+ ; CHECK-DAG: subv.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size sub_v8i16
+}
+
+define void @sub_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: sub_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = sub <4 x i32> %1, %2
+ ; CHECK-DAG: subv.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size sub_v4i32
+}
+
+define void @sub_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: sub_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = sub <2 x i64> %1, %2
+ ; CHECK-DAG: subv.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size sub_v2i64
+}
+
+define void @sub_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: sub_v16i8_i:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = sub <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1,
+ i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ; CHECK-DAG: subvi.b [[R3:\$w[0-9]+]], [[R1]], 1
+ store <16 x i8> %2, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size sub_v16i8_i
+}
+
+define void @sub_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: sub_v8i16_i:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = sub <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1,
+ i16 1, i16 1, i16 1, i16 1>
+ ; CHECK-DAG: subvi.h [[R3:\$w[0-9]+]], [[R1]], 1
+ store <8 x i16> %2, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size sub_v8i16_i
+}
+
+define void @sub_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: sub_v4i32_i:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = sub <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK-DAG: subvi.w [[R3:\$w[0-9]+]], [[R1]], 1
+ store <4 x i32> %2, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size sub_v4i32_i
+}
+
+define void @sub_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: sub_v2i64_i:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = sub <2 x i64> %1, <i64 1, i64 1>
+ ; CHECK-DAG: subvi.d [[R3:\$w[0-9]+]], [[R1]], 1
+ store <2 x i64> %2, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size sub_v2i64_i
+}
+
+define void @mul_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: mul_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = mul <16 x i8> %1, %2
+ ; CHECK-DAG: mulv.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mul_v16i8
+}
+
+define void @mul_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: mul_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = mul <8 x i16> %1, %2
+ ; CHECK-DAG: mulv.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mul_v8i16
+}
+
+define void @mul_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: mul_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = mul <4 x i32> %1, %2
+ ; CHECK-DAG: mulv.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mul_v4i32
+}
+
+define void @mul_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: mul_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = mul <2 x i64> %1, %2
+ ; CHECK-DAG: mulv.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mul_v2i64
+}
+
+define void @maddv_v16i8(<16 x i8>* %d, <16 x i8>* %a, <16 x i8>* %b,
+ <16 x i8>* %c) nounwind {
+ ; CHECK: maddv_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = load <16 x i8>* %c
+ ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0($7)
+ %4 = mul <16 x i8> %2, %3
+ %5 = add <16 x i8> %4, %1
+ ; CHECK-DAG: maddv.b [[R1]], [[R2]], [[R3]]
+ store <16 x i8> %5, <16 x i8>* %d
+ ; CHECK-DAG: st.b [[R1]], 0($4)
+
+ ret void
+ ; CHECK: .size maddv_v16i8
+}
+
+define void @maddv_v8i16(<8 x i16>* %d, <8 x i16>* %a, <8 x i16>* %b,
+ <8 x i16>* %c) nounwind {
+ ; CHECK: maddv_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = load <8 x i16>* %c
+ ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0($7)
+ %4 = mul <8 x i16> %2, %3
+ %5 = add <8 x i16> %4, %1
+ ; CHECK-DAG: maddv.h [[R1]], [[R2]], [[R3]]
+ store <8 x i16> %5, <8 x i16>* %d
+ ; CHECK-DAG: st.h [[R1]], 0($4)
+
+ ret void
+ ; CHECK: .size maddv_v8i16
+}
+
+define void @maddv_v4i32(<4 x i32>* %d, <4 x i32>* %a, <4 x i32>* %b,
+ <4 x i32>* %c) nounwind {
+ ; CHECK: maddv_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = load <4 x i32>* %c
+ ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0($7)
+ %4 = mul <4 x i32> %2, %3
+ %5 = add <4 x i32> %4, %1
+ ; CHECK-DAG: maddv.w [[R1]], [[R2]], [[R3]]
+ store <4 x i32> %5, <4 x i32>* %d
+ ; CHECK-DAG: st.w [[R1]], 0($4)
+
+ ret void
+ ; CHECK: .size maddv_v4i32
+}
+
+define void @maddv_v2i64(<2 x i64>* %d, <2 x i64>* %a, <2 x i64>* %b,
+ <2 x i64>* %c) nounwind {
+ ; CHECK: maddv_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = load <2 x i64>* %c
+ ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0($7)
+ %4 = mul <2 x i64> %2, %3
+ %5 = add <2 x i64> %4, %1
+ ; CHECK-DAG: maddv.d [[R1]], [[R2]], [[R3]]
+ store <2 x i64> %5, <2 x i64>* %d
+ ; CHECK-DAG: st.d [[R1]], 0($4)
+
+ ret void
+ ; CHECK: .size maddv_v2i64
+}
+
+define void @msubv_v16i8(<16 x i8>* %d, <16 x i8>* %a, <16 x i8>* %b,
+ <16 x i8>* %c) nounwind {
+ ; CHECK: msubv_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = load <16 x i8>* %c
+ ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0($7)
+ %4 = mul <16 x i8> %2, %3
+ %5 = sub <16 x i8> %1, %4
+ ; CHECK-DAG: msubv.b [[R1]], [[R2]], [[R3]]
+ store <16 x i8> %5, <16 x i8>* %d
+ ; CHECK-DAG: st.b [[R1]], 0($4)
+
+ ret void
+ ; CHECK: .size msubv_v16i8
+}
+
+define void @msubv_v8i16(<8 x i16>* %d, <8 x i16>* %a, <8 x i16>* %b,
+ <8 x i16>* %c) nounwind {
+ ; CHECK: msubv_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = load <8 x i16>* %c
+ ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0($7)
+ %4 = mul <8 x i16> %2, %3
+ %5 = sub <8 x i16> %1, %4
+ ; CHECK-DAG: msubv.h [[R1]], [[R2]], [[R3]]
+ store <8 x i16> %5, <8 x i16>* %d
+ ; CHECK-DAG: st.h [[R1]], 0($4)
+
+ ret void
+ ; CHECK: .size msubv_v8i16
+}
+
+define void @msubv_v4i32(<4 x i32>* %d, <4 x i32>* %a, <4 x i32>* %b,
+ <4 x i32>* %c) nounwind {
+ ; CHECK: msubv_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = load <4 x i32>* %c
+ ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0($7)
+ %4 = mul <4 x i32> %2, %3
+ %5 = sub <4 x i32> %1, %4
+ ; CHECK-DAG: msubv.w [[R1]], [[R2]], [[R3]]
+ store <4 x i32> %5, <4 x i32>* %d
+ ; CHECK-DAG: st.w [[R1]], 0($4)
+
+ ret void
+ ; CHECK: .size msubv_v4i32
+}
+
+define void @msubv_v2i64(<2 x i64>* %d, <2 x i64>* %a, <2 x i64>* %b,
+ <2 x i64>* %c) nounwind {
+ ; CHECK: msubv_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = load <2 x i64>* %c
+ ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0($7)
+ %4 = mul <2 x i64> %2, %3
+ %5 = sub <2 x i64> %1, %4
+ ; CHECK-DAG: msubv.d [[R1]], [[R2]], [[R3]]
+ store <2 x i64> %5, <2 x i64>* %d
+ ; CHECK-DAG: st.d [[R1]], 0($4)
+
+ ret void
+ ; CHECK: .size msubv_v2i64
+}
+
+define void @div_s_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: div_s_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = sdiv <16 x i8> %1, %2
+ ; CHECK-DAG: div_s.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size div_s_v16i8
+}
+
+define void @div_s_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: div_s_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = sdiv <8 x i16> %1, %2
+ ; CHECK-DAG: div_s.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size div_s_v8i16
+}
+
+define void @div_s_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: div_s_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = sdiv <4 x i32> %1, %2
+ ; CHECK-DAG: div_s.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size div_s_v4i32
+}
+
+define void @div_s_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: div_s_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = sdiv <2 x i64> %1, %2
+ ; CHECK-DAG: div_s.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size div_s_v2i64
+}
+
+define void @div_u_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: div_u_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = udiv <16 x i8> %1, %2
+ ; CHECK-DAG: div_u.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size div_u_v16i8
+}
+
+define void @div_u_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: div_u_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = udiv <8 x i16> %1, %2
+ ; CHECK-DAG: div_u.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size div_u_v8i16
+}
+
+define void @div_u_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: div_u_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = udiv <4 x i32> %1, %2
+ ; CHECK-DAG: div_u.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size div_u_v4i32
+}
+
+define void @div_u_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: div_u_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = udiv <2 x i64> %1, %2
+ ; CHECK-DAG: div_u.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size div_u_v2i64
+}
+
+define void @mod_s_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: mod_s_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = srem <16 x i8> %1, %2
+ ; CHECK-DAG: mod_s.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mod_s_v16i8
+}
+
+define void @mod_s_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: mod_s_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = srem <8 x i16> %1, %2
+ ; CHECK-DAG: mod_s.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mod_s_v8i16
+}
+
+define void @mod_s_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: mod_s_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = srem <4 x i32> %1, %2
+ ; CHECK-DAG: mod_s.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mod_s_v4i32
+}
+
+define void @mod_s_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: mod_s_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = srem <2 x i64> %1, %2
+ ; CHECK-DAG: mod_s.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mod_s_v2i64
+}
+
+define void @mod_u_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: mod_u_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = urem <16 x i8> %1, %2
+ ; CHECK-DAG: mod_u.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mod_u_v16i8
+}
+
+define void @mod_u_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: mod_u_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = urem <8 x i16> %1, %2
+ ; CHECK-DAG: mod_u.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mod_u_v8i16
+}
+
+define void @mod_u_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: mod_u_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = urem <4 x i32> %1, %2
+ ; CHECK-DAG: mod_u.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mod_u_v4i32
+}
+
+define void @mod_u_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: mod_u_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = urem <2 x i64> %1, %2
+ ; CHECK-DAG: mod_u.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mod_u_v2i64
+}
diff --git a/test/CodeGen/Mips/msa/arithmetic_float.ll b/test/CodeGen/Mips/msa/arithmetic_float.ll
new file mode 100644
index 0000000..dc38721
--- /dev/null
+++ b/test/CodeGen/Mips/msa/arithmetic_float.ll
@@ -0,0 +1,456 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+define void @add_v4f32(<4 x float>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+ ; CHECK: add_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x float>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = fadd <4 x float> %1, %2
+ ; CHECK-DAG: fadd.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x float> %3, <4 x float>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size add_v4f32
+}
+
+define void @add_v2f64(<2 x double>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+ ; CHECK: add_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x double>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = fadd <2 x double> %1, %2
+ ; CHECK-DAG: fadd.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x double> %3, <2 x double>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size add_v2f64
+}
+
+define void @sub_v4f32(<4 x float>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+ ; CHECK: sub_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x float>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = fsub <4 x float> %1, %2
+ ; CHECK-DAG: fsub.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x float> %3, <4 x float>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size sub_v4f32
+}
+
+define void @sub_v2f64(<2 x double>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+ ; CHECK: sub_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x double>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = fsub <2 x double> %1, %2
+ ; CHECK-DAG: fsub.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x double> %3, <2 x double>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size sub_v2f64
+}
+
+define void @mul_v4f32(<4 x float>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+ ; CHECK: mul_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x float>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = fmul <4 x float> %1, %2
+ ; CHECK-DAG: fmul.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x float> %3, <4 x float>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mul_v4f32
+}
+
+define void @mul_v2f64(<2 x double>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+ ; CHECK: mul_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x double>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = fmul <2 x double> %1, %2
+ ; CHECK-DAG: fmul.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x double> %3, <2 x double>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mul_v2f64
+}
+
+define void @fma_v4f32(<4 x float>* %d, <4 x float>* %a, <4 x float>* %b,
+ <4 x float>* %c) nounwind {
+ ; CHECK: fma_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x float>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = load <4 x float>* %c
+ ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0($7)
+ %4 = tail call <4 x float> @llvm.fma.v4f32 (<4 x float> %1, <4 x float> %2,
+ <4 x float> %3)
+ ; CHECK-DAG: fmadd.w [[R1]], [[R2]], [[R3]]
+ store <4 x float> %4, <4 x float>* %d
+ ; CHECK-DAG: st.w [[R1]], 0($4)
+
+ ret void
+ ; CHECK: .size fma_v4f32
+}
+
+define void @fma_v2f64(<2 x double>* %d, <2 x double>* %a, <2 x double>* %b,
+ <2 x double>* %c) nounwind {
+ ; CHECK: fma_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x double>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = load <2 x double>* %c
+ ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0($7)
+ %4 = tail call <2 x double> @llvm.fma.v2f64 (<2 x double> %1, <2 x double> %2,
+ <2 x double> %3)
+ ; CHECK-DAG: fmadd.d [[R1]], [[R2]], [[R3]]
+ store <2 x double> %4, <2 x double>* %d
+ ; CHECK-DAG: st.d [[R1]], 0($4)
+
+ ret void
+ ; CHECK: .size fma_v2f64
+}
+
+define void @fmsub_v4f32(<4 x float>* %d, <4 x float>* %a, <4 x float>* %b,
+ <4 x float>* %c) nounwind {
+ ; CHECK: fmsub_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x float>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = load <4 x float>* %c
+ ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0($7)
+ %4 = fmul <4 x float> %2, %3
+ %5 = fsub <4 x float> %1, %4
+ ; CHECK-DAG: fmsub.w [[R1]], [[R2]], [[R3]]
+ store <4 x float> %5, <4 x float>* %d
+ ; CHECK-DAG: st.w [[R1]], 0($4)
+
+ ret void
+ ; CHECK: .size fmsub_v4f32
+}
+
+define void @fmsub_v2f64(<2 x double>* %d, <2 x double>* %a, <2 x double>* %b,
+ <2 x double>* %c) nounwind {
+ ; CHECK: fmsub_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x double>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = load <2 x double>* %c
+ ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0($7)
+ %4 = fmul <2 x double> %2, %3
+ %5 = fsub <2 x double> %1, %4
+ ; CHECK-DAG: fmsub.d [[R1]], [[R2]], [[R3]]
+ store <2 x double> %5, <2 x double>* %d
+ ; CHECK-DAG: st.d [[R1]], 0($4)
+
+ ret void
+ ; CHECK: .size fmsub_v2f64
+}
+
+define void @fdiv_v4f32(<4 x float>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+ ; CHECK: fdiv_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x float>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = fdiv <4 x float> %1, %2
+ ; CHECK-DAG: fdiv.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x float> %3, <4 x float>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size fdiv_v4f32
+}
+
+define void @fdiv_v2f64(<2 x double>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+ ; CHECK: fdiv_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x double>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = fdiv <2 x double> %1, %2
+ ; CHECK-DAG: fdiv.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x double> %3, <2 x double>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size fdiv_v2f64
+}
+
+define void @fabs_v4f32(<4 x float>* %c, <4 x float>* %a) nounwind {
+ ; CHECK: fabs_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = tail call <4 x float> @llvm.fabs.v4f32 (<4 x float> %1)
+ ; CHECK-DAG: fmax_a.w [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+ store <4 x float> %2, <4 x float>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size fabs_v4f32
+}
+
+define void @fabs_v2f64(<2 x double>* %c, <2 x double>* %a) nounwind {
+ ; CHECK: fabs_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = tail call <2 x double> @llvm.fabs.v2f64 (<2 x double> %1)
+ ; CHECK-DAG: fmax_a.d [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+ store <2 x double> %2, <2 x double>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size fabs_v2f64
+}
+
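+; fexp2.df scales ws by 2^wt, so the exp2 tests below expect llvm.exp2(%x) to
+; be selected as fexp2 with a 1.0 first operand, materialized here via
+; ldi + ffint_u and in the later variants via fill.w or a constant-pool load.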
+define void @fexp2_v4f32(<4 x float>* %c, <4 x float>* %a) nounwind {
+ ; CHECK: fexp2_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = tail call <4 x float> @llvm.exp2.v4f32 (<4 x float> %1)
+ ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 1
+ ; CHECK-DAG: ffint_u.w [[R4:\$w[0-9]+]], [[R3]]
+ ; CHECK-DAG: fexp2.w [[R4:\$w[0-9]+]], [[R3]], [[R1]]
+ store <4 x float> %2, <4 x float>* %c
+ ; CHECK-DAG: st.w [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size fexp2_v4f32
+}
+
+define void @fexp2_v2f64(<2 x double>* %c, <2 x double>* %a) nounwind {
+ ; CHECK: fexp2_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = tail call <2 x double> @llvm.exp2.v2f64 (<2 x double> %1)
+ ; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 1
+ ; CHECK-DAG: ffint_u.d [[R4:\$w[0-9]+]], [[R3]]
+ ; CHECK-DAG: fexp2.d [[R4:\$w[0-9]+]], [[R3]], [[R1]]
+ store <2 x double> %2, <2 x double>* %c
+ ; CHECK-DAG: st.d [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size fexp2_v2f64
+}
+
+define void @fexp2_v4f32_2(<4 x float>* %c, <4 x float>* %a) nounwind {
+ ; CHECK: fexp2_v4f32_2:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = tail call <4 x float> @llvm.exp2.v4f32 (<4 x float> %1)
+ %3 = fmul <4 x float> <float 2.0, float 2.0, float 2.0, float 2.0>, %2
+ ; CHECK-DAG: lui [[R3:\$[0-9]+]], 16384
+ ; CHECK-DAG: fill.w [[R4:\$w[0-9]+]], [[R3]]
+ ; CHECK-DAG: fexp2.w [[R5:\$w[0-9]+]], [[R4]], [[R1]]
+ store <4 x float> %3, <4 x float>* %c
+ ; CHECK-DAG: st.w [[R5]], 0($4)
+
+ ret void
+ ; CHECK: .size fexp2_v4f32_2
+}
+
+define void @fexp2_v2f64_2(<2 x double>* %c, <2 x double>* %a) nounwind {
+ ; CHECK: .8byte 4611686018427387904
+ ; CHECK-NEXT: .8byte 4611686018427387904
+ ; CHECK: fexp2_v2f64_2:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = tail call <2 x double> @llvm.exp2.v2f64 (<2 x double> %1)
+ %3 = fmul <2 x double> <double 2.0, double 2.0>, %2
+ ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], %lo(
+ ; CHECK-DAG: fexp2.d [[R4:\$w[0-9]+]], [[R3]], [[R1]]
+ store <2 x double> %3, <2 x double>* %c
+ ; CHECK-DAG: st.d [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size fexp2_v2f64_2
+}
+
+define void @fsqrt_v4f32(<4 x float>* %c, <4 x float>* %a) nounwind {
+ ; CHECK: fsqrt_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = tail call <4 x float> @llvm.sqrt.v4f32 (<4 x float> %1)
+ ; CHECK-DAG: fsqrt.w [[R3:\$w[0-9]+]], [[R1]]
+ store <4 x float> %2, <4 x float>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size fsqrt_v4f32
+}
+
+define void @fsqrt_v2f64(<2 x double>* %c, <2 x double>* %a) nounwind {
+ ; CHECK: fsqrt_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = tail call <2 x double> @llvm.sqrt.v2f64 (<2 x double> %1)
+ ; CHECK-DAG: fsqrt.d [[R3:\$w[0-9]+]], [[R1]]
+ store <2 x double> %2, <2 x double>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size fsqrt_v2f64
+}
+
+define void @ffint_u_v4f32(<4 x float>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: ffint_u_v4f32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = uitofp <4 x i32> %1 to <4 x float>
+ ; CHECK-DAG: ffint_u.w [[R3:\$w[0-9]+]], [[R1]]
+ store <4 x float> %2, <4 x float>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ffint_u_v4f32
+}
+
+define void @ffint_u_v2f64(<2 x double>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: ffint_u_v2f64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = uitofp <2 x i64> %1 to <2 x double>
+ ; CHECK-DAG: ffint_u.d [[R3:\$w[0-9]+]], [[R1]]
+ store <2 x double> %2, <2 x double>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ffint_u_v2f64
+}
+
+define void @ffint_s_v4f32(<4 x float>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: ffint_s_v4f32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = sitofp <4 x i32> %1 to <4 x float>
+ ; CHECK-DAG: ffint_s.w [[R3:\$w[0-9]+]], [[R1]]
+ store <4 x float> %2, <4 x float>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ffint_s_v4f32
+}
+
+define void @ffint_s_v2f64(<2 x double>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: ffint_s_v2f64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = sitofp <2 x i64> %1 to <2 x double>
+ ; CHECK-DAG: ffint_s.d [[R3:\$w[0-9]+]], [[R1]]
+ store <2 x double> %2, <2 x double>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ffint_s_v2f64
+}
+
+define void @ftrunc_u_v4f32(<4 x i32>* %c, <4 x float>* %a) nounwind {
+ ; CHECK: ftrunc_u_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = fptoui <4 x float> %1 to <4 x i32>
+ ; CHECK-DAG: ftrunc_u.w [[R3:\$w[0-9]+]], [[R1]]
+ store <4 x i32> %2, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ftrunc_u_v4f32
+}
+
+define void @ftrunc_u_v2f64(<2 x i64>* %c, <2 x double>* %a) nounwind {
+ ; CHECK: ftrunc_u_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = fptoui <2 x double> %1 to <2 x i64>
+ ; CHECK-DAG: ftrunc_u.d [[R3:\$w[0-9]+]], [[R1]]
+ store <2 x i64> %2, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ftrunc_u_v2f64
+}
+
+define void @ftrunc_s_v4f32(<4 x i32>* %c, <4 x float>* %a) nounwind {
+ ; CHECK: ftrunc_s_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = fptosi <4 x float> %1 to <4 x i32>
+ ; CHECK-DAG: ftrunc_s.w [[R3:\$w[0-9]+]], [[R1]]
+ store <4 x i32> %2, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ftrunc_s_v4f32
+}
+
+define void @ftrunc_s_v2f64(<2 x i64>* %c, <2 x double>* %a) nounwind {
+ ; CHECK: ftrunc_s_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = fptosi <2 x double> %1 to <2 x i64>
+ ; CHECK-DAG: ftrunc_s.d [[R3:\$w[0-9]+]], [[R1]]
+ store <2 x i64> %2, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ftrunc_s_v2f64
+}
+
+declare <4 x float> @llvm.fabs.v4f32(<4 x float> %Val)
+declare <2 x double> @llvm.fabs.v2f64(<2 x double> %Val)
+declare <4 x float> @llvm.exp2.v4f32(<4 x float> %val)
+declare <2 x double> @llvm.exp2.v2f64(<2 x double> %val)
+declare <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b,
+ <4 x float> %c)
+declare <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b,
+ <2 x double> %c)
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %Val)
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double> %Val)
diff --git a/test/CodeGen/Mips/msa/basic_operations.ll b/test/CodeGen/Mips/msa/basic_operations.ll
new file mode 100644
index 0000000..0169a07
--- /dev/null
+++ b/test/CodeGen/Mips/msa/basic_operations.ll
@@ -0,0 +1,481 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=MIPS32-AE -check-prefix=MIPS32-BE %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=MIPS32-AE -check-prefix=MIPS32-LE %s
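+; Both RUN lines share the MIPS32-AE check prefix; the MIPS32-BE and
+; MIPS32-LE prefixes cover the patterns that differ between big- and
+; little-endian lane ordering.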
+
+@v4i8 = global <4 x i8> <i8 0, i8 0, i8 0, i8 0>
+@v16i8 = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+@v8i16 = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+@v4i32 = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+@v2i64 = global <2 x i64> <i64 0, i64 0>
+@i64 = global i64 0
+
+define void @const_v16i8() nounwind {
+ ; MIPS32-AE: const_v16i8:
+
+ store volatile <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8>*@v16i8
+ ; MIPS32-AE: ldi.b [[R1:\$w[0-9]+]], 0
+
+ store volatile <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <16 x i8>*@v16i8
+ ; MIPS32-AE: ldi.b [[R1:\$w[0-9]+]], 1
+
+ store volatile <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 31>, <16 x i8>*@v16i8
+ ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], %lo(
+
+ store volatile <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6>, <16 x i8>*@v16i8
+ ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], %lo(
+
+ store volatile <16 x i8> <i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0>, <16 x i8>*@v16i8
+ ; MIPS32-BE: ldi.h [[R1:\$w[0-9]+]], 256
+ ; MIPS32-LE: ldi.h [[R1:\$w[0-9]+]], 1
+
+ store volatile <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 1, i8 2, i8 3, i8 4, i8 1, i8 2, i8 3, i8 4, i8 1, i8 2, i8 3, i8 4>, <16 x i8>*@v16i8
+ ; MIPS32-BE-DAG: lui [[R2:\$[0-9]+]], 258
+ ; MIPS32-LE-DAG: lui [[R2:\$[0-9]+]], 1027
+ ; MIPS32-BE-DAG: ori [[R2]], [[R2]], 772
+ ; MIPS32-LE-DAG: ori [[R2]], [[R2]], 513
+ ; MIPS32-AE-DAG: fill.w [[R1:\$w[0-9]+]], [[R2]]
+
+ store volatile <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, <16 x i8>*@v16i8
+ ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], %lo(
+
+ ret void
+ ; MIPS32-AE: .size const_v16i8
+}
+
+define void @const_v8i16() nounwind {
+ ; MIPS32-AE: const_v8i16:
+
+ store volatile <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <8 x i16>*@v8i16
+ ; MIPS32-AE: ldi.b [[R1:\$w[0-9]+]], 0
+
+ store volatile <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, <8 x i16>*@v8i16
+ ; MIPS32-AE: ldi.h [[R1:\$w[0-9]+]], 1
+
+ store volatile <8 x i16> <i16 1, i16 1, i16 1, i16 2, i16 1, i16 1, i16 1, i16 31>, <8 x i16>*@v8i16
+ ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], %lo(
+
+ store volatile <8 x i16> <i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028, i16 1028>, <8 x i16>*@v8i16
+ ; MIPS32-AE: ldi.b [[R1:\$w[0-9]+]], 4
+
+ store volatile <8 x i16> <i16 1, i16 2, i16 1, i16 2, i16 1, i16 2, i16 1, i16 2>, <8 x i16>*@v8i16
+ ; MIPS32-BE-DAG: lui [[R2:\$[0-9]+]], 1
+ ; MIPS32-LE-DAG: lui [[R2:\$[0-9]+]], 2
+ ; MIPS32-BE-DAG: ori [[R2]], [[R2]], 2
+ ; MIPS32-LE-DAG: ori [[R2]], [[R2]], 1
+ ; MIPS32-AE-DAG: fill.w [[R1:\$w[0-9]+]], [[R2]]
+
+ store volatile <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 1, i16 2, i16 3, i16 4>, <8 x i16>*@v8i16
+ ; MIPS32-AE: ld.h [[R1:\$w[0-9]+]], %lo(
+
+ ret void
+ ; MIPS32-AE: .size const_v8i16
+}
+
+define void @const_v4i32() nounwind {
+ ; MIPS32-AE: const_v4i32:
+
+ store volatile <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32>*@v4i32
+ ; MIPS32-AE: ldi.b [[R1:\$w[0-9]+]], 0
+
+ store volatile <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32>*@v4i32
+ ; MIPS32-AE: ldi.w [[R1:\$w[0-9]+]], 1
+
+ store volatile <4 x i32> <i32 1, i32 1, i32 1, i32 31>, <4 x i32>*@v4i32
+ ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], %lo(
+
+ store volatile <4 x i32> <i32 16843009, i32 16843009, i32 16843009, i32 16843009>, <4 x i32>*@v4i32
+ ; MIPS32-AE: ldi.b [[R1:\$w[0-9]+]], 1
+
+ store volatile <4 x i32> <i32 65537, i32 65537, i32 65537, i32 65537>, <4 x i32>*@v4i32
+ ; MIPS32-AE: ldi.h [[R1:\$w[0-9]+]], 1
+
+ store volatile <4 x i32> <i32 1, i32 2, i32 1, i32 2>, <4 x i32>*@v4i32
+ ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], %lo(
+
+ store volatile <4 x i32> <i32 3, i32 4, i32 5, i32 6>, <4 x i32>*@v4i32
+ ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], %lo(
+
+ ret void
+ ; MIPS32-AE: .size const_v4i32
+}
+
+define void @const_v2i64() nounwind {
+ ; MIPS32-AE: const_v2i64:
+
+ store volatile <2 x i64> <i64 0, i64 0>, <2 x i64>*@v2i64
+ ; MIPS32-AE: ldi.b [[R1:\$w[0-9]+]], 0
+
+ store volatile <2 x i64> <i64 72340172838076673, i64 72340172838076673>, <2 x i64>*@v2i64
+ ; MIPS32-AE: ldi.b [[R1:\$w[0-9]+]], 1
+
+ store volatile <2 x i64> <i64 281479271743489, i64 281479271743489>, <2 x i64>*@v2i64
+ ; MIPS32-AE: ldi.h [[R1:\$w[0-9]+]], 1
+
+ store volatile <2 x i64> <i64 4294967297, i64 4294967297>, <2 x i64>*@v2i64
+ ; MIPS32-AE: ldi.w [[R1:\$w[0-9]+]], 1
+
+ store volatile <2 x i64> <i64 1, i64 1>, <2 x i64>*@v2i64
+ ; MIPS32-AE: ldi.d [[R1:\$w[0-9]+]], 1
+
+ store volatile <2 x i64> <i64 1, i64 31>, <2 x i64>*@v2i64
+ ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], %lo(
+
+ store volatile <2 x i64> <i64 3, i64 4>, <2 x i64>*@v2i64
+ ; MIPS32-AE: ld.w [[R1:\$w[0-9]+]], %lo(
+
+ ret void
+ ; MIPS32-AE: .size const_v2i64
+}
+
+define void @nonconst_v16i8(i8 %a, i8 %b, i8 %c, i8 %d, i8 %e, i8 %f, i8 %g, i8 %h) nounwind {
+ ; MIPS32-AE: nonconst_v16i8:
+
+ %1 = insertelement <16 x i8> undef, i8 %a, i32 0
+ %2 = insertelement <16 x i8> %1, i8 %b, i32 1
+ %3 = insertelement <16 x i8> %2, i8 %c, i32 2
+ %4 = insertelement <16 x i8> %3, i8 %d, i32 3
+ %5 = insertelement <16 x i8> %4, i8 %e, i32 4
+ %6 = insertelement <16 x i8> %5, i8 %f, i32 5
+ %7 = insertelement <16 x i8> %6, i8 %g, i32 6
+ %8 = insertelement <16 x i8> %7, i8 %h, i32 7
+ %9 = insertelement <16 x i8> %8, i8 %h, i32 8
+ %10 = insertelement <16 x i8> %9, i8 %h, i32 9
+ %11 = insertelement <16 x i8> %10, i8 %h, i32 10
+ %12 = insertelement <16 x i8> %11, i8 %h, i32 11
+ %13 = insertelement <16 x i8> %12, i8 %h, i32 12
+ %14 = insertelement <16 x i8> %13, i8 %h, i32 13
+ %15 = insertelement <16 x i8> %14, i8 %h, i32 14
+ %16 = insertelement <16 x i8> %15, i8 %h, i32 15
+ ; MIPS32-AE-DAG: insert.b [[R1:\$w[0-9]+]][0], $4
+ ; MIPS32-AE-DAG: insert.b [[R1]][1], $5
+ ; MIPS32-AE-DAG: insert.b [[R1]][2], $6
+ ; MIPS32-AE-DAG: insert.b [[R1]][3], $7
+ ; MIPS32-BE-DAG: lbu [[R2:\$[0-9]+]], 19($sp)
+ ; MIPS32-LE-DAG: lbu [[R2:\$[0-9]+]], 16($sp)
+ ; MIPS32-AE-DAG: insert.b [[R1]][4], [[R2]]
+ ; MIPS32-BE-DAG: lbu [[R3:\$[0-9]+]], 23($sp)
+ ; MIPS32-LE-DAG: lbu [[R3:\$[0-9]+]], 20($sp)
+ ; MIPS32-AE-DAG: insert.b [[R1]][5], [[R3]]
+ ; MIPS32-BE-DAG: lbu [[R4:\$[0-9]+]], 27($sp)
+ ; MIPS32-LE-DAG: lbu [[R4:\$[0-9]+]], 24($sp)
+ ; MIPS32-AE-DAG: insert.b [[R1]][6], [[R4]]
+ ; MIPS32-BE-DAG: lbu [[R5:\$[0-9]+]], 31($sp)
+ ; MIPS32-LE-DAG: lbu [[R5:\$[0-9]+]], 28($sp)
+ ; MIPS32-AE-DAG: insert.b [[R1]][7], [[R5]]
+ ; MIPS32-AE-DAG: insert.b [[R1]][8], [[R5]]
+ ; MIPS32-AE-DAG: insert.b [[R1]][9], [[R5]]
+ ; MIPS32-AE-DAG: insert.b [[R1]][10], [[R5]]
+ ; MIPS32-AE-DAG: insert.b [[R1]][11], [[R5]]
+ ; MIPS32-AE-DAG: insert.b [[R1]][12], [[R5]]
+ ; MIPS32-AE-DAG: insert.b [[R1]][13], [[R5]]
+ ; MIPS32-AE-DAG: insert.b [[R1]][14], [[R5]]
+ ; MIPS32-AE-DAG: insert.b [[R1]][15], [[R5]]
+
+ store volatile <16 x i8> %16, <16 x i8>*@v16i8
+
+ ret void
+ ; MIPS32-AE: .size nonconst_v16i8
+}
+
+define void @nonconst_v8i16(i16 %a, i16 %b, i16 %c, i16 %d, i16 %e, i16 %f, i16 %g, i16 %h) nounwind {
+ ; MIPS32-AE: nonconst_v8i16:
+
+ %1 = insertelement <8 x i16> undef, i16 %a, i32 0
+ %2 = insertelement <8 x i16> %1, i16 %b, i32 1
+ %3 = insertelement <8 x i16> %2, i16 %c, i32 2
+ %4 = insertelement <8 x i16> %3, i16 %d, i32 3
+ %5 = insertelement <8 x i16> %4, i16 %e, i32 4
+ %6 = insertelement <8 x i16> %5, i16 %f, i32 5
+ %7 = insertelement <8 x i16> %6, i16 %g, i32 6
+ %8 = insertelement <8 x i16> %7, i16 %h, i32 7
+ ; MIPS32-AE-DAG: insert.h [[R1:\$w[0-9]+]][0], $4
+ ; MIPS32-AE-DAG: insert.h [[R1]][1], $5
+ ; MIPS32-AE-DAG: insert.h [[R1]][2], $6
+ ; MIPS32-AE-DAG: insert.h [[R1]][3], $7
+ ; MIPS32-BE-DAG: lhu [[R2:\$[0-9]+]], 18($sp)
+ ; MIPS32-LE-DAG: lhu [[R2:\$[0-9]+]], 16($sp)
+ ; MIPS32-AE-DAG: insert.h [[R1]][4], [[R2]]
+ ; MIPS32-BE-DAG: lhu [[R2:\$[0-9]+]], 22($sp)
+ ; MIPS32-LE-DAG: lhu [[R2:\$[0-9]+]], 20($sp)
+ ; MIPS32-AE-DAG: insert.h [[R1]][5], [[R2]]
+ ; MIPS32-BE-DAG: lhu [[R2:\$[0-9]+]], 26($sp)
+ ; MIPS32-LE-DAG: lhu [[R2:\$[0-9]+]], 24($sp)
+ ; MIPS32-AE-DAG: insert.h [[R1]][6], [[R2]]
+ ; MIPS32-BE-DAG: lhu [[R2:\$[0-9]+]], 30($sp)
+ ; MIPS32-LE-DAG: lhu [[R2:\$[0-9]+]], 28($sp)
+ ; MIPS32-AE-DAG: insert.h [[R1]][7], [[R2]]
+
+ store volatile <8 x i16> %8, <8 x i16>*@v8i16
+
+ ret void
+ ; MIPS32-AE: .size nonconst_v8i16
+}
+
+define void @nonconst_v4i32(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
+ ; MIPS32-AE: nonconst_v4i32:
+
+ %1 = insertelement <4 x i32> undef, i32 %a, i32 0
+ %2 = insertelement <4 x i32> %1, i32 %b, i32 1
+ %3 = insertelement <4 x i32> %2, i32 %c, i32 2
+ %4 = insertelement <4 x i32> %3, i32 %d, i32 3
+ ; MIPS32-AE: insert.w [[R1:\$w[0-9]+]][0], $4
+ ; MIPS32-AE: insert.w [[R1]][1], $5
+ ; MIPS32-AE: insert.w [[R1]][2], $6
+ ; MIPS32-AE: insert.w [[R1]][3], $7
+
+ store volatile <4 x i32> %4, <4 x i32>*@v4i32
+
+ ret void
+ ; MIPS32-AE: .size nonconst_v4i32
+}
+
+define void @nonconst_v2i64(i64 %a, i64 %b) nounwind {
+ ; MIPS32-AE: nonconst_v2i64:
+
+ %1 = insertelement <2 x i64> undef, i64 %a, i32 0
+ %2 = insertelement <2 x i64> %1, i64 %b, i32 1
+ ; MIPS32-AE: insert.w [[R1:\$w[0-9]+]][0], $4
+ ; MIPS32-AE: insert.w [[R1]][1], $5
+ ; MIPS32-AE: insert.w [[R1]][2], $6
+ ; MIPS32-AE: insert.w [[R1]][3], $7
+
+ store volatile <2 x i64> %2, <2 x i64>*@v2i64
+
+ ret void
+ ; MIPS32-AE: .size nonconst_v2i64
+}
+
+define i32 @extract_sext_v16i8() nounwind {
+ ; MIPS32-AE: extract_sext_v16i8:
+
+ %1 = load <16 x i8>* @v16i8
+ ; MIPS32-AE-DAG: ld.b [[R1:\$w[0-9]+]],
+
+ %2 = add <16 x i8> %1, %1
+ ; MIPS32-AE-DAG: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+ %3 = extractelement <16 x i8> %2, i32 1
+ %4 = sext i8 %3 to i32
+ ; MIPS32-AE-DAG: copy_s.b [[R3:\$[0-9]+]], [[R1]][1]
+ ; MIPS32-AE-NOT: sll
+ ; MIPS32-AE-NOT: sra
+
+ ret i32 %4
+ ; MIPS32-AE: .size extract_sext_v16i8
+}
+
+define i32 @extract_sext_v8i16() nounwind {
+ ; MIPS32-AE: extract_sext_v8i16:
+
+ %1 = load <8 x i16>* @v8i16
+ ; MIPS32-AE-DAG: ld.h [[R1:\$w[0-9]+]],
+
+ %2 = add <8 x i16> %1, %1
+ ; MIPS32-AE-DAG: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+ %3 = extractelement <8 x i16> %2, i32 1
+ %4 = sext i16 %3 to i32
+ ; MIPS32-AE-DAG: copy_s.h [[R3:\$[0-9]+]], [[R1]][1]
+ ; MIPS32-AE-NOT: sll
+ ; MIPS32-AE-NOT: sra
+
+ ret i32 %4
+ ; MIPS32-AE: .size extract_sext_v8i16
+}
+
+define i32 @extract_sext_v4i32() nounwind {
+ ; MIPS32-AE: extract_sext_v4i32:
+
+ %1 = load <4 x i32>* @v4i32
+ ; MIPS32-AE-DAG: ld.w [[R1:\$w[0-9]+]],
+
+ %2 = add <4 x i32> %1, %1
+ ; MIPS32-AE-DAG: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+ %3 = extractelement <4 x i32> %2, i32 1
+ ; MIPS32-AE-DAG: copy_s.w [[R3:\$[0-9]+]], [[R1]][1]
+
+ ret i32 %3
+ ; MIPS32-AE: .size extract_sext_v4i32
+}
+
+define i64 @extract_sext_v2i64() nounwind {
+ ; MIPS32-AE: extract_sext_v2i64:
+
+ %1 = load <2 x i64>* @v2i64
+ ; MIPS32-AE-DAG: ld.d [[R1:\$w[0-9]+]],
+
+ %2 = add <2 x i64> %1, %1
+ ; MIPS32-AE-DAG: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+ %3 = extractelement <2 x i64> %2, i32 1
+ ; MIPS32-AE-DAG: copy_s.w [[R3:\$[0-9]+]], [[R1]][2]
+ ; MIPS32-AE-DAG: copy_s.w [[R4:\$[0-9]+]], [[R1]][3]
+ ; MIPS32-AE-NOT: sll
+ ; MIPS32-AE-NOT: sra
+
+ ret i64 %3
+ ; MIPS32-AE: .size extract_sext_v2i64
+}
+
+define i32 @extract_zext_v16i8() nounwind {
+ ; MIPS32-AE: extract_zext_v16i8:
+
+ %1 = load <16 x i8>* @v16i8
+ ; MIPS32-AE-DAG: ld.b [[R1:\$w[0-9]+]],
+
+ %2 = add <16 x i8> %1, %1
+ ; MIPS32-AE-DAG: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+ %3 = extractelement <16 x i8> %2, i32 1
+ %4 = zext i8 %3 to i32
+ ; MIPS32-AE-DAG: copy_u.b [[R3:\$[0-9]+]], [[R1]][1]
+ ; MIPS32-AE-NOT: andi
+
+ ret i32 %4
+ ; MIPS32-AE: .size extract_zext_v16i8
+}
+
+define i32 @extract_zext_v8i16() nounwind {
+ ; MIPS32-AE: extract_zext_v8i16:
+
+ %1 = load <8 x i16>* @v8i16
+ ; MIPS32-AE-DAG: ld.h [[R1:\$w[0-9]+]],
+
+ %2 = add <8 x i16> %1, %1
+ ; MIPS32-AE-DAG: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+ %3 = extractelement <8 x i16> %2, i32 1
+ %4 = zext i16 %3 to i32
+ ; MIPS32-AE-DAG: copy_u.h [[R3:\$[0-9]+]], [[R1]][1]
+ ; MIPS32-AE-NOT: andi
+
+ ret i32 %4
+ ; MIPS32-AE: .size extract_zext_v8i16
+}
+
+define i32 @extract_zext_v4i32() nounwind {
+ ; MIPS32-AE: extract_zext_v4i32:
+
+ %1 = load <4 x i32>* @v4i32
+ ; MIPS32-AE-DAG: ld.w [[R1:\$w[0-9]+]],
+
+ %2 = add <4 x i32> %1, %1
+ ; MIPS32-AE-DAG: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+ %3 = extractelement <4 x i32> %2, i32 1
+ ; MIPS32-AE-DAG: copy_{{[su]}}.w [[R3:\$[0-9]+]], [[R1]][1]
+
+ ret i32 %3
+ ; MIPS32-AE: .size extract_zext_v4i32
+}
+
+define i64 @extract_zext_v2i64() nounwind {
+ ; MIPS32-AE: extract_zext_v2i64:
+
+ %1 = load <2 x i64>* @v2i64
+ ; MIPS32-AE-DAG: ld.d [[R1:\$w[0-9]+]],
+
+ %2 = add <2 x i64> %1, %1
+ ; MIPS32-AE-DAG: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+ %3 = extractelement <2 x i64> %2, i32 1
+ ; MIPS32-AE-DAG: copy_{{[su]}}.w [[R3:\$[0-9]+]], [[R1]][2]
+ ; MIPS32-AE-DAG: copy_{{[su]}}.w [[R4:\$[0-9]+]], [[R1]][3]
+ ; MIPS32-AE-NOT: andi
+
+ ret i64 %3
+ ; MIPS32-AE: .size extract_zext_v2i64
+}
+
+define void @insert_v16i8(i32 %a) nounwind {
+ ; MIPS32-AE: insert_v16i8:
+
+ %1 = load <16 x i8>* @v16i8
+ ; MIPS32-AE-DAG: ld.b [[R1:\$w[0-9]+]],
+
+ %a2 = trunc i32 %a to i8
+ %a3 = sext i8 %a2 to i32
+ %a4 = trunc i32 %a3 to i8
+ ; MIPS32-AE-NOT: andi
+ ; MIPS32-AE-NOT: sra
+
+ %2 = insertelement <16 x i8> %1, i8 %a4, i32 1
+ ; MIPS32-AE-DAG: insert.b [[R1]][1], $4
+
+ store <16 x i8> %2, <16 x i8>* @v16i8
+ ; MIPS32-AE-DAG: st.b [[R1]]
+
+ ret void
+ ; MIPS32-AE: .size insert_v16i8
+}
+
+define void @insert_v8i16(i32 %a) nounwind {
+ ; MIPS32-AE: insert_v8i16:
+
+ %1 = load <8 x i16>* @v8i16
+ ; MIPS32-AE-DAG: ld.h [[R1:\$w[0-9]+]],
+
+ %a2 = trunc i32 %a to i16
+ %a3 = sext i16 %a2 to i32
+ %a4 = trunc i32 %a3 to i16
+ ; MIPS32-AE-NOT: andi
+ ; MIPS32-AE-NOT: sra
+
+ %2 = insertelement <8 x i16> %1, i16 %a4, i32 1
+ ; MIPS32-AE-DAG: insert.h [[R1]][1], $4
+
+ store <8 x i16> %2, <8 x i16>* @v8i16
+ ; MIPS32-AE-DAG: st.h [[R1]]
+
+ ret void
+ ; MIPS32-AE: .size insert_v8i16
+}
+
+define void @insert_v4i32(i32 %a) nounwind {
+ ; MIPS32-AE: insert_v4i32:
+
+ %1 = load <4 x i32>* @v4i32
+ ; MIPS32-AE-DAG: ld.w [[R1:\$w[0-9]+]],
+
+ ; MIPS32-AE-NOT: andi
+ ; MIPS32-AE-NOT: sra
+
+ %2 = insertelement <4 x i32> %1, i32 %a, i32 1
+ ; MIPS32-AE-DAG: insert.w [[R1]][1], $4
+
+ store <4 x i32> %2, <4 x i32>* @v4i32
+ ; MIPS32-AE-DAG: st.w [[R1]]
+
+ ret void
+ ; MIPS32-AE: .size insert_v4i32
+}
+
+define void @insert_v2i64(i64 %a) nounwind {
+ ; MIPS32-AE: insert_v2i64:
+
+ %1 = load <2 x i64>* @v2i64
+ ; MIPS32-AE-DAG: ld.w [[R1:\$w[0-9]+]],
+
+ ; MIPS32-AE-NOT: andi
+ ; MIPS32-AE-NOT: sra
+
+ %2 = insertelement <2 x i64> %1, i64 %a, i32 1
+ ; MIPS32-AE-DAG: insert.w [[R1]][2], $4
+ ; MIPS32-AE-DAG: insert.w [[R1]][3], $5
+
+ store <2 x i64> %2, <2 x i64>* @v2i64
+ ; MIPS32-AE-DAG: st.w [[R1]]
+
+ ret void
+ ; MIPS32-AE: .size insert_v2i64
+}
+
+define void @truncstore() nounwind {
+ ; MIPS32-AE: truncstore:
+
+ store volatile <4 x i8> <i8 -1, i8 -1, i8 -1, i8 -1>, <4 x i8>*@v4i8
+ ; TODO: What code should be emitted?
+
+ ret void
+ ; MIPS32-AE: .size truncstore
+}
diff --git a/test/CodeGen/Mips/msa/basic_operations_float.ll b/test/CodeGen/Mips/msa/basic_operations_float.ll
new file mode 100644
index 0000000..1f53810
--- /dev/null
+++ b/test/CodeGen/Mips/msa/basic_operations_float.ll
@@ -0,0 +1,207 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=MIPS32 %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=MIPS32 %s
+
+@v4f32 = global <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>
+@v2f64 = global <2 x double> <double 0.0, double 0.0>
+@f32 = global float 0.0
+@f64 = global double 0.0
+
+define void @const_v4f32() nounwind {
+ ; MIPS32: const_v4f32:
+
+ store volatile <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>, <4 x float>*@v4f32
+ ; MIPS32: ldi.b [[R1:\$w[0-9]+]], 0
+
+ store volatile <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float>*@v4f32
+ ; MIPS32: lui [[R1:\$[0-9]+]], 16256
+ ; MIPS32: fill.w [[R2:\$w[0-9]+]], [[R1]]
+
+ store volatile <4 x float> <float 1.0, float 1.0, float 1.0, float 31.0>, <4 x float>*@v4f32
+ ; MIPS32: ld.w [[R1:\$w[0-9]+]], %lo(
+
+ store volatile <4 x float> <float 65537.0, float 65537.0, float 65537.0, float 65537.0>, <4 x float>*@v4f32
+ ; MIPS32: lui [[R1:\$[0-9]+]], 18304
+ ; MIPS32: ori [[R2:\$[0-9]+]], [[R1]], 128
+ ; MIPS32: fill.w [[R3:\$w[0-9]+]], [[R2]]
+
+ store volatile <4 x float> <float 1.0, float 2.0, float 1.0, float 2.0>, <4 x float>*@v4f32
+ ; MIPS32: ld.w [[R1:\$w[0-9]+]], %lo(
+
+ store volatile <4 x float> <float 3.0, float 4.0, float 5.0, float 6.0>, <4 x float>*@v4f32
+ ; MIPS32: ld.w [[R1:\$w[0-9]+]], %lo(
+
+ ret void
+ ; MIPS32: .size const_v4f32
+}
+
+define void @const_v2f64() nounwind {
+ ; MIPS32: const_v2f64:
+
+ store volatile <2 x double> <double 0.0, double 0.0>, <2 x double>*@v2f64
+ ; MIPS32: ldi.b [[R1:\$w[0-9]+]], 0
+
+ store volatile <2 x double> <double 72340172838076673.0, double 72340172838076673.0>, <2 x double>*@v2f64
+ ; MIPS32: ld.d [[R1:\$w[0-9]+]], %lo(
+
+ store volatile <2 x double> <double 281479271743489.0, double 281479271743489.0>, <2 x double>*@v2f64
+ ; MIPS32: ld.d [[R1:\$w[0-9]+]], %lo(
+
+ store volatile <2 x double> <double 4294967297.0, double 4294967297.0>, <2 x double>*@v2f64
+ ; MIPS32: ld.d [[R1:\$w[0-9]+]], %lo(
+
+ store volatile <2 x double> <double 1.0, double 1.0>, <2 x double>*@v2f64
+ ; MIPS32: ld.d [[R1:\$w[0-9]+]], %lo(
+
+ store volatile <2 x double> <double 1.0, double 31.0>, <2 x double>*@v2f64
+ ; MIPS32: ld.d [[R1:\$w[0-9]+]], %lo(
+
+ store volatile <2 x double> <double 3.0, double 4.0>, <2 x double>*@v2f64
+ ; MIPS32: ld.d [[R1:\$w[0-9]+]], %lo(
+
+ ret void
+ ; MIPS32: .size const_v2f64
+}
+
+define void @nonconst_v4f32() nounwind {
+ ; MIPS32: nonconst_v4f32:
+
+ %1 = load float *@f32
+ %2 = insertelement <4 x float> undef, float %1, i32 0
+ %3 = insertelement <4 x float> %2, float %1, i32 1
+ %4 = insertelement <4 x float> %3, float %1, i32 2
+ %5 = insertelement <4 x float> %4, float %1, i32 3
+ store volatile <4 x float> %5, <4 x float>*@v4f32
+ ; MIPS32: lwc1 $f[[R1:[0-9]+]], 0(
+ ; MIPS32: splati.w [[R2:\$w[0-9]+]], $w[[R1]]
+
+ ret void
+ ; MIPS32: .size nonconst_v4f32
+}
+
+define void @nonconst_v2f64() nounwind {
+ ; MIPS32: nonconst_v2f64:
+
+ %1 = load double *@f64
+ %2 = insertelement <2 x double> undef, double %1, i32 0
+ %3 = insertelement <2 x double> %2, double %1, i32 1
+ store volatile <2 x double> %3, <2 x double>*@v2f64
+ ; MIPS32: ldc1 $f[[R1:[0-9]+]], 0(
+ ; MIPS32: splati.d [[R2:\$w[0-9]+]], $w[[R1]]
+
+ ret void
+ ; MIPS32: .size nonconst_v2f64
+}
+
+define float @extract_v4f32() nounwind {
+ ; MIPS32: extract_v4f32:
+
+ %1 = load <4 x float>* @v4f32
+ ; MIPS32-DAG: ld.w [[R1:\$w[0-9]+]],
+
+ %2 = fadd <4 x float> %1, %1
+ ; MIPS32-DAG: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+ %3 = extractelement <4 x float> %2, i32 1
+ ; Element 1 can be obtained by splatting it across the vector and extracting
+ ; $w0:sub_lo
+ ; MIPS32-DAG: splati.w $w0, [[R1]][1]
+
+ ret float %3
+ ; MIPS32: .size extract_v4f32
+}
+
+define float @extract_v4f32_elt0() nounwind {
+ ; MIPS32: extract_v4f32_elt0:
+
+ %1 = load <4 x float>* @v4f32
+ ; MIPS32-DAG: ld.w [[R1:\$w[0-9]+]],
+
+ %2 = fadd <4 x float> %1, %1
+ ; MIPS32-DAG: fadd.w $w0, [[R1]], [[R1]]
+
+ %3 = extractelement <4 x float> %2, i32 0
+ ; Element 0 can be obtained by extracting $w0:sub_lo ($f0)
+ ; MIPS32-NOT: copy_u.w
+ ; MIPS32-NOT: mtc1
+
+ ret float %3
+ ; MIPS32: .size extract_v4f32_elt0
+}
+
+define double @extract_v2f64() nounwind {
+ ; MIPS32: extract_v2f64:
+
+ %1 = load <2 x double>* @v2f64
+ ; MIPS32-DAG: ld.d [[R1:\$w[0-9]+]],
+
+ %2 = fadd <2 x double> %1, %1
+ ; MIPS32-DAG: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+
+ %3 = extractelement <2 x double> %2, i32 1
+ ; Element 1 can be obtained by splatting it across the vector and extracting
+ ; $w0:sub_64
+ ; MIPS32-DAG: splati.d $w0, [[R1]][1]
+ ; MIPS32-NOT: copy_u.w
+ ; MIPS32-NOT: mtc1
+ ; MIPS32-NOT: mthc1
+ ; MIPS32-NOT: sll
+ ; MIPS32-NOT: sra
+
+ ret double %3
+ ; MIPS32: .size extract_v2f64
+}
+
+define double @extract_v2f64_elt0() nounwind {
+ ; MIPS32: extract_v2f64_elt0:
+
+ %1 = load <2 x double>* @v2f64
+ ; MIPS32-DAG: ld.d [[R1:\$w[0-9]+]],
+
+ %2 = fadd <2 x double> %1, %1
+ ; MIPS32-DAG: fadd.d $w0, [[R1]], [[R1]]
+
+ %3 = extractelement <2 x double> %2, i32 0
+ ; Element 0 can be obtained by extracting $w0:sub_64 ($f0)
+ ; MIPS32-NOT: copy_u.w
+ ; MIPS32-NOT: mtc1
+ ; MIPS32-NOT: mthc1
+ ; MIPS32-NOT: sll
+ ; MIPS32-NOT: sra
+
+ ret double %3
+ ; MIPS32: .size extract_v2f64_elt0
+}
+
+define void @insert_v4f32(float %a) nounwind {
+ ; MIPS32: insert_v4f32:
+
+ %1 = load <4 x float>* @v4f32
+ ; MIPS32-DAG: ld.w [[R1:\$w[0-9]+]],
+
+ %2 = insertelement <4 x float> %1, float %a, i32 1
+ ; float argument passed in $f12
+ ; MIPS32-DAG: insve.w [[R1]][1], $w12[0]
+
+ store <4 x float> %2, <4 x float>* @v4f32
+ ; MIPS32-DAG: st.w [[R1]]
+
+ ret void
+ ; MIPS32: .size insert_v4f32
+}
+
+define void @insert_v2f64(double %a) nounwind {
+ ; MIPS32: insert_v2f64:
+
+ %1 = load <2 x double>* @v2f64
+ ; MIPS32-DAG: ld.d [[R1:\$w[0-9]+]],
+
+ %2 = insertelement <2 x double> %1, double %a, i32 1
+ ; double argument passed in $f12
+ ; MIPS32-DAG: insve.d [[R1]][1], $w12[0]
+
+ store <2 x double> %2, <2 x double>* @v2f64
+ ; MIPS32-DAG: st.d [[R1]]
+
+ ret void
+ ; MIPS32: .size insert_v2f64
+}
diff --git a/test/CodeGen/Mips/msa/bit.ll b/test/CodeGen/Mips/msa/bit.ll
new file mode 100644
index 0000000..59ddbe1
--- /dev/null
+++ b/test/CodeGen/Mips/msa/bit.ll
@@ -0,0 +1,537 @@
+; Test the MSA intrinsics that are encoded with the BIT instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_sat_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_sat_s_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_sat_s_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_sat_s_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.sat.s.b(<16 x i8> %0, i32 7)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_sat_s_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.sat.s.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_sat_s_b_test:
+; CHECK: ld.b
+; CHECK: sat_s.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_sat_s_b_test
+;
+@llvm_mips_sat_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_sat_s_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_sat_s_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_sat_s_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.sat.s.h(<8 x i16> %0, i32 7)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_sat_s_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.sat.s.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_sat_s_h_test:
+; CHECK: ld.h
+; CHECK: sat_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_sat_s_h_test
+;
+@llvm_mips_sat_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_sat_s_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_sat_s_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_sat_s_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.sat.s.w(<4 x i32> %0, i32 7)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_sat_s_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.sat.s.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_sat_s_w_test:
+; CHECK: ld.w
+; CHECK: sat_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_sat_s_w_test
+;
+@llvm_mips_sat_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_sat_s_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_sat_s_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_sat_s_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.sat.s.d(<2 x i64> %0, i32 7)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_sat_s_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.sat.s.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_sat_s_d_test:
+; CHECK: ld.d
+; CHECK: sat_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_sat_s_d_test
+;
+@llvm_mips_sat_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_sat_u_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_sat_u_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_sat_u_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.sat.u.b(<16 x i8> %0, i32 7)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_sat_u_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.sat.u.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_sat_u_b_test:
+; CHECK: ld.b
+; CHECK: sat_u.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_sat_u_b_test
+;
+@llvm_mips_sat_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_sat_u_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_sat_u_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_sat_u_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.sat.u.h(<8 x i16> %0, i32 7)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_sat_u_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.sat.u.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_sat_u_h_test:
+; CHECK: ld.h
+; CHECK: sat_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_sat_u_h_test
+;
+@llvm_mips_sat_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_sat_u_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_sat_u_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_sat_u_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.sat.u.w(<4 x i32> %0, i32 7)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_sat_u_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.sat.u.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_sat_u_w_test:
+; CHECK: ld.w
+; CHECK: sat_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_sat_u_w_test
+;
+@llvm_mips_sat_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_sat_u_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_sat_u_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_sat_u_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.sat.u.d(<2 x i64> %0, i32 7)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_sat_u_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.sat.u.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_sat_u_d_test:
+; CHECK: ld.d
+; CHECK: sat_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_sat_u_d_test
+;
+@llvm_mips_slli_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_slli_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_slli_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_slli_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.slli.b(<16 x i8> %0, i32 7)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_slli_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.slli.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_slli_b_test:
+; CHECK: ld.b
+; CHECK: slli.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_slli_b_test
+;
+@llvm_mips_slli_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_slli_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_slli_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_slli_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.slli.h(<8 x i16> %0, i32 7)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_slli_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.slli.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_slli_h_test:
+; CHECK: ld.h
+; CHECK: slli.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_slli_h_test
+;
+@llvm_mips_slli_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_slli_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_slli_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_slli_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.slli.w(<4 x i32> %0, i32 7)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_slli_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.slli.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_slli_w_test:
+; CHECK: ld.w
+; CHECK: slli.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_slli_w_test
+;
+@llvm_mips_slli_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_slli_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_slli_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_slli_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.slli.d(<2 x i64> %0, i32 7)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_slli_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.slli.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_slli_d_test:
+; CHECK: ld.d
+; CHECK: slli.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_slli_d_test
+;
+@llvm_mips_srai_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_srai_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_srai_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_srai_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.srai.b(<16 x i8> %0, i32 7)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_srai_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.srai.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_srai_b_test:
+; CHECK: ld.b
+; CHECK: srai.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_srai_b_test
+;
+@llvm_mips_srai_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_srai_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_srai_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_srai_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.srai.h(<8 x i16> %0, i32 7)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_srai_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.srai.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_srai_h_test:
+; CHECK: ld.h
+; CHECK: srai.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_srai_h_test
+;
+@llvm_mips_srai_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_srai_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_srai_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_srai_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.srai.w(<4 x i32> %0, i32 7)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_srai_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.srai.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_srai_w_test:
+; CHECK: ld.w
+; CHECK: srai.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_srai_w_test
+;
+@llvm_mips_srai_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_srai_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_srai_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_srai_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.srai.d(<2 x i64> %0, i32 7)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_srai_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.srai.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_srai_d_test:
+; CHECK: ld.d
+; CHECK: srai.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_srai_d_test
+;
+@llvm_mips_srari_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_srari_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_srari_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_srari_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.srari.b(<16 x i8> %0, i32 7)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_srari_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.srari.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_srari_b_test:
+; CHECK: ld.b
+; CHECK: srari.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_srari_b_test
+;
+@llvm_mips_srari_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_srari_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_srari_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_srari_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.srari.h(<8 x i16> %0, i32 7)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_srari_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.srari.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_srari_h_test:
+; CHECK: ld.h
+; CHECK: srari.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_srari_h_test
+;
+@llvm_mips_srari_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_srari_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_srari_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_srari_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.srari.w(<4 x i32> %0, i32 7)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_srari_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.srari.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_srari_w_test:
+; CHECK: ld.w
+; CHECK: srari.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_srari_w_test
+;
+@llvm_mips_srari_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_srari_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_srari_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_srari_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.srari.d(<2 x i64> %0, i32 7)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_srari_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.srari.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_srari_d_test:
+; CHECK: ld.d
+; CHECK: srari.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_srari_d_test
+;
+@llvm_mips_srli_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_srli_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_srli_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_srli_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.srli.b(<16 x i8> %0, i32 7)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_srli_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.srli.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_srli_b_test:
+; CHECK: ld.b
+; CHECK: srli.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_srli_b_test
+;
+@llvm_mips_srli_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_srli_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_srli_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_srli_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.srli.h(<8 x i16> %0, i32 7)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_srli_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.srli.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_srli_h_test:
+; CHECK: ld.h
+; CHECK: srli.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_srli_h_test
+;
+@llvm_mips_srli_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_srli_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_srli_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_srli_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.srli.w(<4 x i32> %0, i32 7)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_srli_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.srli.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_srli_w_test:
+; CHECK: ld.w
+; CHECK: srli.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_srli_w_test
+;
+@llvm_mips_srli_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_srli_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_srli_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_srli_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.srli.d(<2 x i64> %0, i32 7)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_srli_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.srli.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_srli_d_test:
+; CHECK: ld.d
+; CHECK: srli.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_srli_d_test
+;
+@llvm_mips_srlri_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_srlri_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_srlri_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_srlri_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.srlri.b(<16 x i8> %0, i32 7)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_srlri_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.srlri.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_srlri_b_test:
+; CHECK: ld.b
+; CHECK: srlri.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_srlri_b_test
+;
+@llvm_mips_srlri_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_srlri_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_srlri_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_srlri_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.srlri.h(<8 x i16> %0, i32 7)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_srlri_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.srlri.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_srlri_h_test:
+; CHECK: ld.h
+; CHECK: srlri.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_srlri_h_test
+;
+@llvm_mips_srlri_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_srlri_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_srlri_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_srlri_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.srlri.w(<4 x i32> %0, i32 7)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_srlri_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.srlri.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_srlri_w_test:
+; CHECK: ld.w
+; CHECK: srlri.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_srlri_w_test
+;
+@llvm_mips_srlri_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_srlri_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_srlri_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_srlri_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.srlri.d(<2 x i64> %0, i32 7)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_srlri_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.srlri.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_srlri_d_test:
+; CHECK: ld.d
+; CHECK: srlri.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_srlri_d_test
+;
diff --git a/test/CodeGen/Mips/msa/bitcast.ll b/test/CodeGen/Mips/msa/bitcast.ll
new file mode 100644
index 0000000..8e880ec
--- /dev/null
+++ b/test/CodeGen/Mips/msa/bitcast.ll
@@ -0,0 +1,1210 @@
+; Test the bitcast operation for big-endian and little-endian targets.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=BIGENDIAN %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=LITENDIAN %s
+
+define void @v16i8_to_v16i8(<16 x i8>* %src, <16 x i8>* %dst) nounwind {
+entry:
+ %0 = load volatile <16 x i8>* %src
+ %1 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %0)
+ %2 = bitcast <16 x i8> %1 to <16 x i8>
+ %3 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %2, <16 x i8> %2)
+ store <16 x i8> %3, <16 x i8>* %dst
+ ret void
+}
+
+; LITENDIAN: v16i8_to_v16i8:
+; LITENDIAN: ld.b [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.b [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.b [[R3]],
+; LITENDIAN: .size v16i8_to_v16i8
+
+; BIGENDIAN: v16i8_to_v16i8:
+; BIGENDIAN: ld.b [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: addv.b [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.b [[R3]],
+; BIGENDIAN: .size v16i8_to_v16i8
+
+define void @v16i8_to_v8i16(<16 x i8>* %src, <8 x i16>* %dst) nounwind {
+entry:
+ %0 = load volatile <16 x i8>* %src
+ %1 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %0)
+ %2 = bitcast <16 x i8> %1 to <8 x i16>
+ %3 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %2, <8 x i16> %2)
+ store <8 x i16> %3, <8 x i16>* %dst
+ ret void
+}
+
+; LITENDIAN: v16i8_to_v8i16:
+; LITENDIAN: ld.b [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.h [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.h [[R3]],
+; LITENDIAN: .size v16i8_to_v8i16
+
+; BIGENDIAN: v16i8_to_v8i16:
+; BIGENDIAN: ld.b [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: addv.h [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.h [[R4]],
+; BIGENDIAN: .size v16i8_to_v8i16
+
+; We can't prevent the (store (bitcast X), Y) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v16i8_to_v8f16(<16 x i8>* %src, <8 x half>* %dst) nounwind {
+entry:
+ %0 = load volatile <16 x i8>* %src
+ %1 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %0)
+ %2 = bitcast <16 x i8> %1 to <8 x half>
+ store <8 x half> %2, <8 x half>* %dst
+ ret void
+}
+
+; LITENDIAN: v16i8_to_v8f16:
+; LITENDIAN: ld.b [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.b [[R2]],
+; LITENDIAN: .size v16i8_to_v8f16
+
+; BIGENDIAN: v16i8_to_v8f16:
+; BIGENDIAN: ld.b [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: st.b [[R2]],
+; BIGENDIAN: .size v16i8_to_v8f16
+
+define void @v16i8_to_v4i32(<16 x i8>* %src, <4 x i32>* %dst) nounwind {
+entry:
+ %0 = load volatile <16 x i8>* %src
+ %1 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %0)
+ %2 = bitcast <16 x i8> %1 to <4 x i32>
+ %3 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %2, <4 x i32> %2)
+ store <4 x i32> %3, <4 x i32>* %dst
+ ret void
+}
+
+; LITENDIAN: v16i8_to_v4i32:
+; LITENDIAN: ld.b [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v16i8_to_v4i32
+
+; BIGENDIAN: v16i8_to_v4i32:
+; BIGENDIAN: ld.b [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: addv.w [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.w [[R4]],
+; BIGENDIAN: .size v16i8_to_v4i32
+
+define void @v16i8_to_v4f32(<16 x i8>* %src, <4 x float>* %dst) nounwind {
+entry:
+ %0 = load volatile <16 x i8>* %src
+ %1 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %0)
+ %2 = bitcast <16 x i8> %1 to <4 x float>
+ %3 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %2, <4 x float> %2)
+ store <4 x float> %3, <4 x float>* %dst
+ ret void
+}
+
+; LITENDIAN: v16i8_to_v4f32:
+; LITENDIAN: ld.b [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v16i8_to_v4f32
+
+; BIGENDIAN: v16i8_to_v4f32:
+; BIGENDIAN: ld.b [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: fadd.w [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.w [[R4]],
+; BIGENDIAN: .size v16i8_to_v4f32
+
+define void @v16i8_to_v2i64(<16 x i8>* %src, <2 x i64>* %dst) nounwind {
+entry:
+ %0 = load volatile <16 x i8>* %src
+ %1 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %0)
+ %2 = bitcast <16 x i8> %1 to <2 x i64>
+ %3 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %2, <2 x i64> %2)
+ store <2 x i64> %3, <2 x i64>* %dst
+ ret void
+}
+
+; LITENDIAN: v16i8_to_v2i64:
+; LITENDIAN: ld.b [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v16i8_to_v2i64
+
+; BIGENDIAN: v16i8_to_v2i64:
+; BIGENDIAN: ld.b [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R3]], 177
+; BIGENDIAN: addv.d [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.d [[R4]],
+; BIGENDIAN: .size v16i8_to_v2i64
+
+define void @v16i8_to_v2f64(<16 x i8>* %src, <2 x double>* %dst) nounwind {
+entry:
+ %0 = load volatile <16 x i8>* %src
+ %1 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %0)
+ %2 = bitcast <16 x i8> %1 to <2 x double>
+ %3 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %2, <2 x double> %2)
+ store <2 x double> %3, <2 x double>* %dst
+ ret void
+}
+
+; LITENDIAN: v16i8_to_v2f64:
+; LITENDIAN: ld.b [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v16i8_to_v2f64
+
+; BIGENDIAN: v16i8_to_v2f64:
+; BIGENDIAN: ld.b [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R3]], 177
+; BIGENDIAN: fadd.d [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.d [[R4]],
+; BIGENDIAN: .size v16i8_to_v2f64
+
+define void @v8i16_to_v16i8(<8 x i16>* %src, <16 x i8>* %dst) nounwind {
+entry:
+ %0 = load volatile <8 x i16>* %src
+ %1 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %0)
+ %2 = bitcast <8 x i16> %1 to <16 x i8>
+ %3 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %2, <16 x i8> %2)
+ store <16 x i8> %3, <16 x i8>* %dst
+ ret void
+}
+
+; LITENDIAN: v8i16_to_v16i8:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.b [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.b [[R3]],
+; LITENDIAN: .size v8i16_to_v16i8
+
+; BIGENDIAN: v8i16_to_v16i8:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: addv.b [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.b [[R4]],
+; BIGENDIAN: .size v8i16_to_v16i8
+
+define void @v8i16_to_v8i16(<8 x i16>* %src, <8 x i16>* %dst) nounwind {
+entry:
+ %0 = load volatile <8 x i16>* %src
+ %1 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %0)
+ %2 = bitcast <8 x i16> %1 to <8 x i16>
+ %3 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %2, <8 x i16> %2)
+ store <8 x i16> %3, <8 x i16>* %dst
+ ret void
+}
+
+; LITENDIAN: v8i16_to_v8i16:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.h [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.h [[R3]],
+; LITENDIAN: .size v8i16_to_v8i16
+
+; BIGENDIAN: v8i16_to_v8i16:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: addv.h [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.h [[R3]],
+; BIGENDIAN: .size v8i16_to_v8i16
+
+; We can't prevent the (store (bitcast X), Y) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v8i16_to_v8f16(<8 x i16>* %src, <8 x half>* %dst) nounwind {
+entry:
+ %0 = load volatile <8 x i16>* %src
+ %1 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %0)
+ %2 = bitcast <8 x i16> %1 to <8 x half>
+ store <8 x half> %2, <8 x half>* %dst
+ ret void
+}
+
+; LITENDIAN: v8i16_to_v8f16:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.h [[R2]],
+; LITENDIAN: .size v8i16_to_v8f16
+
+; BIGENDIAN: v8i16_to_v8f16:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: st.h [[R2]],
+; BIGENDIAN: .size v8i16_to_v8f16
+
+define void @v8i16_to_v4i32(<8 x i16>* %src, <4 x i32>* %dst) nounwind {
+entry:
+ %0 = load volatile <8 x i16>* %src
+ %1 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %0)
+ %2 = bitcast <8 x i16> %1 to <4 x i32>
+ %3 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %2, <4 x i32> %2)
+ store <4 x i32> %3, <4 x i32>* %dst
+ ret void
+}
+
+; LITENDIAN: v8i16_to_v4i32:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v8i16_to_v4i32
+
+; BIGENDIAN: v8i16_to_v4i32:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.h [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: addv.w [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.w [[R4]],
+; BIGENDIAN: .size v8i16_to_v4i32
+
+define void @v8i16_to_v4f32(<8 x i16>* %src, <4 x float>* %dst) nounwind {
+entry:
+ %0 = load volatile <8 x i16>* %src
+ %1 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %0)
+ %2 = bitcast <8 x i16> %1 to <4 x float>
+ %3 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %2, <4 x float> %2)
+ store <4 x float> %3, <4 x float>* %dst
+ ret void
+}
+
+; LITENDIAN: v8i16_to_v4f32:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v8i16_to_v4f32
+
+; BIGENDIAN: v8i16_to_v4f32:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.h [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: fadd.w [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.w [[R4]],
+; BIGENDIAN: .size v8i16_to_v4f32
+
+define void @v8i16_to_v2i64(<8 x i16>* %src, <2 x i64>* %dst) nounwind {
+entry:
+ %0 = load volatile <8 x i16>* %src
+ %1 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %0)
+ %2 = bitcast <8 x i16> %1 to <2 x i64>
+ %3 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %2, <2 x i64> %2)
+ store <2 x i64> %3, <2 x i64>* %dst
+ ret void
+}
+
+; LITENDIAN: v8i16_to_v2i64:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v8i16_to_v2i64
+
+; BIGENDIAN: v8i16_to_v2i64:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.h [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: addv.d [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.d [[R4]],
+; BIGENDIAN: .size v8i16_to_v2i64
+
+define void @v8i16_to_v2f64(<8 x i16>* %src, <2 x double>* %dst) nounwind {
+entry:
+ %0 = load volatile <8 x i16>* %src
+ %1 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %0)
+ %2 = bitcast <8 x i16> %1 to <2 x double>
+ %3 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %2, <2 x double> %2)
+ store <2 x double> %3, <2 x double>* %dst
+ ret void
+}
+
+; LITENDIAN: v8i16_to_v2f64:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v8i16_to_v2f64
+
+; BIGENDIAN: v8i16_to_v2f64:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.h [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: fadd.d [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.d [[R4]],
+; BIGENDIAN: .size v8i16_to_v2f64
+
+;----
+; We can't prevent the (bitcast (load X)) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v8f16_to_v16i8(<8 x half>* %src, <16 x i8>* %dst) nounwind {
+entry:
+ %0 = load volatile <8 x half>* %src
+ %1 = bitcast <8 x half> %0 to <16 x i8>
+ %2 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %1, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* %dst
+ ret void
+}
+
+; LITENDIAN: v8f16_to_v16i8:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.b [[R3:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.b [[R3]],
+; LITENDIAN: .size v8f16_to_v16i8
+
+; BIGENDIAN: v8f16_to_v16i8:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R1]], 177
+; BIGENDIAN: addv.b [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.b [[R4]],
+; BIGENDIAN: .size v8f16_to_v16i8
+
+; We can't prevent the (bitcast (load X)) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v8f16_to_v8i16(<8 x half>* %src, <8 x i16>* %dst) nounwind {
+entry:
+ %0 = load volatile <8 x half>* %src
+ %1 = bitcast <8 x half> %0 to <8 x i16>
+ %2 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %1, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* %dst
+ ret void
+}
+
+; LITENDIAN: v8f16_to_v8i16:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.h [[R2]],
+; LITENDIAN: .size v8f16_to_v8i16
+
+; BIGENDIAN: v8f16_to_v8i16:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: st.h [[R2]],
+; BIGENDIAN: .size v8f16_to_v8i16
+
+; We can't prevent the (bitcast (load X)) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+; We can't prevent the (store (bitcast X), Y) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v8f16_to_v8f16(<8 x half>* %src, <8 x half>* %dst) nounwind {
+entry:
+ %0 = load volatile <8 x half>* %src
+ %1 = bitcast <8 x half> %0 to <8 x half>
+ store <8 x half> %1, <8 x half>* %dst
+ ret void
+}
+
+; LITENDIAN: v8f16_to_v8f16:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: st.h [[R1]],
+; LITENDIAN: .size v8f16_to_v8f16
+
+; BIGENDIAN: v8f16_to_v8f16:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: st.h [[R1]],
+; BIGENDIAN: .size v8f16_to_v8f16
+
+; We can't prevent the (bitcast (load X)) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v8f16_to_v4i32(<8 x half>* %src, <4 x i32>* %dst) nounwind {
+entry:
+ %0 = load volatile <8 x half>* %src
+ %1 = bitcast <8 x half> %0 to <4 x i32>
+ %2 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %1, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* %dst
+ ret void
+}
+
+; LITENDIAN: v8f16_to_v4i32:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.w [[R2]],
+; LITENDIAN: .size v8f16_to_v4i32
+
+; BIGENDIAN: v8f16_to_v4i32:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: shf.h [[R2:\$w[0-9]+]], [[R1]], 177
+; BIGENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.w [[R3]],
+; BIGENDIAN: .size v8f16_to_v4i32
+
+; We can't prevent the (bitcast (load X)) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v8f16_to_v4f32(<8 x half>* %src, <4 x float>* %dst) nounwind {
+entry:
+ %0 = load volatile <8 x half>* %src
+ %1 = bitcast <8 x half> %0 to <4 x float>
+ %2 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %1, <4 x float> %1)
+ store <4 x float> %2, <4 x float>* %dst
+ ret void
+}
+
+; LITENDIAN: v8f16_to_v4f32:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.w [[R2]],
+; LITENDIAN: .size v8f16_to_v4f32
+
+; BIGENDIAN: v8f16_to_v4f32:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: shf.h [[R2:\$w[0-9]+]], [[R1]], 177
+; BIGENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.w [[R3]],
+; BIGENDIAN: .size v8f16_to_v4f32
+
+; We can't prevent the (bitcast (load X)) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v8f16_to_v2i64(<8 x half>* %src, <2 x i64>* %dst) nounwind {
+entry:
+ %0 = load volatile <8 x half>* %src
+ %1 = bitcast <8 x half> %0 to <2 x i64>
+ %2 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %1, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* %dst
+ ret void
+}
+
+; LITENDIAN: v8f16_to_v2i64:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.d [[R2]],
+; LITENDIAN: .size v8f16_to_v2i64
+
+; BIGENDIAN: v8f16_to_v2i64:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: shf.h [[R2:\$w[0-9]+]], [[R1]], 27
+; BIGENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.d [[R3]],
+; BIGENDIAN: .size v8f16_to_v2i64
+
+; We can't prevent the (bitcast (load X)) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v8f16_to_v2f64(<8 x half>* %src, <2 x double>* %dst) nounwind {
+entry:
+ %0 = load volatile <8 x half>* %src
+ %1 = bitcast <8 x half> %0 to <2 x double>
+ %2 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %1, <2 x double> %1)
+ store <2 x double> %2, <2 x double>* %dst
+ ret void
+}
+
+; LITENDIAN: v8f16_to_v2f64:
+; LITENDIAN: ld.h [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.d [[R2]],
+; LITENDIAN: .size v8f16_to_v2f64
+
+; BIGENDIAN: v8f16_to_v2f64:
+; BIGENDIAN: ld.h [[R1:\$w[0-9]+]],
+; BIGENDIAN: shf.h [[R2:\$w[0-9]+]], [[R1]], 27
+; BIGENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.d [[R3]],
+; BIGENDIAN: .size v8f16_to_v2f64
+;----
+
+define void @v4i32_to_v16i8(<4 x i32>* %src, <16 x i8>* %dst) nounwind {
+entry:
+ %0 = load volatile <4 x i32>* %src
+ %1 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %0)
+ %2 = bitcast <4 x i32> %1 to <16 x i8>
+ %3 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %2, <16 x i8> %2)
+ store <16 x i8> %3, <16 x i8>* %dst
+ ret void
+}
+
+; LITENDIAN: v4i32_to_v16i8:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.b [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.b [[R3]],
+; LITENDIAN: .size v4i32_to_v16i8
+
+; BIGENDIAN: v4i32_to_v16i8:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: addv.b [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.b [[R4]],
+; BIGENDIAN: .size v4i32_to_v16i8
+
+define void @v4i32_to_v8i16(<4 x i32>* %src, <8 x i16>* %dst) nounwind {
+entry:
+ %0 = load volatile <4 x i32>* %src
+ %1 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %0)
+ %2 = bitcast <4 x i32> %1 to <8 x i16>
+ %3 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %2, <8 x i16> %2)
+ store <8 x i16> %3, <8 x i16>* %dst
+ ret void
+}
+
+; LITENDIAN: v4i32_to_v8i16:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.h [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.h [[R3]],
+; LITENDIAN: .size v4i32_to_v8i16
+
+; BIGENDIAN: v4i32_to_v8i16:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.h [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: addv.h [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.h [[R4]],
+; BIGENDIAN: .size v4i32_to_v8i16
+
+; We can't prevent the (store (bitcast X), Y) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v4i32_to_v8f16(<4 x i32>* %src, <8 x half>* %dst) nounwind {
+entry:
+ %0 = load volatile <4 x i32>* %src
+ %1 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %0)
+ %2 = bitcast <4 x i32> %1 to <8 x half>
+ store <8 x half> %2, <8 x half>* %dst
+ ret void
+}
+
+; LITENDIAN: v4i32_to_v8f16:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.w [[R2]],
+; LITENDIAN: .size v4i32_to_v8f16
+
+; BIGENDIAN: v4i32_to_v8f16:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: st.w [[R2]],
+; BIGENDIAN: .size v4i32_to_v8f16
+
+define void @v4i32_to_v4i32(<4 x i32>* %src, <4 x i32>* %dst) nounwind {
+entry:
+ %0 = load volatile <4 x i32>* %src
+ %1 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %0)
+ %2 = bitcast <4 x i32> %1 to <4 x i32>
+ %3 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %2, <4 x i32> %2)
+ store <4 x i32> %3, <4 x i32>* %dst
+ ret void
+}
+
+; LITENDIAN: v4i32_to_v4i32:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v4i32_to_v4i32
+
+; BIGENDIAN: v4i32_to_v4i32:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.w [[R3]],
+; BIGENDIAN: .size v4i32_to_v4i32
+
+define void @v4i32_to_v4f32(<4 x i32>* %src, <4 x float>* %dst) nounwind {
+entry:
+ %0 = load volatile <4 x i32>* %src
+ %1 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %0)
+ %2 = bitcast <4 x i32> %1 to <4 x float>
+ %3 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %2, <4 x float> %2)
+ store <4 x float> %3, <4 x float>* %dst
+ ret void
+}
+
+; LITENDIAN: v4i32_to_v4f32:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v4i32_to_v4f32
+
+; BIGENDIAN: v4i32_to_v4f32:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.w [[R3]],
+; BIGENDIAN: .size v4i32_to_v4f32
+
+define void @v4i32_to_v2i64(<4 x i32>* %src, <2 x i64>* %dst) nounwind {
+entry:
+ %0 = load volatile <4 x i32>* %src
+ %1 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %0)
+ %2 = bitcast <4 x i32> %1 to <2 x i64>
+ %3 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %2, <2 x i64> %2)
+ store <2 x i64> %3, <2 x i64>* %dst
+ ret void
+}
+
+; LITENDIAN: v4i32_to_v2i64:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v4i32_to_v2i64
+
+; BIGENDIAN: v4i32_to_v2i64:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: addv.d [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.d [[R4]],
+; BIGENDIAN: .size v4i32_to_v2i64
+
+define void @v4i32_to_v2f64(<4 x i32>* %src, <2 x double>* %dst) nounwind {
+entry:
+ %0 = load volatile <4 x i32>* %src
+ %1 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %0)
+ %2 = bitcast <4 x i32> %1 to <2 x double>
+ %3 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %2, <2 x double> %2)
+ store <2 x double> %3, <2 x double>* %dst
+ ret void
+}
+
+; LITENDIAN: v4i32_to_v2f64:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v4i32_to_v2f64
+
+; BIGENDIAN: v4i32_to_v2f64:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: fadd.d [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.d [[R4]],
+; BIGENDIAN: .size v4i32_to_v2f64
+
+define void @v4f32_to_v16i8(<4 x float>* %src, <16 x i8>* %dst) nounwind {
+entry:
+ %0 = load volatile <4 x float>* %src
+ %1 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %0, <4 x float> %0)
+ %2 = bitcast <4 x float> %1 to <16 x i8>
+ %3 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %2, <16 x i8> %2)
+ store <16 x i8> %3, <16 x i8>* %dst
+ ret void
+}
+
+; LITENDIAN: v4f32_to_v16i8:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.b [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.b [[R3]],
+; LITENDIAN: .size v4f32_to_v16i8
+
+; BIGENDIAN: v4f32_to_v16i8:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: addv.b [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.b [[R4]],
+; BIGENDIAN: .size v4f32_to_v16i8
+
+define void @v4f32_to_v8i16(<4 x float>* %src, <8 x i16>* %dst) nounwind {
+entry:
+ %0 = load volatile <4 x float>* %src
+ %1 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %0, <4 x float> %0)
+ %2 = bitcast <4 x float> %1 to <8 x i16>
+ %3 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %2, <8 x i16> %2)
+ store <8 x i16> %3, <8 x i16>* %dst
+ ret void
+}
+
+; LITENDIAN: v4f32_to_v8i16:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.h [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.h [[R3]],
+; LITENDIAN: .size v4f32_to_v8i16
+
+; BIGENDIAN: v4f32_to_v8i16:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.h [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: addv.h [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.h [[R4]],
+; BIGENDIAN: .size v4f32_to_v8i16
+
+; We can't prevent the (store (bitcast X), Y) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
+define void @v4f32_to_v8f16(<4 x float>* %src, <8 x half>* %dst) nounwind {
+entry:
+ %0 = load volatile <4 x float>* %src
+ %1 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %0, <4 x float> %0)
+ %2 = bitcast <4 x float> %1 to <8 x half>
+ store <8 x half> %2, <8 x half>* %dst
+ ret void
+}
+
+; LITENDIAN: v4f32_to_v8f16:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.w [[R2]],
+; LITENDIAN: .size v4f32_to_v8f16
+
+; BIGENDIAN: v4f32_to_v8f16:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: st.w [[R2]],
+; BIGENDIAN: .size v4f32_to_v8f16
+
+define void @v4f32_to_v4i32(<4 x float>* %src, <4 x i32>* %dst) nounwind {
+entry:
+ %0 = load volatile <4 x float>* %src
+ %1 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %0, <4 x float> %0)
+ %2 = bitcast <4 x float> %1 to <4 x i32>
+ %3 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %2, <4 x i32> %2)
+ store <4 x i32> %3, <4 x i32>* %dst
+ ret void
+}
+
+; LITENDIAN: v4f32_to_v4i32:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v4f32_to_v4i32
+
+; BIGENDIAN: v4f32_to_v4i32:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.w [[R3]],
+; BIGENDIAN: .size v4f32_to_v4i32
+
+define void @v4f32_to_v4f32(<4 x float>* %src, <4 x float>* %dst) nounwind {
+entry:
+ %0 = load volatile <4 x float>* %src
+ %1 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %0, <4 x float> %0)
+ %2 = bitcast <4 x float> %1 to <4 x float>
+ %3 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %2, <4 x float> %2)
+ store <4 x float> %3, <4 x float>* %dst
+ ret void
+}
+
+; LITENDIAN: v4f32_to_v4f32:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v4f32_to_v4f32
+
+; BIGENDIAN: v4f32_to_v4f32:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.w [[R3]],
+; BIGENDIAN: .size v4f32_to_v4f32
+
+define void @v4f32_to_v2i64(<4 x float>* %src, <2 x i64>* %dst) nounwind {
+entry:
+ %0 = load volatile <4 x float>* %src
+ %1 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %0, <4 x float> %0)
+ %2 = bitcast <4 x float> %1 to <2 x i64>
+ %3 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %2, <2 x i64> %2)
+ store <2 x i64> %3, <2 x i64>* %dst
+ ret void
+}
+
+; LITENDIAN: v4f32_to_v2i64:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v4f32_to_v2i64
+
+; BIGENDIAN: v4f32_to_v2i64:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: addv.d [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.d [[R4]],
+; BIGENDIAN: .size v4f32_to_v2i64
+
+define void @v4f32_to_v2f64(<4 x float>* %src, <2 x double>* %dst) nounwind {
+entry:
+ %0 = load volatile <4 x float>* %src
+ %1 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %0, <4 x float> %0)
+ %2 = bitcast <4 x float> %1 to <2 x double>
+ %3 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %2, <2 x double> %2)
+ store <2 x double> %3, <2 x double>* %dst
+ ret void
+}
+
+; LITENDIAN: v4f32_to_v2f64:
+; LITENDIAN: ld.w [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v4f32_to_v2f64
+
+; BIGENDIAN: v4f32_to_v2f64:
+; BIGENDIAN: ld.w [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: fadd.d [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.d [[R4]],
+; BIGENDIAN: .size v4f32_to_v2f64
+
+define void @v2i64_to_v16i8(<2 x i64>* %src, <16 x i8>* %dst) nounwind {
+entry:
+ %0 = load volatile <2 x i64>* %src
+ %1 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0, <2 x i64> %0)
+ %2 = bitcast <2 x i64> %1 to <16 x i8>
+ %3 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %2, <16 x i8> %2)
+ store <16 x i8> %3, <16 x i8>* %dst
+ ret void
+}
+
+; LITENDIAN: v2i64_to_v16i8:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.b [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.b [[R3]],
+; LITENDIAN: .size v2i64_to_v16i8
+
+; BIGENDIAN: v2i64_to_v16i8:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R3]], 177
+; BIGENDIAN: addv.b [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.b [[R4]],
+; BIGENDIAN: .size v2i64_to_v16i8
+
+define void @v2i64_to_v8i16(<2 x i64>* %src, <8 x i16>* %dst) nounwind {
+entry:
+ %0 = load volatile <2 x i64>* %src
+ %1 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0, <2 x i64> %0)
+ %2 = bitcast <2 x i64> %1 to <8 x i16>
+ %3 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %2, <8 x i16> %2)
+ store <8 x i16> %3, <8 x i16>* %dst
+ ret void
+}
+
+; LITENDIAN: v2i64_to_v8i16:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.h [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.h [[R3]],
+; LITENDIAN: .size v2i64_to_v8i16
+
+; BIGENDIAN: v2i64_to_v8i16:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.h [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: addv.h [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.h [[R4]],
+; BIGENDIAN: .size v2i64_to_v8i16
+
+; We can't prevent the (store (bitcast X), Y) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
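+; Consequently the bitcast is folded into the store, and the big-endian checks
+; below match the same ld.d/addv.d/st.d sequence as little-endian, with no shf
+; reordering.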
+define void @v2i64_to_v8f16(<2 x i64>* %src, <8 x half>* %dst) nounwind {
+entry:
+ %0 = load volatile <2 x i64>* %src
+ %1 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0, <2 x i64> %0)
+ %2 = bitcast <2 x i64> %1 to <8 x half>
+ store <8 x half> %2, <8 x half>* %dst
+ ret void
+}
+
+; LITENDIAN: v2i64_to_v8f16:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.d [[R2]],
+; LITENDIAN: .size v2i64_to_v8f16
+
+; BIGENDIAN: v2i64_to_v8f16:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: st.d [[R2]],
+; BIGENDIAN: .size v2i64_to_v8f16
+
+define void @v2i64_to_v4i32(<2 x i64>* %src, <4 x i32>* %dst) nounwind {
+entry:
+ %0 = load volatile <2 x i64>* %src
+ %1 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0, <2 x i64> %0)
+ %2 = bitcast <2 x i64> %1 to <4 x i32>
+ %3 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %2, <4 x i32> %2)
+ store <4 x i32> %3, <4 x i32>* %dst
+ ret void
+}
+
+; LITENDIAN: v2i64_to_v4i32:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v2i64_to_v4i32
+
+; BIGENDIAN: v2i64_to_v4i32:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: addv.w [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.w [[R4]],
+; BIGENDIAN: .size v2i64_to_v4i32
+
+define void @v2i64_to_v4f32(<2 x i64>* %src, <4 x float>* %dst) nounwind {
+entry:
+ %0 = load volatile <2 x i64>* %src
+ %1 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0, <2 x i64> %0)
+ %2 = bitcast <2 x i64> %1 to <4 x float>
+ %3 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %2, <4 x float> %2)
+ store <4 x float> %3, <4 x float>* %dst
+ ret void
+}
+
+; LITENDIAN: v2i64_to_v4f32:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v2i64_to_v4f32
+
+; BIGENDIAN: v2i64_to_v4f32:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: fadd.w [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.w [[R4]],
+; BIGENDIAN: .size v2i64_to_v4f32
+
+define void @v2i64_to_v2i64(<2 x i64>* %src, <2 x i64>* %dst) nounwind {
+entry:
+ %0 = load volatile <2 x i64>* %src
+ %1 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0, <2 x i64> %0)
+ %2 = bitcast <2 x i64> %1 to <2 x i64>
+ %3 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %2, <2 x i64> %2)
+ store <2 x i64> %3, <2 x i64>* %dst
+ ret void
+}
+
+; LITENDIAN: v2i64_to_v2i64:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v2i64_to_v2i64
+
+; BIGENDIAN: v2i64_to_v2i64:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.d [[R3]],
+; BIGENDIAN: .size v2i64_to_v2i64
+
+define void @v2i64_to_v2f64(<2 x i64>* %src, <2 x double>* %dst) nounwind {
+entry:
+ %0 = load volatile <2 x i64>* %src
+ %1 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0, <2 x i64> %0)
+ %2 = bitcast <2 x i64> %1 to <2 x double>
+ %3 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %2, <2 x double> %2)
+ store <2 x double> %3, <2 x double>* %dst
+ ret void
+}
+
+; LITENDIAN: v2i64_to_v2f64:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v2i64_to_v2f64
+
+; BIGENDIAN: v2i64_to_v2f64:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.d [[R3]],
+; BIGENDIAN: .size v2i64_to_v2f64
+
+define void @v2f64_to_v16i8(<2 x double>* %src, <16 x i8>* %dst) nounwind {
+entry:
+ %0 = load volatile <2 x double>* %src
+ %1 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %0, <2 x double> %0)
+ %2 = bitcast <2 x double> %1 to <16 x i8>
+ %3 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %2, <16 x i8> %2)
+ store <16 x i8> %3, <16 x i8>* %dst
+ ret void
+}
+
+; LITENDIAN: v2f64_to_v16i8:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.b [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.b [[R3]],
+; LITENDIAN: .size v2f64_to_v16i8
+
+; BIGENDIAN: v2f64_to_v16i8:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R3]], 177
+; BIGENDIAN: addv.b [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.b [[R4]],
+; BIGENDIAN: .size v2f64_to_v16i8
+
+define void @v2f64_to_v8i16(<2 x double>* %src, <8 x i16>* %dst) nounwind {
+entry:
+ %0 = load volatile <2 x double>* %src
+ %1 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %0, <2 x double> %0)
+ %2 = bitcast <2 x double> %1 to <8 x i16>
+ %3 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %2, <8 x i16> %2)
+ store <8 x i16> %3, <8 x i16>* %dst
+ ret void
+}
+
+; LITENDIAN: v2f64_to_v8i16:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.h [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.h [[R3]],
+; LITENDIAN: .size v2f64_to_v8i16
+
+; BIGENDIAN: v2f64_to_v8i16:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.h [[R3:\$w[0-9]+]], [[R2]], 27
+; BIGENDIAN: addv.h [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.h [[R4]],
+; BIGENDIAN: .size v2f64_to_v8i16
+
+; We can't prevent the (store (bitcast X), Y) DAG Combine here because there
+; are no operations for v8f16 to put in the way.
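+; As above, the bitcast is folded into the store, so the big-endian checks
+; below match the little-endian ld.d/fadd.d/st.d sequence with no shf
+; reordering.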
+define void @v2f64_to_v8f16(<2 x double>* %src, <8 x half>* %dst) nounwind {
+entry:
+ %0 = load volatile <2 x double>* %src
+ %1 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %0, <2 x double> %0)
+ %2 = bitcast <2 x double> %1 to <8 x half>
+ store <8 x half> %2, <8 x half>* %dst
+ ret void
+}
+
+; LITENDIAN: v2f64_to_v8f16:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: st.d [[R2]],
+; LITENDIAN: .size v2f64_to_v8f16
+
+; BIGENDIAN: v2f64_to_v8f16:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: st.d [[R2]],
+; BIGENDIAN: .size v2f64_to_v8f16
+
+define void @v2f64_to_v4i32(<2 x double>* %src, <4 x i32>* %dst) nounwind {
+entry:
+ %0 = load volatile <2 x double>* %src
+ %1 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %0, <2 x double> %0)
+ %2 = bitcast <2 x double> %1 to <4 x i32>
+ %3 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %2, <4 x i32> %2)
+ store <4 x i32> %3, <4 x i32>* %dst
+ ret void
+}
+
+; LITENDIAN: v2f64_to_v4i32:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v2f64_to_v4i32
+
+; BIGENDIAN: v2f64_to_v4i32:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: addv.w [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.w [[R4]],
+; BIGENDIAN: .size v2f64_to_v4i32
+
+define void @v2f64_to_v4f32(<2 x double>* %src, <4 x float>* %dst) nounwind {
+entry:
+ %0 = load volatile <2 x double>* %src
+ %1 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %0, <2 x double> %0)
+ %2 = bitcast <2 x double> %1 to <4 x float>
+ %3 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %2, <4 x float> %2)
+ store <4 x float> %3, <4 x float>* %dst
+ ret void
+}
+
+; LITENDIAN: v2f64_to_v4f32:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.w [[R3]],
+; LITENDIAN: .size v2f64_to_v4f32
+
+; BIGENDIAN: v2f64_to_v4f32:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R2]], 177
+; BIGENDIAN: fadd.w [[R4:\$w[0-9]+]], [[R3]], [[R3]]
+; BIGENDIAN: st.w [[R4]],
+; BIGENDIAN: .size v2f64_to_v4f32
+
+define void @v2f64_to_v2i64(<2 x double>* %src, <2 x i64>* %dst) nounwind {
+entry:
+ %0 = load volatile <2 x double>* %src
+ %1 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %0, <2 x double> %0)
+ %2 = bitcast <2 x double> %1 to <2 x i64>
+ %3 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %2, <2 x i64> %2)
+ store <2 x i64> %3, <2 x i64>* %dst
+ ret void
+}
+
+; LITENDIAN: v2f64_to_v2i64:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v2f64_to_v2i64
+
+; BIGENDIAN: v2f64_to_v2i64:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.d [[R3]],
+; BIGENDIAN: .size v2f64_to_v2i64
+
+define void @v2f64_to_v2f64(<2 x double>* %src, <2 x double>* %dst) nounwind {
+entry:
+ %0 = load volatile <2 x double>* %src
+ %1 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %0, <2 x double> %0)
+ %2 = bitcast <2 x double> %1 to <2 x double>
+ %3 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %2, <2 x double> %2)
+ store <2 x double> %3, <2 x double>* %dst
+ ret void
+}
+
+; LITENDIAN: v2f64_to_v2f64:
+; LITENDIAN: ld.d [[R1:\$w[0-9]+]],
+; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; LITENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; LITENDIAN: st.d [[R3]],
+; LITENDIAN: .size v2f64_to_v2f64
+
+; BIGENDIAN: v2f64_to_v2f64:
+; BIGENDIAN: ld.d [[R1:\$w[0-9]+]],
+; BIGENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]]
+; BIGENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]]
+; BIGENDIAN: st.d [[R3]],
+; BIGENDIAN: .size v2f64_to_v2f64
+
+declare <16 x i8> @llvm.mips.addv.b(<16 x i8>, <16 x i8>) nounwind
+declare <8 x i16> @llvm.mips.addv.h(<8 x i16>, <8 x i16>) nounwind
+declare <4 x i32> @llvm.mips.addv.w(<4 x i32>, <4 x i32>) nounwind
+declare <2 x i64> @llvm.mips.addv.d(<2 x i64>, <2 x i64>) nounwind
+declare <4 x float> @llvm.mips.fadd.w(<4 x float>, <4 x float>) nounwind
+declare <2 x double> @llvm.mips.fadd.d(<2 x double>, <2 x double>) nounwind
diff --git a/test/CodeGen/Mips/msa/bitwise.ll b/test/CodeGen/Mips/msa/bitwise.ll
new file mode 100644
index 0000000..9a88c47
--- /dev/null
+++ b/test/CodeGen/Mips/msa/bitwise.ll
@@ -0,0 +1,1639 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+define void @and_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: and_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = and <16 x i8> %1, %2
+ ; CHECK-DAG: and.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size and_v16i8
+}
+
+define void @and_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: and_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = and <8 x i16> %1, %2
+ ; CHECK-DAG: and.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size and_v8i16
+}
+
+define void @and_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: and_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = and <4 x i32> %1, %2
+ ; CHECK-DAG: and.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size and_v4i32
+}
+
+define void @and_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: and_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = and <2 x i64> %1, %2
+ ; CHECK-DAG: and.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size and_v2i64
+}
+
+define void @and_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: and_v16i8_i:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = and <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ; CHECK-DAG: andi.b [[R4:\$w[0-9]+]], [[R1]], 1
+ store <16 x i8> %2, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size and_v16i8_i
+}
+
+define void @and_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: and_v8i16_i:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = and <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ; CHECK-DAG: ldi.h [[R3:\$w[0-9]+]], 1
+ ; CHECK-DAG: and.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+ store <8 x i16> %2, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size and_v8i16_i
+}
+
+define void @and_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: and_v4i32_i:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = and <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 1
+ ; CHECK-DAG: and.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+ store <4 x i32> %2, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size and_v4i32_i
+}
+
+define void @and_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: and_v2i64_i:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = and <2 x i64> %1, <i64 1, i64 1>
+ ; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 1
+ ; CHECK-DAG: and.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+ store <2 x i64> %2, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size and_v2i64_i
+}
+
+define void @or_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: or_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = or <16 x i8> %1, %2
+ ; CHECK-DAG: or.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size or_v16i8
+}
+
+define void @or_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: or_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = or <8 x i16> %1, %2
+ ; CHECK-DAG: or.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size or_v8i16
+}
+
+define void @or_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: or_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = or <4 x i32> %1, %2
+ ; CHECK-DAG: or.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size or_v4i32
+}
+
+define void @or_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: or_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = or <2 x i64> %1, %2
+ ; CHECK-DAG: or.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size or_v2i64
+}
+
+define void @or_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: or_v16i8_i:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = or <16 x i8> %1, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ ; CHECK-DAG: ori.b [[R4:\$w[0-9]+]], [[R1]], 3
+ store <16 x i8> %2, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size or_v16i8_i
+}
+
+define void @or_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: or_v8i16_i:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = or <8 x i16> %1, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ ; CHECK-DAG: ldi.h [[R3:\$w[0-9]+]], 3
+ ; CHECK-DAG: or.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+ store <8 x i16> %2, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size or_v8i16_i
+}
+
+define void @or_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: or_v4i32_i:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = or <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
+ ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 3
+ ; CHECK-DAG: or.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+ store <4 x i32> %2, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size or_v4i32_i
+}
+
+define void @or_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: or_v2i64_i:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = or <2 x i64> %1, <i64 3, i64 3>
+ ; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 3
+ ; CHECK-DAG: or.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+ store <2 x i64> %2, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size or_v2i64_i
+}
+
+define void @nor_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: nor_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = or <16 x i8> %1, %2
+ %4 = xor <16 x i8> %3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ ; CHECK-DAG: nor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %4, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size nor_v16i8
+}
+
+define void @nor_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: nor_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = or <8 x i16> %1, %2
+ %4 = xor <8 x i16> %3, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ ; CHECK-DAG: nor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %4, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size nor_v8i16
+}
+
+define void @nor_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: nor_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = or <4 x i32> %1, %2
+ %4 = xor <4 x i32> %3, <i32 -1, i32 -1, i32 -1, i32 -1>
+ ; CHECK-DAG: nor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size nor_v4i32
+}
+
+define void @nor_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: nor_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = or <2 x i64> %1, %2
+ %4 = xor <2 x i64> %3, <i64 -1, i64 -1>
+ ; CHECK-DAG: nor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size nor_v2i64
+}
+
+define void @nor_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: nor_v16i8_i:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = or <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %3 = xor <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ ; CHECK-DAG: ori.b [[R4:\$w[0-9]+]], [[R1]], 1
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size nor_v16i8_i
+}
+
+define void @nor_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: nor_v8i16_i:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = or <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %3 = xor <8 x i16> %2, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ ; CHECK-DAG: ldi.h [[R3:\$w[0-9]+]], 1
+ ; CHECK-DAG: nor.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size nor_v8i16_i
+}
+
+define void @nor_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: nor_v4i32_i:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = or <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+ %3 = xor <4 x i32> %2, <i32 -1, i32 -1, i32 -1, i32 -1>
+ ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 1
+ ; CHECK-DAG: nor.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size nor_v4i32_i
+}
+
+define void @nor_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: nor_v2i64_i:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = or <2 x i64> %1, <i64 1, i64 1>
+ %3 = xor <2 x i64> %2, <i64 -1, i64 -1>
+ ; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 1
+ ; CHECK-DAG: nor.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size nor_v2i64_i
+}
+
+define void @xor_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: xor_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = xor <16 x i8> %1, %2
+ ; CHECK-DAG: xor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size xor_v16i8
+}
+
+define void @xor_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: xor_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = xor <8 x i16> %1, %2
+ ; CHECK-DAG: xor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size xor_v8i16
+}
+
+define void @xor_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: xor_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = xor <4 x i32> %1, %2
+ ; CHECK-DAG: xor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size xor_v4i32
+}
+
+define void @xor_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: xor_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = xor <2 x i64> %1, %2
+ ; CHECK-DAG: xor.v [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size xor_v2i64
+}
+
+define void @xor_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: xor_v16i8_i:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = xor <16 x i8> %1, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ ; CHECK-DAG: xori.b [[R4:\$w[0-9]+]], [[R1]], 3
+ store <16 x i8> %2, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size xor_v16i8_i
+}
+
+define void @xor_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: xor_v8i16_i:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = xor <8 x i16> %1, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+ ; CHECK-DAG: ldi.h [[R3:\$w[0-9]+]], 3
+ ; CHECK-DAG: xor.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+ store <8 x i16> %2, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size xor_v8i16_i
+}
+
+define void @xor_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: xor_v4i32_i:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = xor <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
+ ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 3
+ ; CHECK-DAG: xor.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+ store <4 x i32> %2, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size xor_v4i32_i
+}
+
+define void @xor_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: xor_v2i64_i:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = xor <2 x i64> %1, <i64 3, i64 3>
+ ; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 3
+ ; CHECK-DAG: xor.v [[R4:\$w[0-9]+]], [[R1]], [[R3]]
+ store <2 x i64> %2, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size xor_v2i64_i
+}
+
+define void @sll_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: sll_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = shl <16 x i8> %1, %2
+ ; CHECK-DAG: sll.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size sll_v16i8
+}
+
+define void @sll_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: sll_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = shl <8 x i16> %1, %2
+ ; CHECK-DAG: sll.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size sll_v8i16
+}
+
+define void @sll_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: sll_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = shl <4 x i32> %1, %2
+ ; CHECK-DAG: sll.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size sll_v4i32
+}
+
+define void @sll_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: sll_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = shl <2 x i64> %1, %2
+ ; CHECK-DAG: sll.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size sll_v2i64
+}
+
+define void @sll_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: sll_v16i8_i:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = shl <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ; CHECK-DAG: slli.b [[R4:\$w[0-9]+]], [[R1]], 1
+ store <16 x i8> %2, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size sll_v16i8_i
+}
+
+define void @sll_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: sll_v8i16_i:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = shl <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ; CHECK-DAG: slli.h [[R4:\$w[0-9]+]], [[R1]], 1
+ store <8 x i16> %2, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size sll_v8i16_i
+}
+
+define void @sll_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: sll_v4i32_i:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = shl <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK-DAG: slli.w [[R4:\$w[0-9]+]], [[R1]], 1
+ store <4 x i32> %2, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size sll_v4i32_i
+}
+
+define void @sll_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: sll_v2i64_i:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = shl <2 x i64> %1, <i64 1, i64 1>
+ ; CHECK-DAG: slli.d [[R4:\$w[0-9]+]], [[R1]], 1
+ store <2 x i64> %2, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size sll_v2i64_i
+}
+
+define void @sra_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: sra_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = ashr <16 x i8> %1, %2
+ ; CHECK-DAG: sra.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size sra_v16i8
+}
+
+define void @sra_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: sra_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = ashr <8 x i16> %1, %2
+ ; CHECK-DAG: sra.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size sra_v8i16
+}
+
+define void @sra_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: sra_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = ashr <4 x i32> %1, %2
+ ; CHECK-DAG: sra.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size sra_v4i32
+}
+
+define void @sra_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: sra_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = ashr <2 x i64> %1, %2
+ ; CHECK-DAG: sra.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size sra_v2i64
+}
+
+define void @sra_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: sra_v16i8_i:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = ashr <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ; CHECK-DAG: srai.b [[R4:\$w[0-9]+]], [[R1]], 1
+ store <16 x i8> %2, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size sra_v16i8_i
+}
+
+define void @sra_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: sra_v8i16_i:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = ashr <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ; CHECK-DAG: srai.h [[R4:\$w[0-9]+]], [[R1]], 1
+ store <8 x i16> %2, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size sra_v8i16_i
+}
+
+define void @sra_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: sra_v4i32_i:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = ashr <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK-DAG: srai.w [[R4:\$w[0-9]+]], [[R1]], 1
+ store <4 x i32> %2, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size sra_v4i32_i
+}
+
+define void @sra_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: sra_v2i64_i:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = ashr <2 x i64> %1, <i64 1, i64 1>
+ ; CHECK-DAG: srai.d [[R4:\$w[0-9]+]], [[R1]], 1
+ store <2 x i64> %2, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size sra_v2i64_i
+}
+
+define void @srl_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: srl_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = lshr <16 x i8> %1, %2
+ ; CHECK-DAG: srl.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size srl_v16i8
+}
+
+define void @srl_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: srl_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = lshr <8 x i16> %1, %2
+ ; CHECK-DAG: srl.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size srl_v8i16
+}
+
+define void @srl_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: srl_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = lshr <4 x i32> %1, %2
+ ; CHECK-DAG: srl.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size srl_v4i32
+}
+
+define void @srl_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: srl_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = lshr <2 x i64> %1, %2
+ ; CHECK-DAG: srl.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size srl_v2i64
+}
+
+define void @srl_v16i8_i(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: srl_v16i8_i:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = lshr <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ; CHECK-DAG: srli.b [[R4:\$w[0-9]+]], [[R1]], 1
+ store <16 x i8> %2, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size srl_v16i8_i
+}
+
+define void @srl_v8i16_i(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: srl_v8i16_i:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = lshr <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ; CHECK-DAG: srli.h [[R4:\$w[0-9]+]], [[R1]], 1
+ store <8 x i16> %2, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size srl_v8i16_i
+}
+
+define void @srl_v4i32_i(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: srl_v4i32_i:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = lshr <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK-DAG: srli.w [[R4:\$w[0-9]+]], [[R1]], 1
+ store <4 x i32> %2, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size srl_v4i32_i
+}
+
+define void @srl_v2i64_i(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: srl_v2i64_i:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = lshr <2 x i64> %1, <i64 1, i64 1>
+ ; CHECK-DAG: srli.d [[R4:\$w[0-9]+]], [[R1]], 1
+ store <2 x i64> %2, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size srl_v2i64_i
+}
+
+define void @ctpop_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: ctpop_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = tail call <16 x i8> @llvm.ctpop.v16i8 (<16 x i8> %1)
+ ; CHECK-DAG: pcnt.b [[R3:\$w[0-9]+]], [[R1]]
+ store <16 x i8> %2, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ctpop_v16i8
+}
+
+define void @ctpop_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: ctpop_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = tail call <8 x i16> @llvm.ctpop.v8i16 (<8 x i16> %1)
+ ; CHECK-DAG: pcnt.h [[R3:\$w[0-9]+]], [[R1]]
+ store <8 x i16> %2, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ctpop_v8i16
+}
+
+define void @ctpop_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: ctpop_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = tail call <4 x i32> @llvm.ctpop.v4i32 (<4 x i32> %1)
+ ; CHECK-DAG: pcnt.w [[R3:\$w[0-9]+]], [[R1]]
+ store <4 x i32> %2, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ctpop_v4i32
+}
+
+define void @ctpop_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: ctpop_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = tail call <2 x i64> @llvm.ctpop.v2i64 (<2 x i64> %1)
+ ; CHECK-DAG: pcnt.d [[R3:\$w[0-9]+]], [[R1]]
+ store <2 x i64> %2, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ctpop_v2i64
+}
+
+define void @ctlz_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: ctlz_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = tail call <16 x i8> @llvm.ctlz.v16i8 (<16 x i8> %1)
+ ; CHECK-DAG: nlzc.b [[R3:\$w[0-9]+]], [[R1]]
+ store <16 x i8> %2, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ctlz_v16i8
+}
+
+define void @ctlz_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: ctlz_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = tail call <8 x i16> @llvm.ctlz.v8i16 (<8 x i16> %1)
+ ; CHECK-DAG: nlzc.h [[R3:\$w[0-9]+]], [[R1]]
+ store <8 x i16> %2, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ctlz_v8i16
+}
+
+define void @ctlz_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: ctlz_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = tail call <4 x i32> @llvm.ctlz.v4i32 (<4 x i32> %1)
+ ; CHECK-DAG: nlzc.w [[R3:\$w[0-9]+]], [[R1]]
+ store <4 x i32> %2, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ctlz_v4i32
+}
+
+define void @ctlz_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: ctlz_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = tail call <2 x i64> @llvm.ctlz.v2i64 (<2 x i64> %1)
+ ; CHECK-DAG: nlzc.d [[R3:\$w[0-9]+]], [[R1]]
+ store <2 x i64> %2, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ctlz_v2i64
+}
+
+define void @bsel_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b, <16 x i8>* %m) nounwind {
+ ; CHECK: bsel_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = load <16 x i8>* %m
+ ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0($7)
+ %4 = xor <16 x i8> %3, <i8 -1, i8 -1, i8 -1, i8 -1,
+ i8 -1, i8 -1, i8 -1, i8 -1,
+ i8 -1, i8 -1, i8 -1, i8 -1,
+ i8 -1, i8 -1, i8 -1, i8 -1>
+ %5 = and <16 x i8> %1, %3
+ %6 = and <16 x i8> %2, %4
+ %7 = or <16 x i8> %5, %6
+ ; bmnz.v performs the same bitwise selection as bsel.v (only the operand
+ ; arrangement differs), so matching bmnz.v here is equivalent.
+ ; CHECK-DAG: bmnz.v [[R1]], [[R2]], [[R3]]
+ store <16 x i8> %7, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R1]], 0($4)
+
+ ret void
+ ; CHECK: .size bsel_v16i8
+}
+
+define void @bsel_v16i8_i(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %m) nounwind {
+ ; CHECK: bsel_v16i8_i:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %m
+ ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0($6)
+ %3 = xor <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1,
+ i8 -1, i8 -1, i8 -1, i8 -1,
+ i8 -1, i8 -1, i8 -1, i8 -1,
+ i8 -1, i8 -1, i8 -1, i8 -1>
+ %4 = and <16 x i8> %1, %3
+ %5 = and <16 x i8> <i8 6, i8 6, i8 6, i8 6,
+ i8 6, i8 6, i8 6, i8 6,
+ i8 6, i8 6, i8 6, i8 6,
+ i8 6, i8 6, i8 6, i8 6>, %2
+ %6 = or <16 x i8> %4, %5
+ ; CHECK-DAG: bseli.b [[R3]], [[R1]], 6
+ store <16 x i8> %6, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bsel_v16i8_i
+}
+
+define void @bsel_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: bsel_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = and <8 x i16> %1, <i16 6, i16 6, i16 6, i16 6,
+ i16 6, i16 6, i16 6, i16 6>
+ %4 = and <8 x i16> %2, <i16 65529, i16 65529, i16 65529, i16 65529,
+ i16 65529, i16 65529, i16 65529, i16 65529>
+ %5 = or <8 x i16> %3, %4
+ ; CHECK-DAG: ldi.h [[R3:\$w[0-9]+]], 6
+ ; CHECK-DAG: bsel.v [[R3]], [[R2]], [[R1]]
+ store <8 x i16> %5, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bsel_v8i16
+}
+
+define void @bsel_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: bsel_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = and <4 x i32> %1, <i32 6, i32 6, i32 6, i32 6>
+ %4 = and <4 x i32> %2, <i32 4294967289, i32 4294967289, i32 4294967289, i32 4294967289>
+ %5 = or <4 x i32> %3, %4
+ ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 6
+ ; CHECK-DAG: bsel.v [[R3]], [[R2]], [[R1]]
+ store <4 x i32> %5, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bsel_v4i32
+}
+
+define void @bsel_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: bsel_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = and <2 x i64> %1, <i64 6, i64 6>
+ %4 = and <2 x i64> %2, <i64 18446744073709551609, i64 18446744073709551609>
+ %5 = or <2 x i64> %3, %4
+ ; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 6
+ ; CHECK-DAG: bsel.v [[R3]], [[R2]], [[R1]]
+ store <2 x i64> %5, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bsel_v2i64
+}
+
+define void @binsl_v16i8_i(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: binsl_v16i8_i:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = and <16 x i8> %1, <i8 192, i8 192, i8 192, i8 192,
+ i8 192, i8 192, i8 192, i8 192,
+ i8 192, i8 192, i8 192, i8 192,
+ i8 192, i8 192, i8 192, i8 192>
+ %4 = and <16 x i8> %2, <i8 63, i8 63, i8 63, i8 63,
+ i8 63, i8 63, i8 63, i8 63,
+ i8 63, i8 63, i8 63, i8 63,
+ i8 63, i8 63, i8 63, i8 63>
+ %5 = or <16 x i8> %3, %4
+ ; CHECK-DAG: binsli.b [[R2]], [[R1]], 2
+ store <16 x i8> %5, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R2]], 0($4)
+
+ ret void
+ ; CHECK: .size binsl_v16i8_i
+}
+
+define void @binsl_v8i16_i(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: binsl_v8i16_i:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = and <8 x i16> %1, <i16 49152, i16 49152, i16 49152, i16 49152,
+ i16 49152, i16 49152, i16 49152, i16 49152>
+ %4 = and <8 x i16> %2, <i16 16383, i16 16383, i16 16383, i16 16383,
+ i16 16383, i16 16383, i16 16383, i16 16383>
+ %5 = or <8 x i16> %3, %4
+ ; CHECK-DAG: binsli.h [[R2]], [[R1]], 2
+ store <8 x i16> %5, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R2]], 0($4)
+
+ ret void
+ ; CHECK: .size binsl_v8i16_i
+}
+
+define void @binsl_v4i32_i(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: binsl_v4i32_i:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = and <4 x i32> %1, <i32 3221225472, i32 3221225472, i32 3221225472, i32 3221225472>
+ %4 = and <4 x i32> %2, <i32 1073741823, i32 1073741823, i32 1073741823, i32 1073741823>
+ %5 = or <4 x i32> %3, %4
+ ; CHECK-DAG: binsli.w [[R2]], [[R1]], 2
+ store <4 x i32> %5, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R2]], 0($4)
+
+ ret void
+ ; CHECK: .size binsl_v4i32_i
+}
+
+define void @binsl_v2i64_i(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: binsl_v2i64_i:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = and <2 x i64> %1, <i64 18446744073709551608, i64 18446744073709551608>
+ %4 = and <2 x i64> %2, <i64 7, i64 7>
+ %5 = or <2 x i64> %3, %4
+ ; TODO: We use a particularly wide mask here to work around a legalization
+ ; issue. If the mask doesn't fit within a 10-bit immediate, it gets
+ ; legalized into a constant pool. We should add a test to cover the
+ ; other cases once they correctly select binsli.d.
+ ; CHECK-DAG: binsli.d [[R2]], [[R1]], 61
+ store <2 x i64> %5, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R2]], 0($4)
+
+ ret void
+ ; CHECK: .size binsl_v2i64_i
+}
+
+define void @binsr_v16i8_i(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: binsr_v16i8_i:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = and <16 x i8> %1, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3,
+ i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+ %4 = and <16 x i8> %2, <i8 252, i8 252, i8 252, i8 252,
+ i8 252, i8 252, i8 252, i8 252,
+ i8 252, i8 252, i8 252, i8 252,
+ i8 252, i8 252, i8 252, i8 252>
+ %5 = or <16 x i8> %3, %4
+ ; CHECK-DAG: binsri.b [[R2]], [[R1]], 2
+ store <16 x i8> %5, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R2]], 0($4)
+
+ ret void
+ ; CHECK: .size binsr_v16i8_i
+}
+
+define void @binsr_v8i16_i(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: binsr_v8i16_i:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = and <8 x i16> %1, <i16 3, i16 3, i16 3, i16 3,
+ i16 3, i16 3, i16 3, i16 3>
+ %4 = and <8 x i16> %2, <i16 65532, i16 65532, i16 65532, i16 65532,
+ i16 65532, i16 65532, i16 65532, i16 65532>
+ %5 = or <8 x i16> %3, %4
+ ; CHECK-DAG: binsri.h [[R2]], [[R1]], 2
+ store <8 x i16> %5, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R2]], 0($4)
+
+ ret void
+ ; CHECK: .size binsr_v8i16_i
+}
+
+define void @binsr_v4i32_i(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: binsr_v4i32_i:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = and <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
+ %4 = and <4 x i32> %2, <i32 4294967292, i32 4294967292, i32 4294967292, i32 4294967292>
+ %5 = or <4 x i32> %3, %4
+ ; CHECK-DAG: binsri.w [[R2]], [[R1]], 2
+ store <4 x i32> %5, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R2]], 0($4)
+
+ ret void
+ ; CHECK: .size binsr_v4i32_i
+}
+
+define void @binsr_v2i64_i(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: binsr_v2i64_i:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = and <2 x i64> %1, <i64 3, i64 3>
+ %4 = and <2 x i64> %2, <i64 18446744073709551612, i64 18446744073709551612>
+ %5 = or <2 x i64> %3, %4
+ ; CHECK-DAG: binsri.d [[R2]], [[R1]], 2
+ store <2 x i64> %5, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R2]], 0($4)
+
+ ret void
+ ; CHECK: .size binsr_v2i64_i
+}
+
+define void @bclr_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: bclr_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = shl <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %2
+ %4 = xor <16 x i8> %3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %5 = and <16 x i8> %1, %4
+ ; CHECK-DAG: bclr.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %5, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bclr_v16i8
+}
+
+define void @bclr_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: bclr_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %2
+ %4 = xor <8 x i16> %3, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ %5 = and <8 x i16> %1, %4
+ ; CHECK-DAG: bclr.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %5, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bclr_v8i16
+}
+
+define void @bclr_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: bclr_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %2
+ %4 = xor <4 x i32> %3, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %5 = and <4 x i32> %1, %4
+ ; CHECK-DAG: bclr.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %5, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bclr_v4i32
+}
+
+define void @bclr_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: bclr_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = shl <2 x i64> <i64 1, i64 1>, %2
+ %4 = xor <2 x i64> %3, <i64 -1, i64 -1>
+ %5 = and <2 x i64> %1, %4
+ ; CHECK-DAG: bclr.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %5, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bclr_v2i64
+}
+
+define void @bset_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: bset_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = shl <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %2
+ %4 = or <16 x i8> %1, %3
+ ; CHECK-DAG: bset.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %4, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bset_v16i8
+}
+
+define void @bset_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: bset_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %2
+ %4 = or <8 x i16> %1, %3
+ ; CHECK-DAG: bset.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %4, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bset_v8i16
+}
+
+define void @bset_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: bset_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %2
+ %4 = or <4 x i32> %1, %3
+ ; CHECK-DAG: bset.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bset_v4i32
+}
+
+define void @bset_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: bset_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = shl <2 x i64> <i64 1, i64 1>, %2
+ %4 = or <2 x i64> %1, %3
+ ; CHECK-DAG: bset.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bset_v2i64
+}
+
+define void @bneg_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: bneg_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = shl <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %2
+ %4 = xor <16 x i8> %1, %3
+ ; CHECK-DAG: bneg.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %4, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bneg_v16i8
+}
+
+define void @bneg_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: bneg_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %2
+ %4 = xor <8 x i16> %1, %3
+ ; CHECK-DAG: bneg.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %4, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bneg_v8i16
+}
+
+define void @bneg_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: bneg_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %2
+ %4 = xor <4 x i32> %1, %3
+ ; CHECK-DAG: bneg.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bneg_v4i32
+}
+
+define void @bneg_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: bneg_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = shl <2 x i64> <i64 1, i64 1>, %2
+ %4 = xor <2 x i64> %1, %3
+ ; CHECK-DAG: bneg.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bneg_v2i64
+}
+
+define void @bclri_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: bclri_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = xor <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>,
+ <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %3 = and <16 x i8> %1, %2
+ ; bclri.b and andi.b are exactly equivalent.
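+ ; Clearing bit 3 is the same as ANDing with ~0x08, i.e. 0xf7 (247), which is
+ ; why andi.b with 247 is accepted below.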
+ ; CHECK-DAG: andi.b [[R3:\$w[0-9]+]], [[R1]], 247
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bclri_v16i8
+}
+
+define void @bclri_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: bclri_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = xor <8 x i16> <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>,
+ <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ %3 = and <8 x i16> %1, %2
+ ; CHECK-DAG: bclri.h [[R3:\$w[0-9]+]], [[R1]], 3
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bclri_v8i16
+}
+
+define void @bclri_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: bclri_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = xor <4 x i32> <i32 8, i32 8, i32 8, i32 8>,
+ <i32 -1, i32 -1, i32 -1, i32 -1>
+ %3 = and <4 x i32> %1, %2
+ ; CHECK-DAG: bclri.w [[R3:\$w[0-9]+]], [[R1]], 3
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bclri_v4i32
+}
+
+define void @bclri_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: bclri_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = xor <2 x i64> <i64 8, i64 8>,
+ <i64 -1, i64 -1>
+ %3 = and <2 x i64> %1, %2
+ ; CHECK-DAG: bclri.d [[R3:\$w[0-9]+]], [[R1]], 3
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bclri_v2i64
+}
+
+define void @bseti_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: bseti_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = or <16 x i8> %1, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+ ; CHECK-DAG: bseti.b [[R3:\$w[0-9]+]], [[R1]], 3
+ store <16 x i8> %2, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bseti_v16i8
+}
+
+define void @bseti_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: bseti_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = or <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ ; CHECK-DAG: bseti.h [[R3:\$w[0-9]+]], [[R1]], 3
+ store <8 x i16> %2, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bseti_v8i16
+}
+
+define void @bseti_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: bseti_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = or <4 x i32> %1, <i32 8, i32 8, i32 8, i32 8>
+ ; CHECK-DAG: bseti.w [[R3:\$w[0-9]+]], [[R1]], 3
+ store <4 x i32> %2, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bseti_v4i32
+}
+
+define void @bseti_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: bseti_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = or <2 x i64> %1, <i64 8, i64 8>
+ ; CHECK-DAG: bseti.d [[R3:\$w[0-9]+]], [[R1]], 3
+ store <2 x i64> %2, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bseti_v2i64
+}
+
+define void @bnegi_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: bnegi_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = xor <16 x i8> %1, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+ ; CHECK-DAG: bnegi.b [[R3:\$w[0-9]+]], [[R1]], 3
+ store <16 x i8> %2, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bnegi_v16i8
+}
+
+define void @bnegi_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: bnegi_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = xor <8 x i16> %1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+ ; CHECK-DAG: bnegi.h [[R3:\$w[0-9]+]], [[R1]], 3
+ store <8 x i16> %2, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bnegi_v8i16
+}
+
+define void @bnegi_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: bnegi_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = xor <4 x i32> %1, <i32 8, i32 8, i32 8, i32 8>
+ ; CHECK-DAG: bnegi.w [[R3:\$w[0-9]+]], [[R1]], 3
+ store <4 x i32> %2, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bnegi_v4i32
+}
+
+define void @bnegi_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: bnegi_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = xor <2 x i64> %1, <i64 8, i64 8>
+ ; CHECK-DAG: bnegi.d [[R3:\$w[0-9]+]], [[R1]], 3
+ store <2 x i64> %2, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bnegi_v2i64
+}
+
+declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %val)
+declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %val)
+declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val)
+declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val)
+declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %val)
+declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %val)
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val)
+declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %val)
diff --git a/test/CodeGen/Mips/msa/compare.ll b/test/CodeGen/Mips/msa/compare.ll
new file mode 100644
index 0000000..6408d7b
--- /dev/null
+++ b/test/CodeGen/Mips/msa/compare.ll
@@ -0,0 +1,2079 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+define void @ceq_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: ceq_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp eq <16 x i8> %1, %2
+ %4 = sext <16 x i1> %3 to <16 x i8>
+ ; CHECK-DAG: ceq.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %4, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ceq_v16i8
+}
+
+define void @ceq_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: ceq_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp eq <8 x i16> %1, %2
+ %4 = sext <8 x i1> %3 to <8 x i16>
+ ; CHECK-DAG: ceq.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %4, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ceq_v8i16
+}
+
+define void @ceq_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: ceq_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp eq <4 x i32> %1, %2
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ ; CHECK-DAG: ceq.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ceq_v4i32
+}
+
+define void @ceq_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: ceq_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp eq <2 x i64> %1, %2
+ %4 = sext <2 x i1> %3 to <2 x i64>
+ ; CHECK-DAG: ceq.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ceq_v2i64
+}
+
+define void @cle_s_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: cle_s_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp sle <16 x i8> %1, %2
+ %4 = sext <16 x i1> %3 to <16 x i8>
+ ; CHECK-DAG: cle_s.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %4, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size cle_s_v16i8
+}
+
+define void @cle_s_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: cle_s_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp sle <8 x i16> %1, %2
+ %4 = sext <8 x i1> %3 to <8 x i16>
+ ; CHECK-DAG: cle_s.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %4, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size cle_s_v8i16
+}
+
+define void @cle_s_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: cle_s_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp sle <4 x i32> %1, %2
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ ; CHECK-DAG: cle_s.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size cle_s_v4i32
+}
+
+define void @cle_s_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: cle_s_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp sle <2 x i64> %1, %2
+ %4 = sext <2 x i1> %3 to <2 x i64>
+ ; CHECK-DAG: cle_s.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size cle_s_v2i64
+}
+
+define void @cle_u_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: cle_u_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ule <16 x i8> %1, %2
+ %4 = sext <16 x i1> %3 to <16 x i8>
+ ; CHECK-DAG: cle_u.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %4, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size cle_u_v16i8
+}
+
+define void @cle_u_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: cle_u_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ule <8 x i16> %1, %2
+ %4 = sext <8 x i1> %3 to <8 x i16>
+ ; CHECK-DAG: cle_u.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %4, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size cle_u_v8i16
+}
+
+define void @cle_u_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: cle_u_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ule <4 x i32> %1, %2
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ ; CHECK-DAG: cle_u.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size cle_u_v4i32
+}
+
+define void @cle_u_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: cle_u_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ule <2 x i64> %1, %2
+ %4 = sext <2 x i1> %3 to <2 x i64>
+ ; CHECK-DAG: cle_u.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size cle_u_v2i64
+}
+
+define void @clt_s_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: clt_s_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp slt <16 x i8> %1, %2
+ %4 = sext <16 x i1> %3 to <16 x i8>
+ ; CHECK-DAG: clt_s.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %4, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clt_s_v16i8
+}
+
+define void @clt_s_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: clt_s_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp slt <8 x i16> %1, %2
+ %4 = sext <8 x i1> %3 to <8 x i16>
+ ; CHECK-DAG: clt_s.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %4, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clt_s_v8i16
+}
+
+define void @clt_s_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: clt_s_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp slt <4 x i32> %1, %2
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ ; CHECK-DAG: clt_s.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clt_s_v4i32
+}
+
+define void @clt_s_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: clt_s_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp slt <2 x i64> %1, %2
+ %4 = sext <2 x i1> %3 to <2 x i64>
+ ; CHECK-DAG: clt_s.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clt_s_v2i64
+}
+
+define void @clt_u_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: clt_u_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ult <16 x i8> %1, %2
+ %4 = sext <16 x i1> %3 to <16 x i8>
+ ; CHECK-DAG: clt_u.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %4, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clt_u_v16i8
+}
+
+define void @clt_u_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: clt_u_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ult <8 x i16> %1, %2
+ %4 = sext <8 x i1> %3 to <8 x i16>
+ ; CHECK-DAG: clt_u.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %4, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clt_u_v8i16
+}
+
+define void @clt_u_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: clt_u_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ult <4 x i32> %1, %2
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ ; CHECK-DAG: clt_u.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clt_u_v4i32
+}
+
+define void @clt_u_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: clt_u_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ult <2 x i64> %1, %2
+ %4 = sext <2 x i1> %3 to <2 x i64>
+ ; CHECK-DAG: clt_u.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clt_u_v2i64
+}
+
+; There is no != comparison, but test it anyway since we've had legalizer
+; issues in this area.
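+; Since MSA has no vector 'not equal' compare, the legalizer is expected to
+; emit a ceq followed by a bitwise inversion of the result, as checked below.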
+define void @cne_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: cne_v16i8:
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ne <16 x i8> %1, %2
+ %4 = sext <16 x i1> %3 to <16 x i8>
+ ; CHECK-DAG: ceq.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ ; CHECK-DAG: xori.b [[R3]], [[R3]], 255
+ store <16 x i8> %4, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size cne_v16i8
+}
+
+; There is no != comparison, but test it anyway since we've had legalizer
+; issues in this area.
+define void @cne_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: cne_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ne <8 x i16> %1, %2
+ %4 = sext <8 x i1> %3 to <8 x i16>
+ ; CHECK-DAG: ceq.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ ; TODO: This should be an 'xori.b [[R3]], [[R3]], 255' but that's an optimisation issue
+ ; CHECK-DAG: ldi.b [[R4:\$w[0-9]+]], -1
+ ; CHECK-DAG: xor.v [[R3]], [[R3]], [[R4]]
+ store <8 x i16> %4, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size cne_v8i16
+}
+
+; There is no != comparison, but test it anyway since we've had legalizer
+; issues in this area.
+define void @cne_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: cne_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ne <4 x i32> %1, %2
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ ; CHECK-DAG: ceq.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ ; TODO: This should be an 'xori.b [[R3]], [[R3]], 255' but that's an optimisation issue
+ ; CHECK-DAG: ldi.b [[R4:\$w[0-9]+]], -1
+ ; CHECK-DAG: xor.v [[R3]], [[R3]], [[R4]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size cne_v4i32
+}
+
+; There is no != comparison, but test it anyway since we've had legalizer
+; issues in this area.
+define void @cne_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: cne_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ne <2 x i64> %1, %2
+ %4 = sext <2 x i1> %3 to <2 x i64>
+ ; CHECK-DAG: ceq.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ ; TODO: This should be an 'xori.b [[R3]], [[R3]], 255' but that's an optimisation issue
+ ; CHECK-DAG: ldi.b [[R4:\$w[0-9]+]], -1
+ ; CHECK-DAG: xor.v [[R3]], [[R3]], [[R4]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size cne_v2i64
+}
+
+define void @ceqi_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: ceqi_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp eq <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %3 = sext <16 x i1> %2 to <16 x i8>
+ ; CHECK-DAG: ceqi.b [[R3:\$w[0-9]+]], [[R1]], 1
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ceqi_v16i8
+}
+
+define void @ceqi_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: ceqi_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp eq <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %3 = sext <8 x i1> %2 to <8 x i16>
+ ; CHECK-DAG: ceqi.h [[R3:\$w[0-9]+]], [[R1]], 1
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ceqi_v8i16
+}
+
+define void @ceqi_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: ceqi_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp eq <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+ %3 = sext <4 x i1> %2 to <4 x i32>
+ ; CHECK-DAG: ceqi.w [[R3:\$w[0-9]+]], [[R1]], 1
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ceqi_v4i32
+}
+
+define void @ceqi_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: ceqi_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp eq <2 x i64> %1, <i64 1, i64 1>
+ %3 = sext <2 x i1> %2 to <2 x i64>
+ ; CHECK-DAG: ceqi.d [[R3:\$w[0-9]+]], [[R1]], 1
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ceqi_v2i64
+}
+
+define void @clei_s_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: clei_s_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp sle <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %3 = sext <16 x i1> %2 to <16 x i8>
+ ; CHECK-DAG: clei_s.b [[R3:\$w[0-9]+]], [[R1]], 1
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clei_s_v16i8
+}
+
+define void @clei_s_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: clei_s_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp sle <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %3 = sext <8 x i1> %2 to <8 x i16>
+ ; CHECK-DAG: clei_s.h [[R3:\$w[0-9]+]], [[R1]], 1
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clei_s_v8i16
+}
+
+define void @clei_s_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: clei_s_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp sle <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+ %3 = sext <4 x i1> %2 to <4 x i32>
+ ; CHECK-DAG: clei_s.w [[R3:\$w[0-9]+]], [[R1]], 1
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clei_s_v4i32
+}
+
+define void @clei_s_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: clei_s_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp sle <2 x i64> %1, <i64 1, i64 1>
+ %3 = sext <2 x i1> %2 to <2 x i64>
+ ; CHECK-DAG: clei_s.d [[R3:\$w[0-9]+]], [[R1]], 1
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clei_s_v2i64
+}
+
+define void @clei_u_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: clei_u_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp ule <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %3 = sext <16 x i1> %2 to <16 x i8>
+ ; CHECK-DAG: clei_u.b [[R3:\$w[0-9]+]], [[R1]], 1
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clei_u_v16i8
+}
+
+define void @clei_u_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: clei_u_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp ule <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %3 = sext <8 x i1> %2 to <8 x i16>
+ ; CHECK-DAG: clei_u.h [[R3:\$w[0-9]+]], [[R1]], 1
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clei_u_v8i16
+}
+
+define void @clei_u_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: clei_u_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp ule <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+ %3 = sext <4 x i1> %2 to <4 x i32>
+ ; CHECK-DAG: clei_u.w [[R3:\$w[0-9]+]], [[R1]], 1
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clei_u_v4i32
+}
+
+define void @clei_u_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: clei_u_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp ule <2 x i64> %1, <i64 1, i64 1>
+ %3 = sext <2 x i1> %2 to <2 x i64>
+ ; CHECK-DAG: clei_u.d [[R3:\$w[0-9]+]], [[R1]], 1
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clei_u_v2i64
+}
+
+define void @clti_s_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: clti_s_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp slt <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %3 = sext <16 x i1> %2 to <16 x i8>
+ ; CHECK-DAG: clti_s.b [[R3:\$w[0-9]+]], [[R1]], 1
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clti_s_v16i8
+}
+
+define void @clti_s_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: clti_s_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp slt <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %3 = sext <8 x i1> %2 to <8 x i16>
+ ; CHECK-DAG: clti_s.h [[R3:\$w[0-9]+]], [[R1]], 1
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clti_s_v8i16
+}
+
+define void @clti_s_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: clti_s_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp slt <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+ %3 = sext <4 x i1> %2 to <4 x i32>
+ ; CHECK-DAG: clti_s.w [[R3:\$w[0-9]+]], [[R1]], 1
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clti_s_v4i32
+}
+
+define void @clti_s_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: clti_s_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp slt <2 x i64> %1, <i64 1, i64 1>
+ %3 = sext <2 x i1> %2 to <2 x i64>
+ ; CHECK-DAG: clti_s.d [[R3:\$w[0-9]+]], [[R1]], 1
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clti_s_v2i64
+}
+
+define void @clti_u_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: clti_u_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp ult <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %3 = sext <16 x i1> %2 to <16 x i8>
+ ; CHECK-DAG: clti_u.b [[R3:\$w[0-9]+]], [[R1]], 1
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clti_u_v16i8
+}
+
+define void @clti_u_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: clti_u_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp ult <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %3 = sext <8 x i1> %2 to <8 x i16>
+ ; CHECK-DAG: clti_u.h [[R3:\$w[0-9]+]], [[R1]], 1
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clti_u_v8i16
+}
+
+define void @clti_u_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: clti_u_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp ult <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+ %3 = sext <4 x i1> %2 to <4 x i32>
+ ; CHECK-DAG: clti_u.w [[R3:\$w[0-9]+]], [[R1]], 1
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clti_u_v4i32
+}
+
+define void @clti_u_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: clti_u_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp ult <2 x i64> %1, <i64 1, i64 1>
+ %3 = sext <2 x i1> %2 to <2 x i64>
+ ; CHECK-DAG: clti_u.d [[R3:\$w[0-9]+]], [[R1]], 1
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size clti_u_v2i64
+}
+
+define void @bsel_s_v16i8(<16 x i8>* %d, <16 x i8>* %a, <16 x i8>* %b,
+ <16 x i8>* %c) nounwind {
+ ; CHECK: bsel_s_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = load <16 x i8>* %c
+ ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0($7)
+ %4 = icmp sgt <16 x i8> %1, %2
+ ; CHECK-DAG: clt_s.b [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+ %5 = select <16 x i1> %4, <16 x i8> %1, <16 x i8> %3
+ ; bmnz.v is the same operation as bsel.v, just with the mask taken from a different operand
+ ; CHECK-DAG: bmnz.v [[R3]], [[R1]], [[R4]]
+ store <16 x i8> %5, <16 x i8>* %d
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bsel_s_v16i8
+}
+
+define void @bsel_s_v8i16(<8 x i16>* %d, <8 x i16>* %a, <8 x i16>* %b,
+ <8 x i16>* %c) nounwind {
+ ; CHECK: bsel_s_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = load <8 x i16>* %c
+ ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0($7)
+ %4 = icmp sgt <8 x i16> %1, %2
+ ; CHECK-DAG: clt_s.h [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+ %5 = select <8 x i1> %4, <8 x i16> %1, <8 x i16> %3
+ ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+ store <8 x i16> %5, <8 x i16>* %d
+ ; CHECK-DAG: st.h [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size bsel_s_v8i16
+}
+
+define void @bsel_s_v4i32(<4 x i32>* %d, <4 x i32>* %a, <4 x i32>* %b,
+ <4 x i32>* %c) nounwind {
+ ; CHECK: bsel_s_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = load <4 x i32>* %c
+ ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0($7)
+ %4 = icmp sgt <4 x i32> %1, %2
+ ; CHECK-DAG: clt_s.w [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+ %5 = select <4 x i1> %4, <4 x i32> %1, <4 x i32> %3
+ ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+ store <4 x i32> %5, <4 x i32>* %d
+ ; CHECK-DAG: st.w [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size bsel_s_v4i32
+}
+
+define void @bsel_s_v2i64(<2 x i64>* %d, <2 x i64>* %a, <2 x i64>* %b,
+ <2 x i64>* %c) nounwind {
+ ; CHECK: bsel_s_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = load <2 x i64>* %c
+ ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0($7)
+ %4 = icmp sgt <2 x i64> %1, %2
+ ; CHECK-DAG: clt_s.d [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+ %5 = select <2 x i1> %4, <2 x i64> %1, <2 x i64> %3
+ ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+ store <2 x i64> %5, <2 x i64>* %d
+ ; CHECK-DAG: st.d [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size bsel_s_v2i64
+}
+
+define void @bsel_u_v16i8(<16 x i8>* %d, <16 x i8>* %a, <16 x i8>* %b,
+ <16 x i8>* %c) nounwind {
+ ; CHECK: bsel_u_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = load <16 x i8>* %c
+ ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0($7)
+ %4 = icmp ugt <16 x i8> %1, %2
+ ; CHECK-DAG: clt_u.b [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+ %5 = select <16 x i1> %4, <16 x i8> %1, <16 x i8> %3
+ ; bmnz.v is the same operation as bsel.v, just with the mask taken from a different operand
+ ; CHECK-DAG: bmnz.v [[R3]], [[R1]], [[R4]]
+ store <16 x i8> %5, <16 x i8>* %d
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size bsel_u_v16i8
+}
+
+define void @bsel_u_v8i16(<8 x i16>* %d, <8 x i16>* %a, <8 x i16>* %b,
+ <8 x i16>* %c) nounwind {
+ ; CHECK: bsel_u_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = load <8 x i16>* %c
+ ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0($7)
+ %4 = icmp ugt <8 x i16> %1, %2
+ ; CHECK-DAG: clt_u.h [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+ %5 = select <8 x i1> %4, <8 x i16> %1, <8 x i16> %3
+ ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+ store <8 x i16> %5, <8 x i16>* %d
+ ; CHECK-DAG: st.h [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size bsel_u_v8i16
+}
+
+define void @bsel_u_v4i32(<4 x i32>* %d, <4 x i32>* %a, <4 x i32>* %b,
+ <4 x i32>* %c) nounwind {
+ ; CHECK: bsel_u_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = load <4 x i32>* %c
+ ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0($7)
+ %4 = icmp ugt <4 x i32> %1, %2
+ ; CHECK-DAG: clt_u.w [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+ %5 = select <4 x i1> %4, <4 x i32> %1, <4 x i32> %3
+ ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+ store <4 x i32> %5, <4 x i32>* %d
+ ; CHECK-DAG: st.w [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size bsel_u_v4i32
+}
+
+define void @bsel_u_v2i64(<2 x i64>* %d, <2 x i64>* %a, <2 x i64>* %b,
+ <2 x i64>* %c) nounwind {
+ ; CHECK: bsel_u_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = load <2 x i64>* %c
+ ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0($7)
+ %4 = icmp ugt <2 x i64> %1, %2
+ ; CHECK-DAG: clt_u.d [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+ %5 = select <2 x i1> %4, <2 x i64> %1, <2 x i64> %3
+ ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+ store <2 x i64> %5, <2 x i64>* %d
+ ; CHECK-DAG: st.d [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size bsel_u_v2i64
+}
+
+define void @bseli_s_v16i8(<16 x i8>* %d, <16 x i8>* %a, <16 x i8>* %b,
+ <16 x i8>* %c) nounwind {
+ ; CHECK: bseli_s_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp sgt <16 x i8> %1, %2
+ ; CHECK-DAG: clt_s.b [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+ %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ; CHECK-DAG: bseli.b [[R4]], [[R1]], 1
+ store <16 x i8> %4, <16 x i8>* %d
+ ; CHECK-DAG: st.b [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size bseli_s_v16i8
+}
+
+define void @bseli_s_v8i16(<8 x i16>* %d, <8 x i16>* %a, <8 x i16>* %b,
+ <8 x i16>* %c) nounwind {
+ ; CHECK: bseli_s_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp sgt <8 x i16> %1, %2
+ ; CHECK-DAG: clt_s.h [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+ %4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ; CHECK-DAG: ldi.h [[R3:\$w[0-9]+]], 1
+ ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+ store <8 x i16> %4, <8 x i16>* %d
+ ; CHECK-DAG: st.h [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size bseli_s_v8i16
+}
+
+define void @bseli_s_v4i32(<4 x i32>* %d, <4 x i32>* %a, <4 x i32>* %b,
+ <4 x i32>* %c) nounwind {
+ ; CHECK: bseli_s_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp sgt <4 x i32> %1, %2
+ ; CHECK-DAG: clt_s.w [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+ %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 1
+ ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+ store <4 x i32> %4, <4 x i32>* %d
+ ; CHECK-DAG: st.w [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size bseli_s_v4i32
+}
+
+define void @bseli_s_v2i64(<2 x i64>* %d, <2 x i64>* %a, <2 x i64>* %b,
+ <2 x i64>* %c) nounwind {
+ ; CHECK: bseli_s_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp sgt <2 x i64> %1, %2
+ ; CHECK-DAG: clt_s.d [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+ %4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> <i64 1, i64 1>
+ ; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 1
+ ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+ store <2 x i64> %4, <2 x i64>* %d
+ ; CHECK-DAG: st.d [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size bseli_s_v2i64
+}
+
+define void @bseli_u_v16i8(<16 x i8>* %d, <16 x i8>* %a, <16 x i8>* %b,
+ <16 x i8>* %c) nounwind {
+ ; CHECK: bseli_u_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ugt <16 x i8> %1, %2
+ ; CHECK-DAG: clt_u.b [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+ %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ; CHECK-DAG: bseli.b [[R4]], [[R1]], 1
+ store <16 x i8> %4, <16 x i8>* %d
+ ; CHECK-DAG: st.b [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size bseli_u_v16i8
+}
+
+define void @bseli_u_v8i16(<8 x i16>* %d, <8 x i16>* %a, <8 x i16>* %b,
+ <8 x i16>* %c) nounwind {
+ ; CHECK: bseli_u_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ugt <8 x i16> %1, %2
+ ; CHECK-DAG: clt_u.h [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+ %4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ; CHECK-DAG: ldi.h [[R3:\$w[0-9]+]], 1
+ ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+ store <8 x i16> %4, <8 x i16>* %d
+ ; CHECK-DAG: st.h [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size bseli_u_v8i16
+}
+
+define void @bseli_u_v4i32(<4 x i32>* %d, <4 x i32>* %a, <4 x i32>* %b,
+ <4 x i32>* %c) nounwind {
+ ; CHECK: bseli_u_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ugt <4 x i32> %1, %2
+ ; CHECK-DAG: clt_u.w [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+ %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK-DAG: ldi.w [[R3:\$w[0-9]+]], 1
+ ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+ store <4 x i32> %4, <4 x i32>* %d
+ ; CHECK-DAG: st.w [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size bseli_u_v4i32
+}
+
+define void @bseli_u_v2i64(<2 x i64>* %d, <2 x i64>* %a, <2 x i64>* %b,
+ <2 x i64>* %c) nounwind {
+ ; CHECK: bseli_u_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ugt <2 x i64> %1, %2
+ ; CHECK-DAG: clt_u.d [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+ %4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> <i64 1, i64 1>
+ ; CHECK-DAG: ldi.d [[R3:\$w[0-9]+]], 1
+ ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+ store <2 x i64> %4, <2 x i64>* %d
+ ; CHECK-DAG: st.d [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size bseli_u_v2i64
+}
+
+define void @max_s_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: max_s_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp sgt <16 x i8> %1, %2
+ %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> %2
+ ; CHECK-DAG: max_s.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %4, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size max_s_v16i8
+}
+
+define void @max_s_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: max_s_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp sgt <8 x i16> %1, %2
+ %4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> %2
+ ; CHECK-DAG: max_s.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %4, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size max_s_v8i16
+}
+
+define void @max_s_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: max_s_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp sgt <4 x i32> %1, %2
+ %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> %2
+ ; CHECK-DAG: max_s.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size max_s_v4i32
+}
+
+define void @max_s_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: max_s_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp sgt <2 x i64> %1, %2
+ %4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> %2
+ ; CHECK-DAG: max_s.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size max_s_v2i64
+}
+
+define void @max_u_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: max_u_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ugt <16 x i8> %1, %2
+ %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> %2
+ ; CHECK-DAG: max_u.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %4, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size max_u_v16i8
+}
+
+define void @max_u_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: max_u_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ugt <8 x i16> %1, %2
+ %4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> %2
+ ; CHECK-DAG: max_u.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %4, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size max_u_v8i16
+}
+
+define void @max_u_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: max_u_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ugt <4 x i32> %1, %2
+ %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> %2
+ ; CHECK-DAG: max_u.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size max_u_v4i32
+}
+
+define void @max_u_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: max_u_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ugt <2 x i64> %1, %2
+ %4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> %2
+ ; CHECK-DAG: max_u.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size max_u_v2i64
+}
+
+define void @max_s_eq_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: max_s_eq_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp sge <16 x i8> %1, %2
+ %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> %2
+ ; CHECK-DAG: max_s.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %4, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size max_s_eq_v16i8
+}
+
+define void @max_s_eq_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: max_s_eq_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp sge <8 x i16> %1, %2
+ %4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> %2
+ ; CHECK-DAG: max_s.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %4, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size max_s_eq_v8i16
+}
+
+define void @max_s_eq_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: max_s_eq_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp sge <4 x i32> %1, %2
+ %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> %2
+ ; CHECK-DAG: max_s.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size max_s_eq_v4i32
+}
+
+define void @max_s_eq_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: max_s_eq_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp sge <2 x i64> %1, %2
+ %4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> %2
+ ; CHECK-DAG: max_s.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size max_s_eq_v2i64
+}
+
+define void @max_u_eq_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: max_u_eq_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp uge <16 x i8> %1, %2
+ %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> %2
+ ; CHECK-DAG: max_u.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %4, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size max_u_eq_v16i8
+}
+
+define void @max_u_eq_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: max_u_eq_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp uge <8 x i16> %1, %2
+ %4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> %2
+ ; CHECK-DAG: max_u.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %4, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size max_u_eq_v8i16
+}
+
+define void @max_u_eq_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: max_u_eq_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp uge <4 x i32> %1, %2
+ %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> %2
+ ; CHECK-DAG: max_u.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size max_u_eq_v4i32
+}
+
+define void @max_u_eq_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: max_u_eq_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp uge <2 x i64> %1, %2
+ %4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> %2
+ ; CHECK-DAG: max_u.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size max_u_eq_v2i64
+}
+
+define void @maxi_s_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: maxi_s_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp sgt <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ; CHECK-DAG: maxi_s.b [[R3:\$w[0-9]+]], [[R1]], 1
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size maxi_s_v16i8
+}
+
+define void @maxi_s_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: maxi_s_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp sgt <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ; CHECK-DAG: maxi_s.h [[R3:\$w[0-9]+]], [[R1]], 1
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size maxi_s_v8i16
+}
+
+define void @maxi_s_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: maxi_s_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp sgt <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+ %3 = select <4 x i1> %2, <4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK-DAG: maxi_s.w [[R3:\$w[0-9]+]], [[R1]], 1
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size maxi_s_v4i32
+}
+
+define void @maxi_s_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: maxi_s_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp sgt <2 x i64> %1, <i64 1, i64 1>
+ %3 = select <2 x i1> %2, <2 x i64> %1, <2 x i64> <i64 1, i64 1>
+ ; CHECK-DAG: maxi_s.d [[R3:\$w[0-9]+]], [[R1]], 1
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size maxi_s_v2i64
+}
+
+define void @maxi_u_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: maxi_u_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp ugt <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ; CHECK-DAG: maxi_u.b [[R3:\$w[0-9]+]], [[R1]], 1
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size maxi_u_v16i8
+}
+
+define void @maxi_u_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: maxi_u_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp ugt <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ; CHECK-DAG: maxi_u.h [[R3:\$w[0-9]+]], [[R1]], 1
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size maxi_u_v8i16
+}
+
+define void @maxi_u_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: maxi_u_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp ugt <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+ %3 = select <4 x i1> %2, <4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK-DAG: maxi_u.w [[R3:\$w[0-9]+]], [[R1]], 1
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size maxi_u_v4i32
+}
+
+define void @maxi_u_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: maxi_u_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp ugt <2 x i64> %1, <i64 1, i64 1>
+ %3 = select <2 x i1> %2, <2 x i64> %1, <2 x i64> <i64 1, i64 1>
+ ; CHECK-DAG: maxi_u.d [[R3:\$w[0-9]+]], [[R1]], 1
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size maxi_u_v2i64
+}
+
+define void @maxi_s_eq_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: maxi_s_eq_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp sge <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ; CHECK-DAG: maxi_s.b [[R3:\$w[0-9]+]], [[R1]], 1
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size maxi_s_eq_v16i8
+}
+
+define void @maxi_s_eq_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: maxi_s_eq_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp sge <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ; CHECK-DAG: maxi_s.h [[R3:\$w[0-9]+]], [[R1]], 1
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size maxi_s_eq_v8i16
+}
+
+define void @maxi_s_eq_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: maxi_s_eq_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp sge <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+ %3 = select <4 x i1> %2, <4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK-DAG: maxi_s.w [[R3:\$w[0-9]+]], [[R1]], 1
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size maxi_s_eq_v4i32
+}
+
+define void @maxi_s_eq_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: maxi_s_eq_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp sge <2 x i64> %1, <i64 1, i64 1>
+ %3 = select <2 x i1> %2, <2 x i64> %1, <2 x i64> <i64 1, i64 1>
+ ; CHECK-DAG: maxi_s.d [[R3:\$w[0-9]+]], [[R1]], 1
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size maxi_s_eq_v2i64
+}
+
+define void @maxi_u_eq_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: maxi_u_eq_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp uge <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ; CHECK-DAG: maxi_u.b [[R3:\$w[0-9]+]], [[R1]], 1
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size maxi_u_eq_v16i8
+}
+
+define void @maxi_u_eq_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: maxi_u_eq_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp uge <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ; CHECK-DAG: maxi_u.h [[R3:\$w[0-9]+]], [[R1]], 1
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size maxi_u_eq_v8i16
+}
+
+define void @maxi_u_eq_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: maxi_u_eq_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp uge <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+ %3 = select <4 x i1> %2, <4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK-DAG: maxi_u.w [[R3:\$w[0-9]+]], [[R1]], 1
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size maxi_u_eq_v4i32
+}
+
+define void @maxi_u_eq_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: maxi_u_eq_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp uge <2 x i64> %1, <i64 1, i64 1>
+ %3 = select <2 x i1> %2, <2 x i64> %1, <2 x i64> <i64 1, i64 1>
+ ; CHECK-DAG: maxi_u.d [[R3:\$w[0-9]+]], [[R1]], 1
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size maxi_u_eq_v2i64
+}
+
+define void @min_s_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: min_s_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp sle <16 x i8> %1, %2
+ %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> %2
+ ; CHECK-DAG: min_s.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %4, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size min_s_v16i8
+}
+
+define void @min_s_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: min_s_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp slt <8 x i16> %1, %2
+ %4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> %2
+ ; CHECK-DAG: min_s.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %4, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size min_s_v8i16
+}
+
+define void @min_s_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: min_s_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp slt <4 x i32> %1, %2
+ %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> %2
+ ; CHECK-DAG: min_s.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size min_s_v4i32
+}
+
+define void @min_s_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: min_s_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp slt <2 x i64> %1, %2
+ %4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> %2
+ ; CHECK-DAG: min_s.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size min_s_v2i64
+}
+
+define void @min_u_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: min_u_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ult <16 x i8> %1, %2
+ %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> %2
+ ; CHECK-DAG: min_u.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %4, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size min_u_v16i8
+}
+
+define void @min_u_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: min_u_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ult <8 x i16> %1, %2
+ %4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> %2
+ ; CHECK-DAG: min_u.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %4, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size min_u_v8i16
+}
+
+define void @min_u_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: min_u_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ult <4 x i32> %1, %2
+ %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> %2
+ ; CHECK-DAG: min_u.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size min_u_v4i32
+}
+
+define void @min_u_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: min_u_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ult <2 x i64> %1, %2
+ %4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> %2
+ ; CHECK-DAG: min_u.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size min_u_v2i64
+}
+
+define void @min_s_eq_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: min_s_eq_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp sle <16 x i8> %1, %2
+ %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> %2
+ ; CHECK-DAG: min_s.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %4, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size min_s_eq_v16i8
+}
+
+define void @min_s_eq_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: min_s_eq_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp sle <8 x i16> %1, %2
+ %4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> %2
+ ; CHECK-DAG: min_s.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %4, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size min_s_eq_v8i16
+}
+
+define void @min_s_eq_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: min_s_eq_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp sle <4 x i32> %1, %2
+ %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> %2
+ ; CHECK-DAG: min_s.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size min_s_eq_v4i32
+}
+
+define void @min_s_eq_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: min_s_eq_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp sle <2 x i64> %1, %2
+ %4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> %2
+ ; CHECK-DAG: min_s.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size min_s_eq_v2i64
+}
+
+define void @min_u_eq_v16i8(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: min_u_eq_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ule <16 x i8> %1, %2
+ %4 = select <16 x i1> %3, <16 x i8> %1, <16 x i8> %2
+ ; CHECK-DAG: min_u.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %4, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size min_u_eq_v16i8
+}
+
+define void @min_u_eq_v8i16(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: min_u_eq_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ule <8 x i16> %1, %2
+ %4 = select <8 x i1> %3, <8 x i16> %1, <8 x i16> %2
+ ; CHECK-DAG: min_u.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %4, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size min_u_eq_v8i16
+}
+
+define void @min_u_eq_v4i32(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: min_u_eq_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ule <4 x i32> %1, %2
+ %4 = select <4 x i1> %3, <4 x i32> %1, <4 x i32> %2
+ ; CHECK-DAG: min_u.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size min_u_eq_v4i32
+}
+
+define void @min_u_eq_v2i64(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: min_u_eq_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = icmp ule <2 x i64> %1, %2
+ %4 = select <2 x i1> %3, <2 x i64> %1, <2 x i64> %2
+ ; CHECK-DAG: min_u.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size min_u_eq_v2i64
+}
+
+define void @mini_s_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: mini_s_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp slt <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ; CHECK-DAG: mini_s.b [[R3:\$w[0-9]+]], [[R1]], 1
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mini_s_v16i8
+}
+
+define void @mini_s_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: mini_s_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp slt <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ; CHECK-DAG: mini_s.h [[R3:\$w[0-9]+]], [[R1]], 1
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mini_s_v8i16
+}
+
+define void @mini_s_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: mini_s_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp slt <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+ %3 = select <4 x i1> %2, <4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK-DAG: mini_s.w [[R3:\$w[0-9]+]], [[R1]], 1
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mini_s_v4i32
+}
+
+define void @mini_s_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: mini_s_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp slt <2 x i64> %1, <i64 1, i64 1>
+ %3 = select <2 x i1> %2, <2 x i64> %1, <2 x i64> <i64 1, i64 1>
+ ; CHECK-DAG: mini_s.d [[R3:\$w[0-9]+]], [[R1]], 1
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mini_s_v2i64
+}
+
+define void @mini_u_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: mini_u_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp ult <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ; CHECK-DAG: mini_u.b [[R3:\$w[0-9]+]], [[R1]], 1
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mini_u_v16i8
+}
+
+define void @mini_u_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: mini_u_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp ult <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ; CHECK-DAG: mini_u.h [[R3:\$w[0-9]+]], [[R1]], 1
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mini_u_v8i16
+}
+
+define void @mini_u_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: mini_u_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp ult <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+ %3 = select <4 x i1> %2, <4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK-DAG: mini_u.w [[R3:\$w[0-9]+]], [[R1]], 1
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mini_u_v4i32
+}
+
+define void @mini_u_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: mini_u_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp ult <2 x i64> %1, <i64 1, i64 1>
+ %3 = select <2 x i1> %2, <2 x i64> %1, <2 x i64> <i64 1, i64 1>
+ ; CHECK-DAG: mini_u.d [[R3:\$w[0-9]+]], [[R1]], 1
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mini_u_v2i64
+}
+
+define void @mini_s_eq_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: mini_s_eq_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp sle <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ; CHECK-DAG: mini_s.b [[R3:\$w[0-9]+]], [[R1]], 1
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mini_s_eq_v16i8
+}
+
+define void @mini_s_eq_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: mini_s_eq_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp sle <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ; CHECK-DAG: mini_s.h [[R3:\$w[0-9]+]], [[R1]], 1
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mini_s_eq_v8i16
+}
+
+define void @mini_s_eq_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: mini_s_eq_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp sle <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+ %3 = select <4 x i1> %2, <4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK-DAG: mini_s.w [[R3:\$w[0-9]+]], [[R1]], 1
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mini_s_eq_v4i32
+}
+
+define void @mini_s_eq_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: mini_s_eq_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp sle <2 x i64> %1, <i64 1, i64 1>
+ %3 = select <2 x i1> %2, <2 x i64> %1, <2 x i64> <i64 1, i64 1>
+ ; CHECK-DAG: mini_s.d [[R3:\$w[0-9]+]], [[R1]], 1
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mini_s_eq_v2i64
+}
+
+define void @mini_u_eq_v16i8(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: mini_u_eq_v16i8:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp ule <16 x i8> %1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ ; CHECK-DAG: mini_u.b [[R3:\$w[0-9]+]], [[R1]], 1
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mini_u_eq_v16i8
+}
+
+define void @mini_u_eq_v8i16(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: mini_u_eq_v8i16:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp ule <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ; CHECK-DAG: mini_u.h [[R3:\$w[0-9]+]], [[R1]], 1
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mini_u_eq_v8i16
+}
+
+define void @mini_u_eq_v4i32(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: mini_u_eq_v4i32:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp ule <4 x i32> %1, <i32 1, i32 1, i32 1, i32 1>
+ %3 = select <4 x i1> %2, <4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK-DAG: mini_u.w [[R3:\$w[0-9]+]], [[R1]], 1
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mini_u_eq_v4i32
+}
+
+define void @mini_u_eq_v2i64(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: mini_u_eq_v2i64:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = icmp ule <2 x i64> %1, <i64 1, i64 1>
+ %3 = select <2 x i1> %2, <2 x i64> %1, <2 x i64> <i64 1, i64 1>
+ ; CHECK-DAG: mini_u.d [[R3:\$w[0-9]+]], [[R1]], 1
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size mini_u_eq_v2i64
+}
diff --git a/test/CodeGen/Mips/msa/compare_float.ll b/test/CodeGen/Mips/msa/compare_float.ll
new file mode 100644
index 0000000..2fc61f8
--- /dev/null
+++ b/test/CodeGen/Mips/msa/compare_float.ll
@@ -0,0 +1,663 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+declare <4 x float> @llvm.mips.fmax.w(<4 x float>, <4 x float>) nounwind
+declare <2 x double> @llvm.mips.fmax.d(<2 x double>, <2 x double>) nounwind
+declare <4 x float> @llvm.mips.fmin.w(<4 x float>, <4 x float>) nounwind
+declare <2 x double> @llvm.mips.fmin.d(<2 x double>, <2 x double>) nounwind
+
+define void @false_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+ ; CHECK: false_v4f32:
+
+ %1 = load <4 x float>* %a
+ %2 = load <4 x float>* %b
+ %3 = fcmp false <4 x float> %1, %2
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ store <4 x i32> %4, <4 x i32>* %c
+ ret void
+
+ ; (setcc $a, $b, SETFALSE) is always folded, so we won't get fcaf:
+ ; CHECK-DAG: ldi.b [[R1:\$w[0-9]+]], 0
+ ; CHECK-DAG: st.w [[R1]], 0($4)
+ ; CHECK: .size false_v4f32
+}
+
+define void @false_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+ ; CHECK: false_v2f64:
+
+ %1 = load <2 x double>* %a
+ %2 = load <2 x double>* %b
+ %3 = fcmp false <2 x double> %1, %2
+ %4 = sext <2 x i1> %3 to <2 x i64>
+ store <2 x i64> %4, <2 x i64>* %c
+ ret void
+
+ ; FIXME: This code is correct, but poor. Ideally it would be similar to
+ ; the code in @false_v4f32
+ ; CHECK-DAG: ldi.b [[R1:\$w[0-9]+]], 0
+ ; CHECK-DAG: slli.d [[R3:\$w[0-9]+]], [[R1]], 63
+ ; CHECK-DAG: srai.d [[R4:\$w[0-9]+]], [[R3]], 63
+ ; CHECK-DAG: st.d [[R4]], 0($4)
+ ; CHECK: .size false_v2f64
+}
+
+define void @oeq_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+ ; CHECK: oeq_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x float>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp oeq <4 x float> %1, %2
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ ; CHECK-DAG: fceq.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size oeq_v4f32
+}
+
+define void @oeq_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+ ; CHECK: oeq_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x double>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp oeq <2 x double> %1, %2
+ %4 = sext <2 x i1> %3 to <2 x i64>
+ ; CHECK-DAG: fceq.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size oeq_v2f64
+}
+
+define void @oge_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+ ; CHECK: oge_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x float>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp oge <4 x float> %1, %2
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ ; CHECK-DAG: fcle.w [[R3:\$w[0-9]+]], [[R2]], [[R1]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size oge_v4f32
+}
+
+define void @oge_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+ ; CHECK: oge_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x double>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp oge <2 x double> %1, %2
+ %4 = sext <2 x i1> %3 to <2 x i64>
+ ; CHECK-DAG: fcle.d [[R3:\$w[0-9]+]], [[R2]], [[R1]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size oge_v2f64
+}
+
+define void @ogt_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+ ; CHECK: ogt_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x float>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp ogt <4 x float> %1, %2
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ ; CHECK-DAG: fclt.w [[R3:\$w[0-9]+]], [[R2]], [[R1]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ogt_v4f32
+}
+
+define void @ogt_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+ ; CHECK: ogt_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x double>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp ogt <2 x double> %1, %2
+ %4 = sext <2 x i1> %3 to <2 x i64>
+ ; CHECK-DAG: fclt.d [[R3:\$w[0-9]+]], [[R2]], [[R1]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ogt_v2f64
+}
+
+define void @ole_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+ ; CHECK: ole_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x float>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp ole <4 x float> %1, %2
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ ; CHECK-DAG: fcle.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ole_v4f32
+}
+
+define void @ole_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+ ; CHECK: ole_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x double>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp ole <2 x double> %1, %2
+ %4 = sext <2 x i1> %3 to <2 x i64>
+ ; CHECK-DAG: fcle.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ole_v2f64
+}
+
+define void @olt_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+ ; CHECK: olt_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x float>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp olt <4 x float> %1, %2
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ ; CHECK-DAG: fclt.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size olt_v4f32
+}
+
+define void @olt_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+ ; CHECK: olt_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x double>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp olt <2 x double> %1, %2
+ %4 = sext <2 x i1> %3 to <2 x i64>
+ ; CHECK-DAG: fclt.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size olt_v2f64
+}
+
+define void @one_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+ ; CHECK: one_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x float>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp one <4 x float> %1, %2
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ ; CHECK-DAG: fcne.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size one_v4f32
+}
+
+define void @one_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+ ; CHECK: one_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x double>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp one <2 x double> %1, %2
+ %4 = sext <2 x i1> %3 to <2 x i64>
+ ; CHECK-DAG: fcne.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size one_v2f64
+}
+
+define void @ord_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+ ; CHECK: ord_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x float>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp ord <4 x float> %1, %2
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ ; CHECK-DAG: fcor.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ord_v4f32
+}
+
+define void @ord_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+ ; CHECK: ord_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x double>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp ord <2 x double> %1, %2
+ %4 = sext <2 x i1> %3 to <2 x i64>
+ ; CHECK-DAG: fcor.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ord_v2f64
+}
+
+define void @ueq_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+ ; CHECK: ueq_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x float>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp ueq <4 x float> %1, %2
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ ; CHECK-DAG: fcueq.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ueq_v4f32
+}
+
+define void @ueq_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+ ; CHECK: ueq_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x double>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp ueq <2 x double> %1, %2
+ %4 = sext <2 x i1> %3 to <2 x i64>
+ ; CHECK-DAG: fcueq.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ueq_v2f64
+}
+
+define void @uge_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+ ; CHECK: uge_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x float>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp uge <4 x float> %1, %2
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ ; CHECK-DAG: fcule.w [[R3:\$w[0-9]+]], [[R2]], [[R1]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size uge_v4f32
+}
+
+define void @uge_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+ ; CHECK: uge_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x double>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp uge <2 x double> %1, %2
+ %4 = sext <2 x i1> %3 to <2 x i64>
+ ; CHECK-DAG: fcule.d [[R3:\$w[0-9]+]], [[R2]], [[R1]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size uge_v2f64
+}
+
+define void @ugt_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+ ; CHECK: ugt_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x float>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp ugt <4 x float> %1, %2
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ ; CHECK-DAG: fcult.w [[R3:\$w[0-9]+]], [[R2]], [[R1]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ugt_v4f32
+}
+
+define void @ugt_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+ ; CHECK: ugt_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x double>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp ugt <2 x double> %1, %2
+ %4 = sext <2 x i1> %3 to <2 x i64>
+ ; CHECK-DAG: fcult.d [[R3:\$w[0-9]+]], [[R2]], [[R1]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ugt_v2f64
+}
+
+define void @ule_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+ ; CHECK: ule_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x float>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp ule <4 x float> %1, %2
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ ; CHECK-DAG: fcule.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ule_v4f32
+}
+
+define void @ule_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+ ; CHECK: ule_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x double>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp ule <2 x double> %1, %2
+ %4 = sext <2 x i1> %3 to <2 x i64>
+ ; CHECK-DAG: fcule.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ule_v2f64
+}
+
+define void @ult_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+ ; CHECK: ult_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x float>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp ult <4 x float> %1, %2
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ ; CHECK-DAG: fcult.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ult_v4f32
+}
+
+define void @ult_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+ ; CHECK: ult_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x double>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp ult <2 x double> %1, %2
+ %4 = sext <2 x i1> %3 to <2 x i64>
+ ; CHECK-DAG: fcult.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ult_v2f64
+}
+
+define void @uno_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+ ; CHECK: uno_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x float>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp uno <4 x float> %1, %2
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ ; CHECK-DAG: fcun.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %4, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size uno_v4f32
+}
+
+define void @uno_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+ ; CHECK: uno_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x double>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp uno <2 x double> %1, %2
+ %4 = sext <2 x i1> %3 to <2 x i64>
+ ; CHECK-DAG: fcun.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %4, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size uno_v2f64
+}
+
+define void @true_v4f32(<4 x i32>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+ ; CHECK: true_v4f32:
+
+ %1 = load <4 x float>* %a
+ %2 = load <4 x float>* %b
+ %3 = fcmp true <4 x float> %1, %2
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ store <4 x i32> %4, <4 x i32>* %c
+ ret void
+
+ ; (setcc $a, $b, SETTRUE) is always folded, so we won't get a compare instruction:
+ ; CHECK-DAG: ldi.b [[R1:\$w[0-9]+]], -1
+ ; CHECK-DAG: st.w [[R1]], 0($4)
+ ; CHECK: .size true_v4f32
+}
+
+define void @true_v2f64(<2 x i64>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+ ; CHECK: true_v2f64:
+
+ %1 = load <2 x double>* %a
+ %2 = load <2 x double>* %b
+ %3 = fcmp true <2 x double> %1, %2
+ %4 = sext <2 x i1> %3 to <2 x i64>
+ store <2 x i64> %4, <2 x i64>* %c
+ ret void
+
+ ; FIXME: This code is correct, but poor. Ideally it would be similar to
+ ; the code in @true_v4f32
+ ; CHECK-DAG: ldi.d [[R1:\$w[0-9]+]], 1
+ ; CHECK-DAG: slli.d [[R3:\$w[0-9]+]], [[R1]], 63
+ ; CHECK-DAG: srai.d [[R4:\$w[0-9]+]], [[R3]], 63
+ ; CHECK-DAG: st.d [[R4]], 0($4)
+ ; CHECK: .size true_v2f64
+}
+
+define void @bsel_v4f32(<4 x float>* %d, <4 x float>* %a, <4 x float>* %b,
+ <4 x float>* %c) nounwind {
+ ; CHECK: bsel_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x float>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = load <4 x float>* %c
+ ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0($7)
+ %4 = fcmp ogt <4 x float> %1, %2
+ ; CHECK-DAG: fclt.w [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+ %5 = select <4 x i1> %4, <4 x float> %1, <4 x float> %3
+ ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+ store <4 x float> %5, <4 x float>* %d
+ ; CHECK-DAG: st.w [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size bsel_v4f32
+}
+
+define void @bsel_v2f64(<2 x double>* %d, <2 x double>* %a, <2 x double>* %b,
+ <2 x double>* %c) nounwind {
+ ; CHECK: bsel_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x double>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = load <2 x double>* %c
+ ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0($7)
+ %4 = fcmp ogt <2 x double> %1, %2
+ ; CHECK-DAG: fclt.d [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+ %5 = select <2 x i1> %4, <2 x double> %1, <2 x double> %3
+ ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3]]
+ store <2 x double> %5, <2 x double>* %d
+ ; CHECK-DAG: st.d [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size bsel_v2f64
+}
+
+define void @bseli_v4f32(<4 x float>* %d, <4 x float>* %a, <4 x float>* %b,
+ <4 x float>* %c) nounwind {
+ ; CHECK: bseli_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x float>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp ogt <4 x float> %1, %2
+ ; CHECK-DAG: fclt.w [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+ %4 = select <4 x i1> %3, <4 x float> %1, <4 x float> zeroinitializer
+ ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3:\$w[0-9]+]]
+ store <4 x float> %4, <4 x float>* %d
+ ; CHECK-DAG: st.w [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size bseli_v4f32
+}
+
+define void @bseli_v2f64(<2 x double>* %d, <2 x double>* %a, <2 x double>* %b,
+ <2 x double>* %c) nounwind {
+ ; CHECK: bseli_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x double>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = fcmp ogt <2 x double> %1, %2
+ ; CHECK-DAG: fclt.d [[R4:\$w[0-9]+]], [[R2]], [[R1]]
+ %4 = select <2 x i1> %3, <2 x double> %1, <2 x double> zeroinitializer
+ ; CHECK-DAG: bsel.v [[R4]], [[R1]], [[R3:\$w[0-9]+]]
+ store <2 x double> %4, <2 x double>* %d
+ ; CHECK-DAG: st.d [[R4]], 0($4)
+
+ ret void
+ ; CHECK: .size bseli_v2f64
+}
+
+define void @max_v4f32(<4 x float>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+ ; CHECK: max_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x float>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = tail call <4 x float> @llvm.mips.fmax.w(<4 x float> %1, <4 x float> %2)
+ ; CHECK-DAG: fmax.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x float> %3, <4 x float>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size max_v4f32
+}
+
+define void @max_v2f64(<2 x double>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+ ; CHECK: max_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x double>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = tail call <2 x double> @llvm.mips.fmax.d(<2 x double> %1, <2 x double> %2)
+ ; CHECK-DAG: fmax.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x double> %3, <2 x double>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size max_v2f64
+}
+
+define void @min_v4f32(<4 x float>* %c, <4 x float>* %a, <4 x float>* %b) nounwind {
+ ; CHECK: min_v4f32:
+
+ %1 = load <4 x float>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x float>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = tail call <4 x float> @llvm.mips.fmin.w(<4 x float> %1, <4 x float> %2)
+ ; CHECK-DAG: fmin.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x float> %3, <4 x float>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size min_v4f32
+}
+
+define void @min_v2f64(<2 x double>* %c, <2 x double>* %a, <2 x double>* %b) nounwind {
+ ; CHECK: min_v2f64:
+
+ %1 = load <2 x double>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x double>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = tail call <2 x double> @llvm.mips.fmin.d(<2 x double> %1, <2 x double> %2)
+ ; CHECK-DAG: fmin.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x double> %3, <2 x double>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size min_v2f64
+}
diff --git a/test/CodeGen/Mips/msa/elm_copy.ll b/test/CodeGen/Mips/msa/elm_copy.ll
new file mode 100644
index 0000000..ed3e52c
--- /dev/null
+++ b/test/CodeGen/Mips/msa/elm_copy.ll
@@ -0,0 +1,162 @@
+; Test the MSA intrinsics that are encoded with the ELM instruction format and
+; are element extraction operations.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_copy_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_copy_s_b_RES = global i32 0, align 16
+
+define void @llvm_mips_copy_s_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_copy_s_b_ARG1
+ %1 = tail call i32 @llvm.mips.copy.s.b(<16 x i8> %0, i32 1)
+ store i32 %1, i32* @llvm_mips_copy_s_b_RES
+ ret void
+}
+
+declare i32 @llvm.mips.copy.s.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_copy_s_b_test:
+; CHECK: ld.b
+; CHECK: copy_s.b
+; CHECK: sw
+; CHECK: .size llvm_mips_copy_s_b_test
+;
+@llvm_mips_copy_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_copy_s_h_RES = global i32 0, align 16
+
+define void @llvm_mips_copy_s_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_copy_s_h_ARG1
+ %1 = tail call i32 @llvm.mips.copy.s.h(<8 x i16> %0, i32 1)
+ store i32 %1, i32* @llvm_mips_copy_s_h_RES
+ ret void
+}
+
+declare i32 @llvm.mips.copy.s.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_copy_s_h_test:
+; CHECK: ld.h
+; CHECK: copy_s.h
+; CHECK: sw
+; CHECK: .size llvm_mips_copy_s_h_test
+;
+@llvm_mips_copy_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_copy_s_w_RES = global i32 0, align 16
+
+define void @llvm_mips_copy_s_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_copy_s_w_ARG1
+ %1 = tail call i32 @llvm.mips.copy.s.w(<4 x i32> %0, i32 1)
+ store i32 %1, i32* @llvm_mips_copy_s_w_RES
+ ret void
+}
+
+declare i32 @llvm.mips.copy.s.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_copy_s_w_test:
+; CHECK: ld.w
+; CHECK: copy_s.w
+; CHECK: sw
+; CHECK: .size llvm_mips_copy_s_w_test
+;
+@llvm_mips_copy_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_copy_s_d_RES = global i64 0, align 16
+
+define void @llvm_mips_copy_s_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_copy_s_d_ARG1
+ %1 = tail call i64 @llvm.mips.copy.s.d(<2 x i64> %0, i32 1)
+ store i64 %1, i64* @llvm_mips_copy_s_d_RES
+ ret void
+}
+
+declare i64 @llvm.mips.copy.s.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_copy_s_d_test:
+; CHECK: ld.w
+; CHECK: copy_s.w
+; CHECK: copy_s.w
+; CHECK: sw
+; CHECK: sw
+; CHECK: .size llvm_mips_copy_s_d_test
+;
+@llvm_mips_copy_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_copy_u_b_RES = global i32 0, align 16
+
+define void @llvm_mips_copy_u_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_copy_u_b_ARG1
+ %1 = tail call i32 @llvm.mips.copy.u.b(<16 x i8> %0, i32 1)
+ store i32 %1, i32* @llvm_mips_copy_u_b_RES
+ ret void
+}
+
+declare i32 @llvm.mips.copy.u.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_copy_u_b_test:
+; CHECK: ld.b
+; CHECK: copy_u.b
+; CHECK: sw
+; CHECK: .size llvm_mips_copy_u_b_test
+;
+@llvm_mips_copy_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_copy_u_h_RES = global i32 0, align 16
+
+define void @llvm_mips_copy_u_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_copy_u_h_ARG1
+ %1 = tail call i32 @llvm.mips.copy.u.h(<8 x i16> %0, i32 1)
+ store i32 %1, i32* @llvm_mips_copy_u_h_RES
+ ret void
+}
+
+declare i32 @llvm.mips.copy.u.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_copy_u_h_test:
+; CHECK: ld.h
+; CHECK: copy_u.h
+; CHECK: sw
+; CHECK: .size llvm_mips_copy_u_h_test
+;
+@llvm_mips_copy_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_copy_u_w_RES = global i32 0, align 16
+
+define void @llvm_mips_copy_u_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_copy_u_w_ARG1
+ %1 = tail call i32 @llvm.mips.copy.u.w(<4 x i32> %0, i32 1)
+ store i32 %1, i32* @llvm_mips_copy_u_w_RES
+ ret void
+}
+
+declare i32 @llvm.mips.copy.u.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_copy_u_w_test:
+; CHECK: ld.w
+; CHECK: copy_u.w
+; CHECK: sw
+; CHECK: .size llvm_mips_copy_u_w_test
+;
+@llvm_mips_copy_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_copy_u_d_RES = global i64 0, align 16
+
+define void @llvm_mips_copy_u_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_copy_u_d_ARG1
+ %1 = tail call i64 @llvm.mips.copy.u.d(<2 x i64> %0, i32 1)
+ store i64 %1, i64* @llvm_mips_copy_u_d_RES
+ ret void
+}
+
+declare i64 @llvm.mips.copy.u.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_copy_u_d_test:
+; CHECK: ld.w
+; CHECK: copy_s.w
+; CHECK: copy_s.w
+; CHECK: sw
+; CHECK: sw
+; CHECK: .size llvm_mips_copy_u_d_test
+;
diff --git a/test/CodeGen/Mips/msa/elm_cxcmsa.ll b/test/CodeGen/Mips/msa/elm_cxcmsa.ll
new file mode 100644
index 0000000..8d6b0ee
--- /dev/null
+++ b/test/CodeGen/Mips/msa/elm_cxcmsa.ll
@@ -0,0 +1,168 @@
+; Test the MSA ctcmsa and cfcmsa intrinsics (which are encoded with the ELM
+; instruction format).
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+define i32 @msa_ir_cfcmsa_test() nounwind {
+entry:
+ %0 = tail call i32 @llvm.mips.cfcmsa(i32 0)
+ ret i32 %0
+}
+
+; CHECK: msa_ir_cfcmsa_test:
+; CHECK: cfcmsa $[[R1:[0-9]+]], $0
+; CHECK: .size msa_ir_cfcmsa_test
+;
+define i32 @msa_csr_cfcmsa_test() nounwind {
+entry:
+ %0 = tail call i32 @llvm.mips.cfcmsa(i32 1)
+ ret i32 %0
+}
+
+; CHECK: msa_csr_cfcmsa_test:
+; CHECK: cfcmsa $[[R1:[0-9]+]], $1
+; CHECK: .size msa_csr_cfcmsa_test
+;
+define i32 @msa_access_cfcmsa_test() nounwind {
+entry:
+ %0 = tail call i32 @llvm.mips.cfcmsa(i32 2)
+ ret i32 %0
+}
+
+; CHECK: msa_access_cfcmsa_test:
+; CHECK: cfcmsa $[[R1:[0-9]+]], $2
+; CHECK: .size msa_access_cfcmsa_test
+;
+define i32 @msa_save_cfcmsa_test() nounwind {
+entry:
+ %0 = tail call i32 @llvm.mips.cfcmsa(i32 3)
+ ret i32 %0
+}
+
+; CHECK: msa_save_cfcmsa_test:
+; CHECK: cfcmsa $[[R1:[0-9]+]], $3
+; CHECK: .size msa_save_cfcmsa_test
+;
+define i32 @msa_modify_cfcmsa_test() nounwind {
+entry:
+ %0 = tail call i32 @llvm.mips.cfcmsa(i32 4)
+ ret i32 %0
+}
+
+; CHECK: msa_modify_cfcmsa_test:
+; CHECK: cfcmsa $[[R1:[0-9]+]], $4
+; CHECK: .size msa_modify_cfcmsa_test
+;
+define i32 @msa_request_cfcmsa_test() nounwind {
+entry:
+ %0 = tail call i32 @llvm.mips.cfcmsa(i32 5)
+ ret i32 %0
+}
+
+; CHECK: msa_request_cfcmsa_test:
+; CHECK: cfcmsa $[[R1:[0-9]+]], $5
+; CHECK: .size msa_request_cfcmsa_test
+;
+define i32 @msa_map_cfcmsa_test() nounwind {
+entry:
+ %0 = tail call i32 @llvm.mips.cfcmsa(i32 6)
+ ret i32 %0
+}
+
+; CHECK: msa_map_cfcmsa_test:
+; CHECK: cfcmsa $[[R1:[0-9]+]], $6
+; CHECK: .size msa_map_cfcmsa_test
+;
+define i32 @msa_unmap_cfcmsa_test() nounwind {
+entry:
+ %0 = tail call i32 @llvm.mips.cfcmsa(i32 7)
+ ret i32 %0
+}
+
+; CHECK: msa_unmap_cfcmsa_test:
+; CHECK: cfcmsa $[[R1:[0-9]+]], $7
+; CHECK: .size msa_unmap_cfcmsa_test
+;
+define void @msa_ir_ctcmsa_test() nounwind {
+entry:
+ tail call void @llvm.mips.ctcmsa(i32 0, i32 1)
+ ret void
+}
+
+; CHECK: msa_ir_ctcmsa_test:
+; CHECK: ctcmsa $0
+; CHECK: .size msa_ir_ctcmsa_test
+;
+define void @msa_csr_ctcmsa_test() nounwind {
+entry:
+ tail call void @llvm.mips.ctcmsa(i32 1, i32 1)
+ ret void
+}
+
+; CHECK: msa_csr_ctcmsa_test:
+; CHECK: ctcmsa $1
+; CHECK: .size msa_csr_ctcmsa_test
+;
+define void @msa_access_ctcmsa_test() nounwind {
+entry:
+ tail call void @llvm.mips.ctcmsa(i32 2, i32 1)
+ ret void
+}
+
+; CHECK: msa_access_ctcmsa_test:
+; CHECK: ctcmsa $2
+; CHECK: .size msa_access_ctcmsa_test
+;
+define void @msa_save_ctcmsa_test() nounwind {
+entry:
+ tail call void @llvm.mips.ctcmsa(i32 3, i32 1)
+ ret void
+}
+
+; CHECK: msa_save_ctcmsa_test:
+; CHECK: ctcmsa $3
+; CHECK: .size msa_save_ctcmsa_test
+;
+define void @msa_modify_ctcmsa_test() nounwind {
+entry:
+ tail call void @llvm.mips.ctcmsa(i32 4, i32 1)
+ ret void
+}
+
+; CHECK: msa_modify_ctcmsa_test:
+; CHECK: ctcmsa $4
+; CHECK: .size msa_modify_ctcmsa_test
+;
+define void @msa_request_ctcmsa_test() nounwind {
+entry:
+ tail call void @llvm.mips.ctcmsa(i32 5, i32 1)
+ ret void
+}
+
+; CHECK: msa_request_ctcmsa_test:
+; CHECK: ctcmsa $5
+; CHECK: .size msa_request_ctcmsa_test
+;
+define void @msa_map_ctcmsa_test() nounwind {
+entry:
+ tail call void @llvm.mips.ctcmsa(i32 6, i32 1)
+ ret void
+}
+
+; CHECK: msa_map_ctcmsa_test:
+; CHECK: ctcmsa $6
+; CHECK: .size msa_map_ctcmsa_test
+;
+define void @msa_unmap_ctcmsa_test() nounwind {
+entry:
+ tail call void @llvm.mips.ctcmsa(i32 7, i32 1)
+ ret void
+}
+
+; CHECK: msa_unmap_ctcmsa_test:
+; CHECK: ctcmsa $7
+; CHECK: .size msa_unmap_ctcmsa_test
+;
+declare i32 @llvm.mips.cfcmsa(i32) nounwind
+declare void @llvm.mips.ctcmsa(i32, i32) nounwind
diff --git a/test/CodeGen/Mips/msa/elm_insv.ll b/test/CodeGen/Mips/msa/elm_insv.ll
new file mode 100644
index 0000000..fa7ceaf
--- /dev/null
+++ b/test/CodeGen/Mips/msa/elm_insv.ll
@@ -0,0 +1,192 @@
+; Test the MSA element insertion intrinsics that are encoded with the ELM
+; instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_insert_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_insert_b_ARG3 = global i32 27, align 16
+@llvm_mips_insert_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_insert_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_insert_b_ARG1
+ %1 = load i32* @llvm_mips_insert_b_ARG3
+ %2 = tail call <16 x i8> @llvm.mips.insert.b(<16 x i8> %0, i32 1, i32 %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_insert_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.insert.b(<16 x i8>, i32, i32) nounwind
+
+; CHECK: llvm_mips_insert_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], 0(
+; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0(
+; CHECK-DAG: insert.b [[R2]][1], [[R1]]
+; CHECK-DAG: st.b [[R2]], 0(
+; CHECK: .size llvm_mips_insert_b_test
+;
+@llvm_mips_insert_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_insert_h_ARG3 = global i32 27, align 16
+@llvm_mips_insert_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_insert_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_insert_h_ARG1
+ %1 = load i32* @llvm_mips_insert_h_ARG3
+ %2 = tail call <8 x i16> @llvm.mips.insert.h(<8 x i16> %0, i32 1, i32 %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_insert_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.insert.h(<8 x i16>, i32, i32) nounwind
+
+; CHECK: llvm_mips_insert_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], 0(
+; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0(
+; CHECK-DAG: insert.h [[R2]][1], [[R1]]
+; CHECK-DAG: st.h [[R2]], 0(
+; CHECK: .size llvm_mips_insert_h_test
+;
+@llvm_mips_insert_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_insert_w_ARG3 = global i32 27, align 16
+@llvm_mips_insert_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_insert_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_insert_w_ARG1
+ %1 = load i32* @llvm_mips_insert_w_ARG3
+ %2 = tail call <4 x i32> @llvm.mips.insert.w(<4 x i32> %0, i32 1, i32 %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_insert_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.insert.w(<4 x i32>, i32, i32) nounwind
+
+; CHECK: llvm_mips_insert_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], 0(
+; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0(
+; CHECK-DAG: insert.w [[R2]][1], [[R1]]
+; CHECK-DAG: st.w [[R2]], 0(
+; CHECK: .size llvm_mips_insert_w_test
+;
+@llvm_mips_insert_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_insert_d_ARG3 = global i64 27, align 16
+@llvm_mips_insert_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_insert_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_insert_d_ARG1
+ %1 = load i64* @llvm_mips_insert_d_ARG3
+ %2 = tail call <2 x i64> @llvm.mips.insert.d(<2 x i64> %0, i32 1, i64 %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_insert_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.insert.d(<2 x i64>, i32, i64) nounwind
+
+; CHECK: llvm_mips_insert_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], 0(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], 4(
+; CHECK-DAG: ld.w [[R3:\$w[0-9]+]],
+; CHECK-DAG: insert.w [[R3]][2], [[R1]]
+; CHECK-DAG: insert.w [[R3]][3], [[R2]]
+; CHECK-DAG: st.w [[R3]],
+; CHECK: .size llvm_mips_insert_d_test
+;
+@llvm_mips_insve_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_insve_b_ARG3 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_insve_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_insve_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_insve_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_insve_b_ARG3
+ %2 = tail call <16 x i8> @llvm.mips.insve.b(<16 x i8> %0, i32 1, <16 x i8> %1)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_insve_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.insve.b(<16 x i8>, i32, <16 x i8>) nounwind
+
+; CHECK: llvm_mips_insve_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_insve_b_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_insve_b_ARG3)(
+; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: insve.b [[R3]][1], [[R4]][0]
+; CHECK-DAG: st.b [[R3]],
+; CHECK: .size llvm_mips_insve_b_test
+;
+@llvm_mips_insve_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_insve_h_ARG3 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_insve_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_insve_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_insve_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_insve_h_ARG3
+ %2 = tail call <8 x i16> @llvm.mips.insve.h(<8 x i16> %0, i32 1, <8 x i16> %1)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_insve_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.insve.h(<8 x i16>, i32, <8 x i16>) nounwind
+
+; CHECK: llvm_mips_insve_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_insve_h_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_insve_h_ARG3)(
+; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: insve.h [[R3]][1], [[R4]][0]
+; CHECK-DAG: st.h [[R3]],
+; CHECK: .size llvm_mips_insve_h_test
+;
+@llvm_mips_insve_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_insve_w_ARG3 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_insve_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_insve_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_insve_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_insve_w_ARG3
+ %2 = tail call <4 x i32> @llvm.mips.insve.w(<4 x i32> %0, i32 1, <4 x i32> %1)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_insve_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.insve.w(<4 x i32>, i32, <4 x i32>) nounwind
+
+; CHECK: llvm_mips_insve_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_insve_w_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_insve_w_ARG3)(
+; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: insve.w [[R3]][1], [[R4]][0]
+; CHECK-DAG: st.w [[R3]],
+; CHECK: .size llvm_mips_insve_w_test
+;
+@llvm_mips_insve_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_insve_d_ARG3 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_insve_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_insve_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_insve_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_insve_d_ARG3
+ %2 = tail call <2 x i64> @llvm.mips.insve.d(<2 x i64> %0, i32 1, <2 x i64> %1)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_insve_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.insve.d(<2 x i64>, i32, <2 x i64>) nounwind
+
+; CHECK: llvm_mips_insve_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_insve_d_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_insve_d_ARG3)(
+; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: insve.d [[R3]][1], [[R4]][0]
+; CHECK-DAG: st.d [[R3]],
+; CHECK: .size llvm_mips_insve_d_test
+;
diff --git a/test/CodeGen/Mips/msa/elm_move.ll b/test/CodeGen/Mips/msa/elm_move.ll
new file mode 100644
index 0000000..98c06c7
--- /dev/null
+++ b/test/CodeGen/Mips/msa/elm_move.ll
@@ -0,0 +1,25 @@
+; Test the MSA move intrinsics (which are encoded with the ELM instruction
+; format).
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_move_vb_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_move_vb_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_move_vb_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_move_vb_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.move.v(<16 x i8> %0)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_move_vb_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.move.v(<16 x i8>) nounwind
+
+; CHECK: llvm_mips_move_vb_test:
+; CHECK: ld.b
+; CHECK: move.v
+; CHECK: st.b
+; CHECK: .size llvm_mips_move_vb_test
+;
diff --git a/test/CodeGen/Mips/msa/elm_shift_slide.ll b/test/CodeGen/Mips/msa/elm_shift_slide.ll
new file mode 100644
index 0000000..39d670d
--- /dev/null
+++ b/test/CodeGen/Mips/msa/elm_shift_slide.ll
@@ -0,0 +1,158 @@
+; Test the MSA intrinsics that are encoded with the ELM instruction format and
+; are either shifts or slides.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_sldi_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_sldi_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_sldi_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_sldi_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.sldi.b(<16 x i8> %0, i32 1)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_sldi_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.sldi.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_sldi_b_test:
+; CHECK: ld.b
+; CHECK: sldi.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_sldi_b_test
+;
+@llvm_mips_sldi_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_sldi_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_sldi_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_sldi_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.sldi.h(<8 x i16> %0, i32 1)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_sldi_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.sldi.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_sldi_h_test:
+; CHECK: ld.h
+; CHECK: sldi.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_sldi_h_test
+;
+@llvm_mips_sldi_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_sldi_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_sldi_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_sldi_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.sldi.w(<4 x i32> %0, i32 1)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_sldi_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.sldi.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_sldi_w_test:
+; CHECK: ld.w
+; CHECK: sldi.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_sldi_w_test
+;
+@llvm_mips_sldi_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_sldi_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_sldi_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_sldi_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.sldi.d(<2 x i64> %0, i32 1)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_sldi_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.sldi.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_sldi_d_test:
+; CHECK: ld.d
+; CHECK: sldi.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_sldi_d_test
+;
+@llvm_mips_splati_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_splati_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_splati_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_splati_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.splati.b(<16 x i8> %0, i32 1)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_splati_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.splati.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_splati_b_test:
+; CHECK: ld.b
+; CHECK: splati.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_splati_b_test
+;
+@llvm_mips_splati_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_splati_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_splati_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_splati_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.splati.h(<8 x i16> %0, i32 1)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_splati_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.splati.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_splati_h_test:
+; CHECK: ld.h
+; CHECK: splati.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_splati_h_test
+;
+@llvm_mips_splati_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_splati_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_splati_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_splati_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.splati.w(<4 x i32> %0, i32 1)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_splati_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.splati.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_splati_w_test:
+; CHECK: ld.w
+; CHECK: splati.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_splati_w_test
+;
+@llvm_mips_splati_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_splati_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_splati_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_splati_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.splati.d(<2 x i64> %0, i32 1)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_splati_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.splati.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_splati_d_test:
+; CHECK: ld.d
+; CHECK: splati.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_splati_d_test
+;
diff --git a/test/CodeGen/Mips/msa/endian.ll b/test/CodeGen/Mips/msa/endian.ll
new file mode 100644
index 0000000..44d1925
--- /dev/null
+++ b/test/CodeGen/Mips/msa/endian.ll
@@ -0,0 +1,107 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=BIGENDIAN %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=LITENDIAN %s
+
+@v16i8 = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+@v8i16 = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+@v4i32 = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+@v2i64 = global <2 x i64> <i64 0, i64 0>
+
+define void @const_v16i8() nounwind {
+ ; LITENDIAN: .byte 0
+ ; LITENDIAN: .byte 1
+ ; LITENDIAN: .byte 2
+ ; LITENDIAN: .byte 3
+ ; LITENDIAN: .byte 4
+ ; LITENDIAN: .byte 5
+ ; LITENDIAN: .byte 6
+ ; LITENDIAN: .byte 7
+ ; LITENDIAN: .byte 8
+ ; LITENDIAN: .byte 9
+ ; LITENDIAN: .byte 10
+ ; LITENDIAN: .byte 11
+ ; LITENDIAN: .byte 12
+ ; LITENDIAN: .byte 13
+ ; LITENDIAN: .byte 14
+ ; LITENDIAN: .byte 15
+ ; LITENDIAN: const_v16i8:
+ ; BIGENDIAN: .byte 0
+ ; BIGENDIAN: .byte 1
+ ; BIGENDIAN: .byte 2
+ ; BIGENDIAN: .byte 3
+ ; BIGENDIAN: .byte 4
+ ; BIGENDIAN: .byte 5
+ ; BIGENDIAN: .byte 6
+ ; BIGENDIAN: .byte 7
+ ; BIGENDIAN: .byte 8
+ ; BIGENDIAN: .byte 9
+ ; BIGENDIAN: .byte 10
+ ; BIGENDIAN: .byte 11
+ ; BIGENDIAN: .byte 12
+ ; BIGENDIAN: .byte 13
+ ; BIGENDIAN: .byte 14
+ ; BIGENDIAN: .byte 15
+ ; BIGENDIAN: const_v16i8:
+
+ store volatile <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, <16 x i8>*@v16i8
+
+ ret void
+}
+
+define void @const_v8i16() nounwind {
+ ; LITENDIAN: .2byte 0
+ ; LITENDIAN: .2byte 1
+ ; LITENDIAN: .2byte 2
+ ; LITENDIAN: .2byte 3
+ ; LITENDIAN: .2byte 4
+ ; LITENDIAN: .2byte 5
+ ; LITENDIAN: .2byte 6
+ ; LITENDIAN: .2byte 7
+ ; LITENDIAN: const_v8i16:
+ ; BIGENDIAN: .2byte 0
+ ; BIGENDIAN: .2byte 1
+ ; BIGENDIAN: .2byte 2
+ ; BIGENDIAN: .2byte 3
+ ; BIGENDIAN: .2byte 4
+ ; BIGENDIAN: .2byte 5
+ ; BIGENDIAN: .2byte 6
+ ; BIGENDIAN: .2byte 7
+ ; BIGENDIAN: const_v8i16:
+
+ store volatile <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, <8 x i16>*@v8i16
+
+ ret void
+}
+
+define void @const_v4i32() nounwind {
+ ; LITENDIAN: .4byte 0
+ ; LITENDIAN: .4byte 1
+ ; LITENDIAN: .4byte 2
+ ; LITENDIAN: .4byte 3
+ ; LITENDIAN: const_v4i32:
+ ; BIGENDIAN: .4byte 0
+ ; BIGENDIAN: .4byte 1
+ ; BIGENDIAN: .4byte 2
+ ; BIGENDIAN: .4byte 3
+ ; BIGENDIAN: const_v4i32:
+
+ store volatile <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>*@v4i32
+
+ ret void
+}
+
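+; i64 elements are emitted as pairs of .4byte words, and the word order depends
+; on the target endianness: the little-endian target prints the low word of each
+; element first (1, 0, 2, 0) while the big-endian target prints the high word
+; first (0, 1, 0, 2), which is what the checks below expect.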
+define void @const_v2i64() nounwind {
+ ; LITENDIAN: .4byte 1
+ ; LITENDIAN: .4byte 0
+ ; LITENDIAN: .4byte 2
+ ; LITENDIAN: .4byte 0
+ ; LITENDIAN: const_v2i64:
+ ; BIGENDIAN: .4byte 0
+ ; BIGENDIAN: .4byte 1
+ ; BIGENDIAN: .4byte 0
+ ; BIGENDIAN: .4byte 2
+ ; BIGENDIAN: const_v2i64:
+
+ store volatile <2 x i64> <i64 1, i64 2>, <2 x i64>*@v2i64
+
+ ret void
+}
diff --git a/test/CodeGen/Mips/msa/frameindex.ll b/test/CodeGen/Mips/msa/frameindex.ll
new file mode 100644
index 0000000..3088e1b
--- /dev/null
+++ b/test/CodeGen/Mips/msa/frameindex.ll
@@ -0,0 +1,85 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=MIPS32-AE -check-prefix=MIPS32-BE %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=MIPS32-AE -check-prefix=MIPS32-LE %s
+
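+; A rough summary of what these cases exercise: ld.b/st.b encode a signed
+; 10-bit offset (in bytes for the .b form), so frame objects within 512 bytes
+; of $sp are expected to fold into the instruction itself, while larger offsets
+; first materialise a base register (addiu, or ori/addu for even larger
+; frames), as the checks below reflect.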
+define void @loadstore_v16i8_near() nounwind {
+ ; MIPS32-AE: loadstore_v16i8_near:
+
+ %1 = alloca <16 x i8>
+ %2 = load volatile <16 x i8>* %1
+ ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 0($sp)
+ store volatile <16 x i8> %2, <16 x i8>* %1
+ ; MIPS32-AE: st.b [[R1]], 0($sp)
+
+ ret void
+ ; MIPS32-AE: .size loadstore_v16i8_near
+}
+
+define void @loadstore_v16i8_just_under_simm10() nounwind {
+ ; MIPS32-AE: loadstore_v16i8_just_under_simm10:
+
+ %1 = alloca <16 x i8>
+ %2 = alloca [496 x i8] ; Push the frame right up to 512 bytes
+
+ %3 = load volatile <16 x i8>* %1
+ ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 496($sp)
+ store volatile <16 x i8> %3, <16 x i8>* %1
+ ; MIPS32-AE: st.b [[R1]], 496($sp)
+
+ ret void
+ ; MIPS32-AE: .size loadstore_v16i8_just_under_simm10
+}
+
+define void @loadstore_v16i8_just_over_simm10() nounwind {
+ ; MIPS32-AE: loadstore_v16i8_just_over_simm10:
+
+ %1 = alloca <16 x i8>
+ %2 = alloca [497 x i8] ; Push the frame just over 512 bytes
+
+ %3 = load volatile <16 x i8>* %1
+ ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 512
+ ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 0([[BASE]])
+ store volatile <16 x i8> %3, <16 x i8>* %1
+ ; MIPS32-AE: addiu [[BASE:\$[0-9]+]], $sp, 512
+ ; MIPS32-AE: st.b [[R1]], 0([[BASE]])
+
+ ret void
+ ; MIPS32-AE: .size loadstore_v16i8_just_over_simm10
+}
+
+define void @loadstore_v16i8_just_under_simm16() nounwind {
+ ; MIPS32-AE: loadstore_v16i8_just_under_simm16:
+
+ %1 = alloca <16 x i8>
+ %2 = alloca [32752 x i8] ; Push the frame right up to 32768 bytes
+
+ %3 = load volatile <16 x i8>* %1
+ ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
+ ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+ ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 0([[BASE]])
+ store volatile <16 x i8> %3, <16 x i8>* %1
+ ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
+ ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+ ; MIPS32-AE: st.b [[R1]], 0([[BASE]])
+
+ ret void
+ ; MIPS32-AE: .size loadstore_v16i8_just_under_simm16
+}
+
+define void @loadstore_v16i8_just_over_simm16() nounwind {
+ ; MIPS32-AE: loadstore_v16i8_just_over_simm16:
+
+ %1 = alloca <16 x i8>
+ %2 = alloca [32753 x i8] ; Push the frame just over 32768 bytes
+
+ %3 = load volatile <16 x i8>* %1
+ ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
+ ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+ ; MIPS32-AE: ld.b [[R1:\$w[0-9]+]], 0([[BASE]])
+ store volatile <16 x i8> %3, <16 x i8>* %1
+ ; MIPS32-AE: ori [[R2:\$[0-9]+]], $zero, 32768
+ ; MIPS32-AE: addu [[BASE:\$[0-9]+]], $sp, [[R2]]
+ ; MIPS32-AE: st.b [[R1]], 0([[BASE]])
+
+ ret void
+ ; MIPS32-AE: .size loadstore_v16i8_just_over_simm16
+}
diff --git a/test/CodeGen/Mips/msa/i10.ll b/test/CodeGen/Mips/msa/i10.ll
new file mode 100644
index 0000000..c5a9617
--- /dev/null
+++ b/test/CodeGen/Mips/msa/i10.ll
@@ -0,0 +1,89 @@
+; Test the MSA intrinsics that are encoded with the I10 instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
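+; Each test below loads a vector, branches on the intrinsic's i32 result and
+; returns a different constant on each path; presumably this is so the branch
+; form (bnz.df) is selected rather than a materialised register value.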
+@llvm_mips_bnz_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+
+define i32 @llvm_mips_bnz_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_bnz_b_ARG1
+ %1 = tail call i32 @llvm.mips.bnz.b(<16 x i8> %0)
+ %2 = icmp eq i32 %1, 0
+ br i1 %2, label %true, label %false
+true:
+ ret i32 2
+false:
+ ret i32 3
+}
+
+declare i32 @llvm.mips.bnz.b(<16 x i8>) nounwind
+
+; CHECK: llvm_mips_bnz_b_test:
+; CHECK-DAG: ld.b [[R0:\$w[0-9]+]]
+; CHECK-DAG: bnz.b [[R0]]
+; CHECK: .size llvm_mips_bnz_b_test
+
+@llvm_mips_bnz_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+
+define i32 @llvm_mips_bnz_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_bnz_h_ARG1
+ %1 = tail call i32 @llvm.mips.bnz.h(<8 x i16> %0)
+ %2 = icmp eq i32 %1, 0
+ br i1 %2, label %true, label %false
+true:
+ ret i32 2
+false:
+ ret i32 3
+}
+
+declare i32 @llvm.mips.bnz.h(<8 x i16>) nounwind
+
+; CHECK: llvm_mips_bnz_h_test:
+; CHECK-DAG: ld.h [[R0:\$w[0-9]+]]
+; CHECK-DAG: bnz.h [[R0]]
+; CHECK: .size llvm_mips_bnz_h_test
+
+@llvm_mips_bnz_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+
+define i32 @llvm_mips_bnz_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_bnz_w_ARG1
+ %1 = tail call i32 @llvm.mips.bnz.w(<4 x i32> %0)
+ %2 = icmp eq i32 %1, 0
+ br i1 %2, label %true, label %false
+true:
+ ret i32 2
+false:
+ ret i32 3
+}
+
+declare i32 @llvm.mips.bnz.w(<4 x i32>) nounwind
+
+; CHECK: llvm_mips_bnz_w_test:
+; CHECK-DAG: ld.w [[R0:\$w[0-9]+]]
+; CHECK-DAG: bnz.w [[R0]]
+; CHECK: .size llvm_mips_bnz_w_test
+
+@llvm_mips_bnz_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+
+define i32 @llvm_mips_bnz_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_bnz_d_ARG1
+ %1 = tail call i32 @llvm.mips.bnz.d(<2 x i64> %0)
+ %2 = icmp eq i32 %1, 0
+ br i1 %2, label %true, label %false
+true:
+ ret i32 2
+false:
+ ret i32 3
+}
+
+declare i32 @llvm.mips.bnz.d(<2 x i64>) nounwind
+
+; CHECK: llvm_mips_bnz_d_test:
+; CHECK-DAG: ld.d [[R0:\$w[0-9]+]]
+; CHECK-DAG: bnz.d [[R0]]
+; CHECK: .size llvm_mips_bnz_d_test
+
diff --git a/test/CodeGen/Mips/msa/i5-a.ll b/test/CodeGen/Mips/msa/i5-a.ll
new file mode 100644
index 0000000..0b50720
--- /dev/null
+++ b/test/CodeGen/Mips/msa/i5-a.ll
@@ -0,0 +1,82 @@
+; Test the MSA intrinsics that are encoded with the I5 instruction format.
+; There are lots of these, so this file covers those beginning with 'a'.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
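+; addvi.df adds an unsigned 5-bit immediate (0-31) to each element, so the
+; value 14 used below is simply a representative in-range operand.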
+@llvm_mips_addvi_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_addvi_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_addvi_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_addvi_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.addvi.b(<16 x i8> %0, i32 14)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_addvi_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.addvi.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_addvi_b_test:
+; CHECK: ld.b
+; CHECK: addvi.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_addvi_b_test
+;
+@llvm_mips_addvi_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_addvi_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_addvi_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_addvi_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.addvi.h(<8 x i16> %0, i32 14)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_addvi_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.addvi.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_addvi_h_test:
+; CHECK: ld.h
+; CHECK: addvi.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_addvi_h_test
+;
+@llvm_mips_addvi_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_addvi_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_addvi_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_addvi_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.addvi.w(<4 x i32> %0, i32 14)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_addvi_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.addvi.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_addvi_w_test:
+; CHECK: ld.w
+; CHECK: addvi.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_addvi_w_test
+;
+@llvm_mips_addvi_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_addvi_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_addvi_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_addvi_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.addvi.d(<2 x i64> %0, i32 14)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_addvi_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.addvi.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_addvi_d_test:
+; CHECK: ld.d
+; CHECK: addvi.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_addvi_d_test
+;
diff --git a/test/CodeGen/Mips/msa/i5-b.ll b/test/CodeGen/Mips/msa/i5-b.ll
new file mode 100644
index 0000000..da6be66
--- /dev/null
+++ b/test/CodeGen/Mips/msa/i5-b.ll
@@ -0,0 +1,439 @@
+; Test the MSA intrinsics that are encoded with the I5 instruction format.
+; There are lots of these, so this file covers those beginning with 'b'.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_bclri_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bclri_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bclri_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_bclri_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.bclri.b(<16 x i8> %0, i32 7)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_bclri_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.bclri.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_bclri_b_test:
+; CHECK: ld.b
+; bclri.b with an immediate of 7 clears bit 7 of each byte, which is the same
+; as andi.b with 127, so andi.b is an acceptable (and expected) selection here.
+; CHECK: andi.b {{\$w[0-9]}}, {{\$w[0-9]}}, 127
+; CHECK: st.b
+; CHECK: .size llvm_mips_bclri_b_test
+;
+@llvm_mips_bclri_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bclri_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_bclri_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_bclri_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.bclri.h(<8 x i16> %0, i32 7)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_bclri_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.bclri.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_bclri_h_test:
+; CHECK: ld.h
+; CHECK: bclri.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_bclri_h_test
+;
+@llvm_mips_bclri_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bclri_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_bclri_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_bclri_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.bclri.w(<4 x i32> %0, i32 7)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_bclri_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.bclri.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_bclri_w_test:
+; CHECK: ld.w
+; CHECK: bclri.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_bclri_w_test
+;
+@llvm_mips_bclri_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bclri_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_bclri_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_bclri_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.bclri.d(<2 x i64> %0, i32 7)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_bclri_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.bclri.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_bclri_d_test:
+; CHECK: ld.d
+; CHECK: bclri.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_bclri_d_test
+;
+@llvm_mips_binsli_b_ARG1 = global <16 x i8> zeroinitializer, align 16
+@llvm_mips_binsli_b_ARG2 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_binsli_b_RES = global <16 x i8> zeroinitializer, align 16
+
+define void @llvm_mips_binsli_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_binsli_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_binsli_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.binsli.b(<16 x i8> %0, <16 x i8> %1, i32 7)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_binsli_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.binsli.b(<16 x i8>, <16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_binsli_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsli_b_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsli_b_ARG2)(
+; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: binsli.b [[R3]], [[R4]], 7
+; CHECK-DAG: lw [[R5:\$[0-9]+]], %got(llvm_mips_binsli_b_RES)(
+; CHECK-DAG: st.b [[R3]], 0([[R5]])
+; CHECK: .size llvm_mips_binsli_b_test
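+; Note that the checks above reuse [[R3]] both as the destination of binsli.b
+; and as the value stored afterwards: binsli writes into its first operand
+; (wd is a source and the destination), so no extra copy is expected.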
+
+@llvm_mips_binsli_h_ARG1 = global <8 x i16> zeroinitializer, align 16
+@llvm_mips_binsli_h_ARG2 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_binsli_h_RES = global <8 x i16> zeroinitializer, align 16
+
+define void @llvm_mips_binsli_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_binsli_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_binsli_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.binsli.h(<8 x i16> %0, <8 x i16> %1, i32 7)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_binsli_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.binsli.h(<8 x i16>, <8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_binsli_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsli_h_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsli_h_ARG2)(
+; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: binsli.h [[R3]], [[R4]], 7
+; CHECK-DAG: lw [[R5:\$[0-9]+]], %got(llvm_mips_binsli_h_RES)(
+; CHECK-DAG: st.h [[R3]], 0([[R5]])
+; CHECK: .size llvm_mips_binsli_h_test
+
+@llvm_mips_binsli_w_ARG1 = global <4 x i32> zeroinitializer, align 16
+@llvm_mips_binsli_w_ARG2 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_binsli_w_RES = global <4 x i32> zeroinitializer, align 16
+
+define void @llvm_mips_binsli_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_binsli_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_binsli_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.binsli.w(<4 x i32> %0, <4 x i32> %1, i32 7)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_binsli_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.binsli.w(<4 x i32>, <4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_binsli_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsli_w_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsli_w_ARG2)(
+; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: binsli.w [[R3]], [[R4]], 7
+; CHECK-DAG: lw [[R5:\$[0-9]+]], %got(llvm_mips_binsli_w_RES)(
+; CHECK-DAG: st.w [[R3]], 0([[R5]])
+; CHECK: .size llvm_mips_binsli_w_test
+
+@llvm_mips_binsli_d_ARG1 = global <2 x i64> zeroinitializer, align 16
+@llvm_mips_binsli_d_ARG2 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_binsli_d_RES = global <2 x i64> zeroinitializer, align 16
+
+define void @llvm_mips_binsli_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_binsli_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_binsli_d_ARG2
+ ; TODO: We use a particularly wide mask here to work around a legalization
+ ; issue. If the mask doesn't fit within a 10-bit immediate, it gets
+ ; legalized into a constant pool. We should add a test to cover the
+ ; other cases once they correctly select binsli.d.
+ %2 = tail call <2 x i64> @llvm.mips.binsli.d(<2 x i64> %0, <2 x i64> %1, i32 61)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_binsli_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.binsli.d(<2 x i64>, <2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_binsli_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsli_d_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsli_d_ARG2)(
+; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: binsli.d [[R3]], [[R4]], 61
+; CHECK-DAG: lw [[R5:\$[0-9]+]], %got(llvm_mips_binsli_d_RES)(
+; CHECK-DAG: st.d [[R3]], 0([[R5]])
+; CHECK: .size llvm_mips_binsli_d_test
+
+@llvm_mips_binsri_b_ARG1 = global <16 x i8> zeroinitializer, align 16
+@llvm_mips_binsri_b_ARG2 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_binsri_b_RES = global <16 x i8> zeroinitializer, align 16
+
+define void @llvm_mips_binsri_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_binsri_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_binsri_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.binsri.b(<16 x i8> %0, <16 x i8> %1, i32 7)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_binsri_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.binsri.b(<16 x i8>, <16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_binsri_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsri_b_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsri_b_ARG2)(
+; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: binsri.b [[R3]], [[R4]], 7
+; CHECK-DAG: lw [[R5:\$[0-9]+]], %got(llvm_mips_binsri_b_RES)(
+; CHECK-DAG: st.b [[R3]], 0([[R5]])
+; CHECK: .size llvm_mips_binsri_b_test
+
+@llvm_mips_binsri_h_ARG1 = global <8 x i16> zeroinitializer, align 16
+@llvm_mips_binsri_h_ARG2 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_binsri_h_RES = global <8 x i16> zeroinitializer, align 16
+
+define void @llvm_mips_binsri_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_binsri_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_binsri_h_ARG2
+ %2 = tail call <8 x i16> @llvm.mips.binsri.h(<8 x i16> %0, <8 x i16> %1, i32 7)
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_binsri_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.binsri.h(<8 x i16>, <8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_binsri_h_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsri_h_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsri_h_ARG2)(
+; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.h [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: binsri.h [[R3]], [[R4]], 7
+; CHECK-DAG: lw [[R5:\$[0-9]+]], %got(llvm_mips_binsri_h_RES)(
+; CHECK-DAG: st.h [[R3]], 0([[R5]])
+; CHECK: .size llvm_mips_binsri_h_test
+
+@llvm_mips_binsri_w_ARG1 = global <4 x i32> zeroinitializer, align 16
+@llvm_mips_binsri_w_ARG2 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_binsri_w_RES = global <4 x i32> zeroinitializer, align 16
+
+define void @llvm_mips_binsri_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_binsri_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_binsri_w_ARG2
+ %2 = tail call <4 x i32> @llvm.mips.binsri.w(<4 x i32> %0, <4 x i32> %1, i32 7)
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_binsri_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.binsri.w(<4 x i32>, <4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_binsri_w_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsri_w_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsri_w_ARG2)(
+; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.w [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: binsri.w [[R3]], [[R4]], 7
+; CHECK-DAG: lw [[R5:\$[0-9]+]], %got(llvm_mips_binsri_w_RES)(
+; CHECK-DAG: st.w [[R3]], 0([[R5]])
+; CHECK: .size llvm_mips_binsri_w_test
+
+@llvm_mips_binsri_d_ARG1 = global <2 x i64> zeroinitializer, align 16
+@llvm_mips_binsri_d_ARG2 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_binsri_d_RES = global <2 x i64> zeroinitializer, align 16
+
+define void @llvm_mips_binsri_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_binsri_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_binsri_d_ARG2
+ %2 = tail call <2 x i64> @llvm.mips.binsri.d(<2 x i64> %0, <2 x i64> %1, i32 7)
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_binsri_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.binsri.d(<2 x i64>, <2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_binsri_d_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_binsri_d_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsri_d_ARG2)(
+; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.d [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: binsri.d [[R3]], [[R4]], 7
+; CHECK-DAG: lw [[R5:\$[0-9]+]], %got(llvm_mips_binsri_d_RES)(
+; CHECK-DAG: st.d [[R3]], 0([[R5]])
+; CHECK: .size llvm_mips_binsri_d_test
+
+@llvm_mips_bnegi_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bnegi_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bnegi_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_bnegi_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.bnegi.b(<16 x i8> %0, i32 7)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_bnegi_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.bnegi.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_bnegi_b_test:
+; CHECK: ld.b
+; CHECK: bnegi.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_bnegi_b_test
+;
+@llvm_mips_bnegi_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bnegi_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_bnegi_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_bnegi_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.bnegi.h(<8 x i16> %0, i32 7)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_bnegi_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.bnegi.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_bnegi_h_test:
+; CHECK: ld.h
+; CHECK: bnegi.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_bnegi_h_test
+;
+@llvm_mips_bnegi_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bnegi_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_bnegi_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_bnegi_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.bnegi.w(<4 x i32> %0, i32 7)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_bnegi_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.bnegi.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_bnegi_w_test:
+; CHECK: ld.w
+; CHECK: bnegi.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_bnegi_w_test
+;
+@llvm_mips_bnegi_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bnegi_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_bnegi_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_bnegi_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.bnegi.d(<2 x i64> %0, i32 7)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_bnegi_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.bnegi.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_bnegi_d_test:
+; CHECK: ld.d
+; CHECK: bnegi.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_bnegi_d_test
+;
+@llvm_mips_bseti_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bseti_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bseti_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_bseti_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.bseti.b(<16 x i8> %0, i32 7)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_bseti_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.bseti.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_bseti_b_test:
+; CHECK: ld.b
+; CHECK: bseti.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_bseti_b_test
+;
+@llvm_mips_bseti_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bseti_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_bseti_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_bseti_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.bseti.h(<8 x i16> %0, i32 7)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_bseti_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.bseti.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_bseti_h_test:
+; CHECK: ld.h
+; CHECK: bseti.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_bseti_h_test
+;
+@llvm_mips_bseti_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bseti_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_bseti_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_bseti_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.bseti.w(<4 x i32> %0, i32 7)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_bseti_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.bseti.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_bseti_w_test:
+; CHECK: ld.w
+; CHECK: bseti.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_bseti_w_test
+;
+@llvm_mips_bseti_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bseti_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_bseti_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_bseti_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.bseti.d(<2 x i64> %0, i32 7)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_bseti_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.bseti.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_bseti_d_test:
+; CHECK: ld.d
+; CHECK: bseti.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_bseti_d_test
+;
diff --git a/test/CodeGen/Mips/msa/i5-c.ll b/test/CodeGen/Mips/msa/i5-c.ll
new file mode 100644
index 0000000..bf1578f
--- /dev/null
+++ b/test/CodeGen/Mips/msa/i5-c.ll
@@ -0,0 +1,386 @@
+; Test the MSA intrinsics that are encoded with the I5 instruction format.
+; There are lots of these, so this file covers those beginning with 'c'.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
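+; The immediate 14 used throughout fits both the signed (s5) and unsigned (u5)
+; 5-bit encodings, so the same value works for the _s and _u variants below.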
+@llvm_mips_ceqi_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_ceqi_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_ceqi_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_ceqi_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.ceqi.b(<16 x i8> %0, i32 14)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_ceqi_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.ceqi.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_ceqi_b_test:
+; CHECK: ld.b
+; CHECK: ceqi.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_ceqi_b_test
+;
+@llvm_mips_ceqi_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_ceqi_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_ceqi_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_ceqi_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.ceqi.h(<8 x i16> %0, i32 14)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_ceqi_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.ceqi.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_ceqi_h_test:
+; CHECK: ld.h
+; CHECK: ceqi.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_ceqi_h_test
+;
+@llvm_mips_ceqi_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ceqi_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ceqi_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_ceqi_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.ceqi.w(<4 x i32> %0, i32 14)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_ceqi_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.ceqi.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_ceqi_w_test:
+; CHECK: ld.w
+; CHECK: ceqi.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_ceqi_w_test
+;
+@llvm_mips_ceqi_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_ceqi_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ceqi_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_ceqi_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.ceqi.d(<2 x i64> %0, i32 14)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_ceqi_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.ceqi.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_ceqi_d_test:
+; CHECK: ld.d
+; CHECK: ceqi.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_ceqi_d_test
+;
+@llvm_mips_clei_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_clei_s_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_clei_s_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_clei_s_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.clei.s.b(<16 x i8> %0, i32 14)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_clei_s_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.clei.s.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_clei_s_b_test:
+; CHECK: ld.b
+; CHECK: clei_s.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_clei_s_b_test
+;
+@llvm_mips_clei_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_clei_s_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_clei_s_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_clei_s_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.clei.s.h(<8 x i16> %0, i32 14)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_clei_s_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.clei.s.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_clei_s_h_test:
+; CHECK: ld.h
+; CHECK: clei_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_clei_s_h_test
+;
+@llvm_mips_clei_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_clei_s_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_clei_s_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_clei_s_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.clei.s.w(<4 x i32> %0, i32 14)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_clei_s_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.clei.s.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_clei_s_w_test:
+; CHECK: ld.w
+; CHECK: clei_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_clei_s_w_test
+;
+@llvm_mips_clei_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_clei_s_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_clei_s_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_clei_s_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.clei.s.d(<2 x i64> %0, i32 14)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_clei_s_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.clei.s.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_clei_s_d_test:
+; CHECK: ld.d
+; CHECK: clei_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_clei_s_d_test
+;
+@llvm_mips_clei_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_clei_u_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_clei_u_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_clei_u_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.clei.u.b(<16 x i8> %0, i32 14)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_clei_u_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.clei.u.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_clei_u_b_test:
+; CHECK: ld.b
+; CHECK: clei_u.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_clei_u_b_test
+;
+@llvm_mips_clei_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_clei_u_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_clei_u_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_clei_u_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.clei.u.h(<8 x i16> %0, i32 14)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_clei_u_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.clei.u.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_clei_u_h_test:
+; CHECK: ld.h
+; CHECK: clei_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_clei_u_h_test
+;
+@llvm_mips_clei_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_clei_u_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_clei_u_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_clei_u_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.clei.u.w(<4 x i32> %0, i32 14)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_clei_u_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.clei.u.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_clei_u_w_test:
+; CHECK: ld.w
+; CHECK: clei_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_clei_u_w_test
+;
+@llvm_mips_clei_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_clei_u_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_clei_u_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_clei_u_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.clei.u.d(<2 x i64> %0, i32 14)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_clei_u_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.clei.u.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_clei_u_d_test:
+; CHECK: ld.d
+; CHECK: clei_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_clei_u_d_test
+;
+@llvm_mips_clti_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_clti_s_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_clti_s_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_clti_s_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.clti.s.b(<16 x i8> %0, i32 14)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_clti_s_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.clti.s.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_clti_s_b_test:
+; CHECK: ld.b
+; CHECK: clti_s.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_clti_s_b_test
+;
+@llvm_mips_clti_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_clti_s_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_clti_s_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_clti_s_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.clti.s.h(<8 x i16> %0, i32 14)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_clti_s_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.clti.s.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_clti_s_h_test:
+; CHECK: ld.h
+; CHECK: clti_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_clti_s_h_test
+;
+@llvm_mips_clti_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_clti_s_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_clti_s_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_clti_s_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.clti.s.w(<4 x i32> %0, i32 14)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_clti_s_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.clti.s.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_clti_s_w_test:
+; CHECK: ld.w
+; CHECK: clti_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_clti_s_w_test
+;
+@llvm_mips_clti_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_clti_s_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_clti_s_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_clti_s_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.clti.s.d(<2 x i64> %0, i32 14)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_clti_s_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.clti.s.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_clti_s_d_test:
+; CHECK: ld.d
+; CHECK: clti_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_clti_s_d_test
+;
+@llvm_mips_clti_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_clti_u_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_clti_u_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_clti_u_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.clti.u.b(<16 x i8> %0, i32 14)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_clti_u_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.clti.u.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_clti_u_b_test:
+; CHECK: ld.b
+; CHECK: clti_u.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_clti_u_b_test
+;
+@llvm_mips_clti_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_clti_u_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_clti_u_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_clti_u_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.clti.u.h(<8 x i16> %0, i32 14)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_clti_u_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.clti.u.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_clti_u_h_test:
+; CHECK: ld.h
+; CHECK: clti_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_clti_u_h_test
+;
+@llvm_mips_clti_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_clti_u_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_clti_u_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_clti_u_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.clti.u.w(<4 x i32> %0, i32 14)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_clti_u_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.clti.u.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_clti_u_w_test:
+; CHECK: ld.w
+; CHECK: clti_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_clti_u_w_test
+;
+@llvm_mips_clti_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_clti_u_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_clti_u_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_clti_u_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.clti.u.d(<2 x i64> %0, i32 14)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_clti_u_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.clti.u.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_clti_u_d_test:
+; CHECK: ld.d
+; CHECK: clti_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_clti_u_d_test
+;
diff --git a/test/CodeGen/Mips/msa/i5-m.ll b/test/CodeGen/Mips/msa/i5-m.ll
new file mode 100644
index 0000000..2766349
--- /dev/null
+++ b/test/CodeGen/Mips/msa/i5-m.ll
@@ -0,0 +1,310 @@
+; Test the MSA intrinsics that are encoded with the I5 instruction format.
+; There are lots of these, so this file covers those beginning with 'm'.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_maxi_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_maxi_s_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_maxi_s_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_maxi_s_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.maxi.s.b(<16 x i8> %0, i32 14)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_maxi_s_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.maxi.s.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_maxi_s_b_test:
+; CHECK: ld.b
+; CHECK: maxi_s.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_maxi_s_b_test
+;
+@llvm_mips_maxi_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_maxi_s_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_maxi_s_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_maxi_s_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.maxi.s.h(<8 x i16> %0, i32 14)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_maxi_s_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.maxi.s.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_maxi_s_h_test:
+; CHECK: ld.h
+; CHECK: maxi_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_maxi_s_h_test
+;
+@llvm_mips_maxi_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_maxi_s_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_maxi_s_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_maxi_s_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.maxi.s.w(<4 x i32> %0, i32 14)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_maxi_s_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.maxi.s.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_maxi_s_w_test:
+; CHECK: ld.w
+; CHECK: maxi_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_maxi_s_w_test
+;
+@llvm_mips_maxi_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_maxi_s_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_maxi_s_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_maxi_s_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.maxi.s.d(<2 x i64> %0, i32 14)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_maxi_s_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.maxi.s.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_maxi_s_d_test:
+; CHECK: ld.d
+; CHECK: maxi_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_maxi_s_d_test
+;
+@llvm_mips_maxi_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_maxi_u_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_maxi_u_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_maxi_u_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.maxi.u.b(<16 x i8> %0, i32 14)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_maxi_u_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.maxi.u.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_maxi_u_b_test:
+; CHECK: ld.b
+; CHECK: maxi_u.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_maxi_u_b_test
+;
+@llvm_mips_maxi_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_maxi_u_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_maxi_u_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_maxi_u_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.maxi.u.h(<8 x i16> %0, i32 14)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_maxi_u_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.maxi.u.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_maxi_u_h_test:
+; CHECK: ld.h
+; CHECK: maxi_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_maxi_u_h_test
+;
+@llvm_mips_maxi_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_maxi_u_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_maxi_u_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_maxi_u_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.maxi.u.w(<4 x i32> %0, i32 14)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_maxi_u_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.maxi.u.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_maxi_u_w_test:
+; CHECK: ld.w
+; CHECK: maxi_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_maxi_u_w_test
+;
+@llvm_mips_maxi_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_maxi_u_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_maxi_u_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_maxi_u_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.maxi.u.d(<2 x i64> %0, i32 14)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_maxi_u_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.maxi.u.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_maxi_u_d_test:
+; CHECK: ld.d
+; CHECK: maxi_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_maxi_u_d_test
+;
+@llvm_mips_mini_s_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_mini_s_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_mini_s_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_mini_s_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.mini.s.b(<16 x i8> %0, i32 14)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_mini_s_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.mini.s.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_mini_s_b_test:
+; CHECK: ld.b
+; CHECK: mini_s.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_mini_s_b_test
+;
+@llvm_mips_mini_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_mini_s_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_mini_s_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_mini_s_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.mini.s.h(<8 x i16> %0, i32 14)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_mini_s_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.mini.s.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_mini_s_h_test:
+; CHECK: ld.h
+; CHECK: mini_s.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_mini_s_h_test
+;
+@llvm_mips_mini_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_mini_s_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_mini_s_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_mini_s_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.mini.s.w(<4 x i32> %0, i32 14)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_mini_s_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.mini.s.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_mini_s_w_test:
+; CHECK: ld.w
+; CHECK: mini_s.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_mini_s_w_test
+;
+@llvm_mips_mini_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_mini_s_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_mini_s_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_mini_s_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.mini.s.d(<2 x i64> %0, i32 14)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_mini_s_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.mini.s.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_mini_s_d_test:
+; CHECK: ld.d
+; CHECK: mini_s.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_mini_s_d_test
+;
+@llvm_mips_mini_u_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_mini_u_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_mini_u_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_mini_u_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.mini.u.b(<16 x i8> %0, i32 14)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_mini_u_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.mini.u.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_mini_u_b_test:
+; CHECK: ld.b
+; CHECK: mini_u.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_mini_u_b_test
+;
+@llvm_mips_mini_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_mini_u_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_mini_u_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_mini_u_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.mini.u.h(<8 x i16> %0, i32 14)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_mini_u_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.mini.u.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_mini_u_h_test:
+; CHECK: ld.h
+; CHECK: mini_u.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_mini_u_h_test
+;
+@llvm_mips_mini_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_mini_u_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_mini_u_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_mini_u_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.mini.u.w(<4 x i32> %0, i32 14)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_mini_u_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.mini.u.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_mini_u_w_test:
+; CHECK: ld.w
+; CHECK: mini_u.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_mini_u_w_test
+;
+@llvm_mips_mini_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_mini_u_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_mini_u_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_mini_u_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.mini.u.d(<2 x i64> %0, i32 14)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_mini_u_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.mini.u.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_mini_u_d_test:
+; CHECK: ld.d
+; CHECK: mini_u.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_mini_u_d_test
+;
diff --git a/test/CodeGen/Mips/msa/i5-s.ll b/test/CodeGen/Mips/msa/i5-s.ll
new file mode 100644
index 0000000..184172f
--- /dev/null
+++ b/test/CodeGen/Mips/msa/i5-s.ll
@@ -0,0 +1,82 @@
+; Test the MSA intrinsics that are encoded with the I5 instruction format.
+; There are lots of these, so this file covers those beginning with 's'.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_subvi_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_subvi_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_subvi_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_subvi_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.subvi.b(<16 x i8> %0, i32 14)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_subvi_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.subvi.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_subvi_b_test:
+; CHECK: ld.b
+; CHECK: subvi.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_subvi_b_test
+;
+@llvm_mips_subvi_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_subvi_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_subvi_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_subvi_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.subvi.h(<8 x i16> %0, i32 14)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_subvi_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.subvi.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_subvi_h_test:
+; CHECK: ld.h
+; CHECK: subvi.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_subvi_h_test
+;
+@llvm_mips_subvi_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_subvi_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_subvi_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_subvi_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.subvi.w(<4 x i32> %0, i32 14)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_subvi_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.subvi.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_subvi_w_test:
+; CHECK: ld.w
+; CHECK: subvi.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_subvi_w_test
+;
+@llvm_mips_subvi_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_subvi_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_subvi_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_subvi_d_ARG1
+ %1 = tail call <2 x i64> @llvm.mips.subvi.d(<2 x i64> %0, i32 14)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_subvi_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.subvi.d(<2 x i64>, i32) nounwind
+
+; CHECK: llvm_mips_subvi_d_test:
+; CHECK: ld.d
+; CHECK: subvi.d
+; CHECK: st.d
+; CHECK: .size llvm_mips_subvi_d_test
+;
diff --git a/test/CodeGen/Mips/msa/i5_ld_st.ll b/test/CodeGen/Mips/msa/i5_ld_st.ll
new file mode 100644
index 0000000..7cc55f2
--- /dev/null
+++ b/test/CodeGen/Mips/msa/i5_ld_st.ll
@@ -0,0 +1,150 @@
+; Test the MSA intrinsics that are encoded with the I5 instruction format and
+; are loads or stores.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
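+; The second operand of the ld/st intrinsics is a byte offset from the pointer;
+; the checks below confirm that the constant 16 is folded into the offset field
+; of the ld.df/st.df instruction rather than being added separately.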
+@llvm_mips_ld_b_ARG = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_ld_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_ld_b_test() nounwind {
+entry:
+ %0 = bitcast <16 x i8>* @llvm_mips_ld_b_ARG to i8*
+ %1 = tail call <16 x i8> @llvm.mips.ld.b(i8* %0, i32 16)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_ld_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.ld.b(i8*, i32) nounwind
+
+; CHECK: llvm_mips_ld_b_test:
+; CHECK: ld.b [[R1:\$w[0-9]+]], 16(
+; CHECK: st.b
+; CHECK: .size llvm_mips_ld_b_test
+;
+@llvm_mips_ld_h_ARG = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_ld_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_ld_h_test() nounwind {
+entry:
+ %0 = bitcast <8 x i16>* @llvm_mips_ld_h_ARG to i8*
+ %1 = tail call <8 x i16> @llvm.mips.ld.h(i8* %0, i32 16)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_ld_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.ld.h(i8*, i32) nounwind
+
+; CHECK: llvm_mips_ld_h_test:
+; CHECK: ld.h [[R1:\$w[0-9]+]], 16(
+; CHECK: st.h
+; CHECK: .size llvm_mips_ld_h_test
+;
+@llvm_mips_ld_w_ARG = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_ld_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_ld_w_test() nounwind {
+entry:
+ %0 = bitcast <4 x i32>* @llvm_mips_ld_w_ARG to i8*
+ %1 = tail call <4 x i32> @llvm.mips.ld.w(i8* %0, i32 16)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_ld_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.ld.w(i8*, i32) nounwind
+
+; CHECK: llvm_mips_ld_w_test:
+; CHECK: ld.w [[R1:\$w[0-9]+]], 16(
+; CHECK: st.w
+; CHECK: .size llvm_mips_ld_w_test
+;
+@llvm_mips_ld_d_ARG = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_ld_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_ld_d_test() nounwind {
+entry:
+ %0 = bitcast <2 x i64>* @llvm_mips_ld_d_ARG to i8*
+ %1 = tail call <2 x i64> @llvm.mips.ld.d(i8* %0, i32 16)
+ store <2 x i64> %1, <2 x i64>* @llvm_mips_ld_d_RES
+ ret void
+}
+
+declare <2 x i64> @llvm.mips.ld.d(i8*, i32) nounwind
+
+; CHECK: llvm_mips_ld_d_test:
+; CHECK: ld.d [[R1:\$w[0-9]+]], 16(
+; CHECK: st.d
+; CHECK: .size llvm_mips_ld_d_test
+;
+@llvm_mips_st_b_ARG = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_st_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_st_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_st_b_ARG
+ %1 = bitcast <16 x i8>* @llvm_mips_st_b_RES to i8*
+ tail call void @llvm.mips.st.b(<16 x i8> %0, i8* %1, i32 16)
+ ret void
+}
+
+declare void @llvm.mips.st.b(<16 x i8>, i8*, i32) nounwind
+
+; CHECK: llvm_mips_st_b_test:
+; CHECK: ld.b
+; CHECK: st.b [[R1:\$w[0-9]+]], 16(
+; CHECK: .size llvm_mips_st_b_test
+;
+@llvm_mips_st_h_ARG = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_st_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_st_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_st_h_ARG
+ %1 = bitcast <8 x i16>* @llvm_mips_st_h_RES to i8*
+ tail call void @llvm.mips.st.h(<8 x i16> %0, i8* %1, i32 16)
+ ret void
+}
+
+declare void @llvm.mips.st.h(<8 x i16>, i8*, i32) nounwind
+
+; CHECK: llvm_mips_st_h_test:
+; CHECK: ld.h
+; CHECK: st.h [[R1:\$w[0-9]+]], 16(
+; CHECK: .size llvm_mips_st_h_test
+;
+@llvm_mips_st_w_ARG = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_st_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_st_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_st_w_ARG
+ %1 = bitcast <4 x i32>* @llvm_mips_st_w_RES to i8*
+ tail call void @llvm.mips.st.w(<4 x i32> %0, i8* %1, i32 16)
+ ret void
+}
+
+declare void @llvm.mips.st.w(<4 x i32>, i8*, i32) nounwind
+
+; CHECK: llvm_mips_st_w_test:
+; CHECK: ld.w
+; CHECK: st.w [[R1:\$w[0-9]+]], 16(
+; CHECK: .size llvm_mips_st_w_test
+;
+@llvm_mips_st_d_ARG = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_st_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_st_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_st_d_ARG
+ %1 = bitcast <2 x i64>* @llvm_mips_st_d_RES to i8*
+ tail call void @llvm.mips.st.d(<2 x i64> %0, i8* %1, i32 16)
+ ret void
+}
+
+declare void @llvm.mips.st.d(<2 x i64>, i8*, i32) nounwind
+
+; CHECK: llvm_mips_st_d_test:
+; CHECK: ld.d
+; CHECK: st.d [[R1:\$w[0-9]+]], 16(
+; CHECK: .size llvm_mips_st_d_test
+;
diff --git a/test/CodeGen/Mips/msa/i8.ll b/test/CodeGen/Mips/msa/i8.ll
new file mode 100644
index 0000000..d2931a7
--- /dev/null
+++ b/test/CodeGen/Mips/msa/i8.ll
@@ -0,0 +1,211 @@
+; Test the MSA intrinsics that are encoded with the I8 instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_andi_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_andi_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_andi_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_andi_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.andi.b(<16 x i8> %0, i32 25)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_andi_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.andi.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_andi_b_test:
+; CHECK: ld.b
+; CHECK: andi.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_andi_b_test
+
+@llvm_mips_bmnzi_b_ARG1 = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+@llvm_mips_bmnzi_b_ARG2 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bmnzi_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bmnzi_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_bmnzi_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_bmnzi_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %0, <16 x i8> %1, i32 25)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_bmnzi_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.bmnzi.b(<16 x i8>, <16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_bmnzi_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bmnzi_b_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bmnzi_b_ARG2)(
+; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: bmnzi.b [[R3]], [[R4]], 25
+; CHECK-DAG: st.b [[R3]], 0(
+; CHECK: .size llvm_mips_bmnzi_b_test
+
+@llvm_mips_bmzi_b_ARG1 = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+@llvm_mips_bmzi_b_ARG2 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bmzi_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bmzi_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_bmzi_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_bmzi_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %0, <16 x i8> %1, i32 25)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_bmzi_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.bmzi.b(<16 x i8>, <16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_bmzi_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bmzi_b_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bmzi_b_ARG2)(
+; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R2]])
+; bmnzi.b is the same as bmzi.b with ws and wd_in swapped (see the note after
+; this test's CHECK lines)
+; CHECK-DAG: bmnzi.b [[R4]], [[R3]], 25
+; CHECK-DAG: st.b [[R4]], 0(
+; CHECK: .size llvm_mips_bmzi_b_test
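+; Note (added clarification, assuming the usual MSA bit-move semantics, which
+; are not restated in this file): bmnzi.b computes wd = (ws & imm) | (wd & ~imm)
+; and bmzi.b computes wd = (ws & ~imm) | (wd & imm), so bmzi(a, b, imm) is the
+; same value as bmnzi(b, a, imm). That is why the bmzi test above is allowed to
+; match a bmnzi.b instruction with the two vector operands exchanged.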
+
+@llvm_mips_bseli_b_ARG1 = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+@llvm_mips_bseli_b_ARG2 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bseli_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bseli_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_bseli_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_bseli_b_ARG2
+ %2 = tail call <16 x i8> @llvm.mips.bseli.b(<16 x i8> %0, <16 x i8> %1, i32 25)
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_bseli_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.bseli.b(<16 x i8>, <16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_bseli_b_test:
+; CHECK-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bseli_b_ARG1)(
+; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bseli_b_ARG2)(
+; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[R1]])
+; CHECK-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R2]])
+; CHECK-DAG: bseli.b [[R3]], [[R4]], 25
+; CHECK-DAG: st.b [[R3]], 0(
+; CHECK: .size llvm_mips_bseli_b_test
+
+@llvm_mips_nori_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_nori_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_nori_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_nori_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.nori.b(<16 x i8> %0, i32 25)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_nori_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.nori.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_nori_b_test:
+; CHECK: ld.b
+; CHECK: nori.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_nori_b_test
+;
+@llvm_mips_ori_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_ori_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_ori_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_ori_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.ori.b(<16 x i8> %0, i32 25)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_ori_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.ori.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_ori_b_test:
+; CHECK: ld.b
+; CHECK: ori.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_ori_b_test
+;
+@llvm_mips_shf_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_shf_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_shf_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_shf_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.shf.b(<16 x i8> %0, i32 25)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_shf_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.shf.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_shf_b_test:
+; CHECK: ld.b
+; CHECK: shf.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_shf_b_test
+;
+@llvm_mips_shf_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_shf_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_shf_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_shf_h_ARG1
+ %1 = tail call <8 x i16> @llvm.mips.shf.h(<8 x i16> %0, i32 25)
+ store <8 x i16> %1, <8 x i16>* @llvm_mips_shf_h_RES
+ ret void
+}
+
+declare <8 x i16> @llvm.mips.shf.h(<8 x i16>, i32) nounwind
+
+; CHECK: llvm_mips_shf_h_test:
+; CHECK: ld.h
+; CHECK: shf.h
+; CHECK: st.h
+; CHECK: .size llvm_mips_shf_h_test
+;
+@llvm_mips_shf_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_shf_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_shf_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_shf_w_ARG1
+ %1 = tail call <4 x i32> @llvm.mips.shf.w(<4 x i32> %0, i32 25)
+ store <4 x i32> %1, <4 x i32>* @llvm_mips_shf_w_RES
+ ret void
+}
+
+declare <4 x i32> @llvm.mips.shf.w(<4 x i32>, i32) nounwind
+
+; CHECK: llvm_mips_shf_w_test:
+; CHECK: ld.w
+; CHECK: shf.w
+; CHECK: st.w
+; CHECK: .size llvm_mips_shf_w_test
+;
+@llvm_mips_xori_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_xori_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_xori_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_xori_b_ARG1
+ %1 = tail call <16 x i8> @llvm.mips.xori.b(<16 x i8> %0, i32 25)
+ store <16 x i8> %1, <16 x i8>* @llvm_mips_xori_b_RES
+ ret void
+}
+
+declare <16 x i8> @llvm.mips.xori.b(<16 x i8>, i32) nounwind
+
+; CHECK: llvm_mips_xori_b_test:
+; CHECK: ld.b
+; CHECK: xori.b
+; CHECK: st.b
+; CHECK: .size llvm_mips_xori_b_test
+;
diff --git a/test/CodeGen/Mips/msa/inline-asm.ll b/test/CodeGen/Mips/msa/inline-asm.ll
new file mode 100644
index 0000000..4a34273
--- /dev/null
+++ b/test/CodeGen/Mips/msa/inline-asm.ll
@@ -0,0 +1,34 @@
+; A basic inline assembly test
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
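+
+; Note (added clarification, an assumption about the constraint/modifier pair
+; used below): the "f" constraint requests a floating-point register, which
+; with MSA enabled overlaps the 128-bit $w registers, and the "${N:w}" operand
+; modifier prints that register in its $w form, as the CHECK lines expect.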
+
+@v4i32_r = global <4 x i32> zeroinitializer, align 16
+
+define void @test1() nounwind {
+entry:
+ ; CHECK-LABEL: test1:
+ %0 = call <4 x i32> asm "ldi.w ${0:w}, 1", "=f"()
+ ; CHECK: ldi.w $w{{[1-3]?[0-9]}}, 1
+ store <4 x i32> %0, <4 x i32>* @v4i32_r
+ ret void
+}
+
+define void @test2() nounwind {
+entry:
+ ; CHECK-LABEL: test2:
+ %0 = load <4 x i32>* @v4i32_r
+ %1 = call <4 x i32> asm "addvi.w ${0:w}, ${1:w}, 1", "=f,f"(<4 x i32> %0)
+ ; CHECK: addvi.w $w{{[1-3]?[0-9]}}, $w{{[1-3]?[0-9]}}, 1
+ store <4 x i32> %1, <4 x i32>* @v4i32_r
+ ret void
+}
+
+define void @test3() nounwind {
+entry:
+ ; CHECK-LABEL: test3:
+ %0 = load <4 x i32>* @v4i32_r
+ %1 = call <4 x i32> asm sideeffect "addvi.w ${0:w}, ${1:w}, 1", "=f,f,~{$w0}"(<4 x i32> %0)
+ ; CHECK: addvi.w $w{{([1-9]|[1-3][0-9])}}, $w{{([1-9]|[1-3][0-9])}}, 1
+ store <4 x i32> %1, <4 x i32>* @v4i32_r
+ ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s1704963983.ll b/test/CodeGen/Mips/msa/llvm-stress-s1704963983.ll
new file mode 100644
index 0000000..4beaaa9
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-s1704963983.ll
@@ -0,0 +1,134 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test originally failed for MSA with a
+; "Unexpected illegal type!" assertion.
+; It should at least successfully build.
+
+define void @autogen_SD1704963983(i8*, i32*, i64*, i32, i64, i8) {
+BB:
+ %A4 = alloca <4 x double>
+ %A3 = alloca <8 x i64>
+ %A2 = alloca <1 x double>
+ %A1 = alloca double
+ %A = alloca i32
+ %L = load i8* %0
+ store i8 77, i8* %0
+ %E = extractelement <8 x i64> zeroinitializer, i32 2
+ %Shuff = shufflevector <8 x i64> zeroinitializer, <8 x i64> zeroinitializer, <8 x i32> <i32 5, i32 7, i32 undef, i32 undef, i32 13, i32 15, i32 1, i32 3>
+ %I = insertelement <8 x i64> zeroinitializer, i64 %E, i32 7
+ %Sl = select i1 false, i8* %0, i8* %0
+ %Cmp = icmp eq i32 434069, 272505
+ br label %CF
+
+CF: ; preds = %CF, %CF78, %BB
+ %L5 = load i8* %Sl
+ store i8 %L, i8* %Sl
+ %E6 = extractelement <8 x i32> zeroinitializer, i32 2
+ %Shuff7 = shufflevector <8 x i64> zeroinitializer, <8 x i64> %Shuff, <8 x i32> <i32 13, i32 15, i32 1, i32 3, i32 5, i32 7, i32 9, i32 undef>
+ %I8 = insertelement <8 x i64> zeroinitializer, i64 %4, i32 7
+ %B = shl <1 x i16> zeroinitializer, zeroinitializer
+ %FC = sitofp <8 x i64> zeroinitializer to <8 x float>
+ %Sl9 = select i1 %Cmp, i8 77, i8 77
+ %Cmp10 = icmp uge <8 x i64> %Shuff, zeroinitializer
+ %L11 = load i8* %0
+ store i8 %Sl9, i8* %0
+ %E12 = extractelement <1 x i16> zeroinitializer, i32 0
+ %Shuff13 = shufflevector <8 x i64> zeroinitializer, <8 x i64> %Shuff, <8 x i32> <i32 9, i32 11, i32 13, i32 15, i32 undef, i32 3, i32 5, i32 7>
+ %I14 = insertelement <4 x i32> zeroinitializer, i32 %3, i32 3
+ %B15 = udiv <1 x i16> %B, zeroinitializer
+ %Tr = trunc <8 x i64> %Shuff to <8 x i32>
+ %Sl16 = select i1 %Cmp, i8 77, i8 %5
+ %Cmp17 = icmp ult <8 x i1> %Cmp10, %Cmp10
+ %L18 = load i8* %Sl
+ store i8 -1, i8* %Sl
+ %E19 = extractelement <8 x i32> zeroinitializer, i32 3
+ %Shuff20 = shufflevector <8 x float> %FC, <8 x float> %FC, <8 x i32> <i32 6, i32 8, i32 undef, i32 12, i32 14, i32 0, i32 2, i32 undef>
+ %I21 = insertelement <8 x i64> %Shuff13, i64 %E, i32 0
+ %B22 = urem <8 x i64> %Shuff7, %I21
+ %FC23 = sitofp i32 50347 to float
+ %Sl24 = select i1 %Cmp, double 0.000000e+00, double 0.000000e+00
+ %Cmp25 = icmp ugt i32 465489, 47533
+ br i1 %Cmp25, label %CF, label %CF78
+
+CF78: ; preds = %CF
+ %L26 = load i8* %Sl
+ store i32 50347, i32* %A
+ %E27 = extractelement <8 x i1> %Cmp10, i32 2
+ br i1 %E27, label %CF, label %CF77
+
+CF77: ; preds = %CF77, %CF81, %CF78
+ %Shuff28 = shufflevector <8 x i64> zeroinitializer, <8 x i64> %Shuff, <8 x i32> <i32 13, i32 15, i32 1, i32 3, i32 5, i32 7, i32 9, i32 undef>
+ %I29 = insertelement <1 x i16> zeroinitializer, i16 -1, i32 0
+ %B30 = urem <8 x i32> %Tr, zeroinitializer
+ %Tr31 = trunc i32 0 to i16
+ %Sl32 = select i1 %Cmp, <2 x i1> zeroinitializer, <2 x i1> zeroinitializer
+ %L33 = load i8* %Sl
+ store i8 %L26, i8* %Sl
+ %E34 = extractelement <4 x i32> zeroinitializer, i32 0
+ %Shuff35 = shufflevector <1 x i16> zeroinitializer, <1 x i16> %B, <1 x i32> undef
+ %I36 = insertelement <8 x i64> %Shuff28, i64 %E, i32 7
+ %B37 = srem <1 x i16> %I29, zeroinitializer
+ %FC38 = sitofp <8 x i32> %B30 to <8 x double>
+ %Sl39 = select i1 %Cmp, double 0.000000e+00, double %Sl24
+ %L40 = load i8* %Sl
+ store i8 %Sl16, i8* %Sl
+ %E41 = extractelement <1 x i16> zeroinitializer, i32 0
+ %Shuff42 = shufflevector <8 x i1> %Cmp17, <8 x i1> %Cmp10, <8 x i32> <i32 14, i32 undef, i32 2, i32 4, i32 undef, i32 8, i32 10, i32 12>
+ %I43 = insertelement <4 x i32> zeroinitializer, i32 272505, i32 0
+ %B44 = urem <8 x i32> %B30, %Tr
+ %PC = bitcast i8* %0 to i64*
+ %Sl45 = select i1 %Cmp, <8 x i1> %Cmp10, <8 x i1> %Shuff42
+ %Cmp46 = fcmp ugt float 0xB856238A00000000, 0x47DA795E40000000
+ br i1 %Cmp46, label %CF77, label %CF80
+
+CF80: ; preds = %CF80, %CF77
+ %L47 = load i64* %PC
+ store i8 77, i8* %Sl
+ %E48 = extractelement <8 x i64> zeroinitializer, i32 2
+ %Shuff49 = shufflevector <8 x i64> zeroinitializer, <8 x i64> %Shuff7, <8 x i32> <i32 5, i32 7, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 3>
+ %I50 = insertelement <8 x i64> zeroinitializer, i64 %L47, i32 7
+ %B51 = fdiv float 0x46CC2D8000000000, %FC23
+ %PC52 = bitcast <8 x i64>* %A3 to i64*
+ %Sl53 = select i1 %Cmp, <8 x i64> %Shuff, <8 x i64> %Shuff
+ %Cmp54 = fcmp ole float 0x47DA795E40000000, 0xB856238A00000000
+ br i1 %Cmp54, label %CF80, label %CF81
+
+CF81: ; preds = %CF80
+ %L55 = load i8* %Sl
+ store i8 %Sl16, i8* %Sl
+ %E56 = extractelement <1 x i16> %B, i32 0
+ %Shuff57 = shufflevector <1 x i16> zeroinitializer, <1 x i16> zeroinitializer, <1 x i32> <i32 1>
+ %I58 = insertelement <8 x i64> zeroinitializer, i64 %L47, i32 7
+ %B59 = srem i32 %E19, %E19
+ %Sl60 = select i1 %Cmp, i8 77, i8 77
+ %Cmp61 = icmp ult <1 x i16> zeroinitializer, %B
+ %L62 = load i8* %Sl
+ store i64 %L47, i64* %PC52
+ %E63 = extractelement <4 x i32> %I43, i32 2
+ %Shuff64 = shufflevector <4 x i1> zeroinitializer, <4 x i1> zeroinitializer, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
+ %I65 = insertelement <8 x i64> %B22, i64 %L47, i32 7
+ %B66 = add <8 x i64> %I50, %I65
+ %FC67 = uitofp i16 %E12 to float
+ %Sl68 = select i1 %Cmp, <8 x i32> %B30, <8 x i32> zeroinitializer
+ %Cmp69 = fcmp ord double 0.000000e+00, 0.000000e+00
+ br i1 %Cmp69, label %CF77, label %CF79
+
+CF79: ; preds = %CF81
+ %L70 = load i32* %A
+ store i64 %4, i64* %PC
+ %E71 = extractelement <4 x i32> zeroinitializer, i32 0
+ %Shuff72 = shufflevector <8 x i32> zeroinitializer, <8 x i32> %B44, <8 x i32> <i32 11, i32 undef, i32 15, i32 1, i32 3, i32 undef, i32 7, i32 9>
+ %I73 = insertelement <8 x i16> zeroinitializer, i16 %E12, i32 5
+ %B74 = fsub double 0.000000e+00, 0.000000e+00
+ %Sl75 = select i1 %Cmp46, i32 %E6, i32 %E19
+ %Cmp76 = icmp ugt <4 x i32> %I43, zeroinitializer
+ store i8 %L, i8* %Sl
+ store i64 %L47, i64* %PC
+ store i64 %L47, i64* %PC
+ store i8 %L5, i8* %Sl
+ store i8 %L5, i8* %0
+ ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s1935737938.ll b/test/CodeGen/Mips/msa/llvm-stress-s1935737938.ll
new file mode 100644
index 0000000..f9cab03
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-s1935737938.ll
@@ -0,0 +1,138 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test originally failed for MSA with a
+; `Opc && "Cannot copy registers"' assertion.
+; It should at least successfully build.
+
+define void @autogen_SD1935737938(i8*, i32*, i64*, i32, i64, i8) {
+BB:
+ %A4 = alloca i64
+ %A3 = alloca <4 x i32>
+ %A2 = alloca i64
+ %A1 = alloca i32
+ %A = alloca <2 x i64>
+ %L = load i8* %0
+ store i8 -1, i8* %0
+ %E = extractelement <2 x i32> zeroinitializer, i32 0
+ %Shuff = shufflevector <2 x i32> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> <i32 1, i32 3>
+ %I = insertelement <1 x i64> <i64 -1>, i64 286689, i32 0
+ %B = lshr i8 %L, -69
+ %ZE = fpext float 0xBF2AA5FE80000000 to double
+ %Sl = select i1 true, <1 x i64> <i64 -1>, <1 x i64> <i64 -1>
+ %L5 = load i8* %0
+ store i8 -69, i8* %0
+ %E6 = extractelement <16 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, i32 14
+ %Shuff7 = shufflevector <2 x i32> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> <i32 1, i32 3>
+ %I8 = insertelement <2 x i32> zeroinitializer, i32 135673, i32 1
+ %B9 = udiv i8 %B, %B
+ %FC = uitofp i32 %3 to double
+ %Sl10 = select i1 true, <1 x i1> zeroinitializer, <1 x i1> zeroinitializer
+ %Cmp = icmp ne <1 x i64> %I, <i64 -1>
+ %L11 = load i8* %0
+ store i8 %L11, i8* %0
+ %E12 = extractelement <1 x i64> <i64 -1>, i32 0
+ %Shuff13 = shufflevector <1 x i64> %Sl, <1 x i64> <i64 -1>, <1 x i32> <i32 1>
+ %I14 = insertelement <1 x i64> %I, i64 303290, i32 0
+ %B15 = frem float 0.000000e+00, 0.000000e+00
+ %Sl16 = select i1 true, <1 x i1> %Cmp, <1 x i1> zeroinitializer
+ %Cmp17 = fcmp one float 0xBD946F9840000000, %B15
+ br label %CF74
+
+CF74: ; preds = %CF74, %CF80, %CF76, %BB
+ %L18 = load i8* %0
+ store i8 -69, i8* %0
+ %E19 = extractelement <1 x i64> %Sl, i32 0
+ %Shuff20 = shufflevector <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i32> <i32 12, i32 14, i32 0, i32 2, i32 4, i32 6, i32 8, i32 10>
+ %I21 = insertelement <2 x i32> %Shuff, i32 135673, i32 0
+ %B22 = urem i32 135673, %3
+ %FC23 = sitofp i8 %L to float
+ %Sl24 = select i1 true, i8 %B, i8 %L18
+ %L25 = load i8* %0
+ store i8 %L, i8* %0
+ %E26 = extractelement <2 x i32> %Shuff, i32 1
+ %Shuff27 = shufflevector <2 x i32> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> <i32 2, i32 0>
+ %I28 = insertelement <16 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, i64 %E12, i32 8
+ %B29 = frem double %ZE, 0x235104F0E94F406E
+ %Tr = trunc i64 286689 to i8
+ %Sl30 = select i1 true, float 0x45B13EA500000000, float %B15
+ %Cmp31 = icmp eq i32 %B22, %B22
+ br i1 %Cmp31, label %CF74, label %CF80
+
+CF80: ; preds = %CF74
+ %L32 = load i8* %0
+ store i8 -1, i8* %0
+ %E33 = extractelement <2 x i32> zeroinitializer, i32 1
+ %Shuff34 = shufflevector <1 x i64> %Shuff13, <1 x i64> <i64 -1>, <1 x i32> zeroinitializer
+ %I35 = insertelement <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, i8 -1, i32 0
+ %FC36 = sitofp <1 x i1> %Cmp to <1 x float>
+ %Sl37 = select i1 true, <8 x i8> %Shuff20, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %Cmp38 = icmp sgt <2 x i32> %I21, %Shuff27
+ %L39 = load i8* %0
+ store i8 %Sl24, i8* %0
+ %E40 = extractelement <8 x i64> zeroinitializer, i32 1
+ %Shuff41 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %Cmp38, <2 x i32> <i32 0, i32 2>
+ %I42 = insertelement <4 x i32> zeroinitializer, i32 414573, i32 2
+ %B43 = srem i8 %L5, %L39
+ %Sl44 = select i1 %Cmp17, i8 %L, i8 %L
+ %Cmp45 = fcmp une float 0x3AFCE1A0C0000000, 0.000000e+00
+ br i1 %Cmp45, label %CF74, label %CF76
+
+CF76: ; preds = %CF80
+ %L46 = load i8* %0
+ store i8 %L39, i8* %0
+ %E47 = extractelement <2 x i32> %Shuff27, i32 0
+ %Shuff48 = shufflevector <1 x i1> %Sl10, <1 x i1> %Sl10, <1 x i32> <i32 1>
+ %I49 = insertelement <1 x i64> <i64 -1>, i64 %E12, i32 0
+ %FC50 = fptosi double 0x235104F0E94F406E to i32
+ %Sl51 = select i1 %Cmp17, <16 x i64> %I28, <16 x i64> %I28
+ %Cmp52 = icmp ne i8 %Tr, %Sl24
+ br i1 %Cmp52, label %CF74, label %CF75
+
+CF75: ; preds = %CF75, %CF76
+ %L53 = load i8* %0
+ store i8 %L18, i8* %0
+ %E54 = extractelement <8 x i8> %Shuff20, i32 5
+ %Shuff55 = shufflevector <2 x i32> %Shuff, <2 x i32> zeroinitializer, <2 x i32> <i32 0, i32 2>
+ %I56 = insertelement <4 x i32> %I42, i32 %B22, i32 2
+ %B57 = sub i64 %E40, %E6
+ %Sl58 = select i1 true, i64 303290, i64 %E40
+ %Cmp59 = icmp slt i64 %E40, %E6
+ br i1 %Cmp59, label %CF75, label %CF78
+
+CF78: ; preds = %CF75
+ %L60 = load i8* %0
+ store i8 -69, i8* %0
+ %E61 = extractelement <2 x i32> zeroinitializer, i32 0
+ %Shuff62 = shufflevector <2 x i32> %Shuff7, <2 x i32> %I21, <2 x i32> <i32 1, i32 3>
+ %I63 = insertelement <1 x i1> %Sl16, i1 %Cmp45, i32 0
+ %B64 = and i8 %Sl44, -69
+ %ZE65 = zext <1 x i1> %Shuff48 to <1 x i64>
+ %Sl66 = select i1 true, <1 x i64> %I, <1 x i64> %I49
+ %Cmp67 = icmp ugt i64 286689, %E40
+ br label %CF
+
+CF: ; preds = %CF, %CF78
+ %L68 = load i8* %0
+ store i64 %B57, i64* %2
+ %E69 = extractelement <2 x i1> %Shuff41, i32 1
+ br i1 %E69, label %CF, label %CF77
+
+CF77: ; preds = %CF77, %CF
+ %Shuff70 = shufflevector <1 x i64> %Shuff34, <1 x i64> <i64 -1>, <1 x i32> zeroinitializer
+ %I71 = insertelement <2 x i32> %Shuff, i32 %E26, i32 0
+ %Se = sext i8 %L60 to i32
+ %Sl72 = select i1 %Cmp45, <2 x i32> %Shuff62, <2 x i32> %I71
+ %Cmp73 = fcmp ugt double 0x235104F0E94F406E, 0x235104F0E94F406E
+ br i1 %Cmp73, label %CF77, label %CF79
+
+CF79: ; preds = %CF77
+ store i8 %L18, i8* %0
+ store i8 %E54, i8* %0
+ store i8 %L39, i8* %0
+ store i8 %L39, i8* %0
+ store i8 %B, i8* %0
+ ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s2090927243-simplified.ll b/test/CodeGen/Mips/msa/llvm-stress-s2090927243-simplified.ll
new file mode 100644
index 0000000..3811314
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-s2090927243-simplified.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test originally failed for MSA with a "Cannot select ..." error.
+; This was because undef's are ignored when checking if a vector constant is a
+; splat, but are legalized to zero if left in the DAG, which changes the
+; constant into a non-splat.
+;
+; It should at least successfully build.
+
+define void @autogen_SD2090927243() {
+BB:
+ br label %CF77
+
+CF77: ; preds = %CF77, %CF80
+ %Shuff27 = shufflevector <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>,
+ <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>,
+ <16 x i32> <i32 27, i32 29, i32 31, i32 1, i32 3, i32 5, i32 undef, i32 9, i32 11, i32 13, i32 undef, i32 17, i32 19, i32 21, i32 23, i32 undef>
+ %ZE30 = zext <16 x i8> %Shuff27 to <16 x i32>
+ %Cmp32 = fcmp ueq float undef, 0x3CDA6E5E40000000
+ br i1 %Cmp32, label %CF77, label %CF
+
+CF: ; preds = %CF, %CF81
+ %E48 = extractelement <16 x i32> %ZE30, i32 14
+ br i1 undef, label %CF, label %CF78
+
+CF78: ; preds = %CF
+ ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s2501752154-simplified.ll b/test/CodeGen/Mips/msa/llvm-stress-s2501752154-simplified.ll
new file mode 100644
index 0000000..564ad74
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-s2501752154-simplified.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test originally failed for MSA with a "Cannot select ..." error.
+; This happened because the legalizer treated undef's in the <4 x float>
+; constant as equivalent to the defined elements when checking if it is a constant
+; splat, but then proceeded to legalize the undef's to zero, leaving it as a
+; non-splat that cannot be selected. It should have eliminated the undef's by
+; rewriting the splat constant.
+
+; It should at least successfully build.
+
+define void @autogen_SD2501752154() {
+BB:
+ %BC = bitcast <4 x i32> <i32 -1, i32 -1, i32 undef, i32 undef> to <4 x float>
+ br label %CF74
+
+CF74: ; preds = %CF74, %CF
+ %E54 = extractelement <1 x i1> undef, i32 0
+ br i1 %E54, label %CF74, label %CF79
+
+CF79: ; preds = %CF75
+ %I63 = insertelement <4 x float> %BC, float undef, i32 0
+ ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s2704903805.ll b/test/CodeGen/Mips/msa/llvm-stress-s2704903805.ll
new file mode 100644
index 0000000..e14f405
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-s2704903805.ll
@@ -0,0 +1,141 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test originally failed for MSA after dereferencing a null 'this' pointer.
+; It should at least successfully build.
+
+define void @autogen_SD2704903805(i8*, i32*, i64*, i32, i64, i8) {
+BB:
+ %A4 = alloca i32
+ %A3 = alloca i32
+ %A2 = alloca i8
+ %A1 = alloca i32
+ %A = alloca i8
+ %L = load i8* %0
+ store i8 %5, i8* %0
+ %E = extractelement <2 x i16> zeroinitializer, i32 0
+ %Shuff = shufflevector <1 x i8> <i8 -1>, <1 x i8> <i8 -1>, <1 x i32> undef
+ %I = insertelement <1 x i8> <i8 -1>, i8 85, i32 0
+ %B = lshr <2 x i16> zeroinitializer, zeroinitializer
+ %FC = sitofp <4 x i16> zeroinitializer to <4 x float>
+ %Sl = select i1 true, float 0.000000e+00, float 0x401E76A240000000
+ %Cmp = icmp ule i16 -25210, %E
+ br label %CF83
+
+CF83: ; preds = %BB
+ %L5 = load i8* %0
+ store i8 85, i8* %0
+ %E6 = extractelement <1 x i8> <i8 -1>, i32 0
+ %Shuff7 = shufflevector <2 x i16> zeroinitializer, <2 x i16> zeroinitializer, <2 x i32> <i32 1, i32 3>
+ %I8 = insertelement <4 x i16> zeroinitializer, i16 %E, i32 3
+ %B9 = ashr <2 x i16> %Shuff7, zeroinitializer
+ %FC10 = sitofp i32 -1 to float
+ %Sl11 = select i1 %Cmp, i32 -1, i32 -1
+ %Cmp12 = icmp sgt i32 -1, -1
+ br label %CF
+
+CF: ; preds = %CF, %CF81, %CF83
+ %L13 = load i8* %0
+ store i8 0, i8* %0
+ %E14 = extractelement <2 x i64> zeroinitializer, i32 0
+ %Shuff15 = shufflevector <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i32> <i32 3, i32 5, i32 7, i32 undef>
+ %I16 = insertelement <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i64 81222, i32 1
+ %B17 = lshr <2 x i16> zeroinitializer, %B
+ %Tr = trunc i32 272597 to i1
+ br i1 %Tr, label %CF, label %CF80
+
+CF80: ; preds = %CF80, %CF
+ %Sl18 = select i1 %Cmp, <2 x i64> zeroinitializer, <2 x i64> zeroinitializer
+ %Cmp19 = icmp ne i1 %Cmp12, %Cmp
+ br i1 %Cmp19, label %CF80, label %CF81
+
+CF81: ; preds = %CF80
+ %L20 = load i8* %0
+ store i8 85, i8* %0
+ %E21 = extractelement <1 x i8> <i8 -1>, i32 0
+ %Shuff22 = shufflevector <1 x i8> <i8 -1>, <1 x i8> %Shuff, <1 x i32> zeroinitializer
+ %I23 = insertelement <1 x i8> <i8 -1>, i8 %L5, i32 0
+ %FC24 = fptoui <4 x float> %FC to <4 x i16>
+ %Sl25 = select i1 %Cmp, <2 x i32> zeroinitializer, <2 x i32> <i32 -1, i32 -1>
+ %Cmp26 = icmp ult <4 x i64> %I16, %Shuff15
+ %L27 = load i8* %0
+ store i8 %L, i8* %0
+ %E28 = extractelement <1 x i8> <i8 -1>, i32 0
+ %Shuff29 = shufflevector <8 x i16> zeroinitializer, <8 x i16> zeroinitializer, <8 x i32> <i32 11, i32 undef, i32 15, i32 1, i32 3, i32 5, i32 undef, i32 9>
+ %I30 = insertelement <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i64 %E14, i32 1
+ %B31 = mul i8 %E28, 85
+ %PC = bitcast i32* %A3 to i32*
+ %Sl32 = select i1 %Cmp12, float %FC10, float 0x4712BFE680000000
+ %L33 = load i32* %PC
+ store i32 %L33, i32* %PC
+ %E34 = extractelement <2 x i16> zeroinitializer, i32 1
+ %Shuff35 = shufflevector <1 x i8> %Shuff, <1 x i8> <i8 -1>, <1 x i32> zeroinitializer
+ %I36 = insertelement <1 x i8> <i8 -1>, i8 %L13, i32 0
+ %B37 = xor i8 %L27, %L
+ %Sl38 = select i1 %Cmp, i16 %E34, i16 %E
+ %Cmp39 = icmp eq i1 %Cmp19, %Cmp
+ br i1 %Cmp39, label %CF, label %CF77
+
+CF77: ; preds = %CF77, %CF81
+ %L40 = load i32* %PC
+ store i32 %3, i32* %PC
+ %E41 = extractelement <2 x i32> zeroinitializer, i32 0
+ %Shuff42 = shufflevector <2 x i32> <i32 -1, i32 -1>, <2 x i32> zeroinitializer, <2 x i32> <i32 1, i32 3>
+ %I43 = insertelement <1 x i8> <i8 -1>, i8 0, i32 0
+ %B44 = or i16 %E, -25210
+ %Se = sext i32 %3 to i64
+ %Sl45 = select i1 true, <1 x i8> %Shuff, <1 x i8> %I43
+ %Cmp46 = icmp sge <1 x i8> %I36, %Shuff
+ %L47 = load i32* %PC
+ store i32 %L33, i32* %PC
+ %E48 = extractelement <2 x i16> zeroinitializer, i32 0
+ %Shuff49 = shufflevector <1 x i8> <i8 -1>, <1 x i8> <i8 -1>, <1 x i32> <i32 1>
+ %I50 = insertelement <2 x i32> %Sl25, i32 47963, i32 1
+ %B51 = srem <1 x i8> %I, %Shuff22
+ %FC52 = sitofp i8 %5 to double
+ %Sl53 = select i1 %Cmp39, i8 %L27, i8 85
+ %Cmp54 = icmp slt i16 %E34, %E34
+ br i1 %Cmp54, label %CF77, label %CF78
+
+CF78: ; preds = %CF78, %CF77
+ %L55 = load i32* %PC
+ store i32 %L33, i32* %PC
+ %E56 = extractelement <8 x i16> %Shuff29, i32 4
+ %Shuff57 = shufflevector <1 x i8> <i8 -1>, <1 x i8> <i8 -1>, <1 x i32> <i32 1>
+ %I58 = insertelement <1 x i8> %B51, i8 %Sl53, i32 0
+ %ZE = fpext float %FC10 to double
+ %Sl59 = select i1 %Cmp12, <2 x i16> %B9, <2 x i16> zeroinitializer
+ %Cmp60 = fcmp ult double 0.000000e+00, 0.000000e+00
+ br i1 %Cmp60, label %CF78, label %CF79
+
+CF79: ; preds = %CF79, %CF78
+ %L61 = load i32* %PC
+ store i32 %L33, i32* %A3
+ %E62 = extractelement <4 x i64> %Shuff15, i32 1
+ %Shuff63 = shufflevector <8 x i16> %Shuff29, <8 x i16> %Shuff29, <8 x i32> <i32 undef, i32 10, i32 12, i32 undef, i32 undef, i32 undef, i32 4, i32 6>
+ %I64 = insertelement <2 x i64> zeroinitializer, i64 %Se, i32 0
+ %B65 = shl i8 %5, 85
+ %ZE66 = zext <4 x i1> %Cmp26 to <4 x i32>
+ %Sl67 = select i1 %Tr, <1 x i8> %Shuff, <1 x i8> %I23
+ %Cmp68 = fcmp olt float 0x4712BFE680000000, 0x4712BFE680000000
+ br i1 %Cmp68, label %CF79, label %CF82
+
+CF82: ; preds = %CF79
+ %L69 = load i32* %PC
+ store i32 %L33, i32* %PC
+ %E70 = extractelement <8 x i16> zeroinitializer, i32 3
+ %Shuff71 = shufflevector <4 x i64> %Shuff15, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i32> <i32 6, i32 undef, i32 2, i32 4>
+ %I72 = insertelement <1 x i8> <i8 -1>, i8 %L, i32 0
+ %B73 = srem i64 %E62, %Se
+ %ZE74 = zext <4 x i1> %Cmp26 to <4 x i32>
+ %Sl75 = select i1 %Cmp, i32 463279, i32 %L61
+ %Cmp76 = icmp sgt <1 x i8> %Shuff49, %Shuff22
+ store i8 %B31, i8* %0
+ store i8 85, i8* %0
+ store i32 %L33, i32* %PC
+ store i8 %B65, i8* %0
+ store i8 %L5, i8* %0
+ ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s3861334421.ll b/test/CodeGen/Mips/msa/llvm-stress-s3861334421.ll
new file mode 100644
index 0000000..1a03e55
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-s3861334421.ll
@@ -0,0 +1,149 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test originally failed for MSA with a
+; "Don't know how to expand this condition!" unreachable.
+; It should at least successfully build.
+
+define void @autogen_SD3861334421(i8*, i32*, i64*, i32, i64, i8) {
+BB:
+ %A4 = alloca <2 x i32>
+ %A3 = alloca <2 x double>
+ %A2 = alloca i64
+ %A1 = alloca i64
+ %A = alloca double
+ %L = load i8* %0
+ store i8 -101, i8* %0
+ %E = extractelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 0
+ %Shuff = shufflevector <8 x i64> zeroinitializer, <8 x i64> zeroinitializer, <8 x i32> <i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 undef, i32 1>
+ %I = insertelement <8 x i64> zeroinitializer, i64 %4, i32 5
+ %B = and i64 116376, 57247
+ %FC = uitofp i8 7 to double
+ %Sl = select i1 false, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %L5 = load i8* %0
+ store i8 %L, i8* %0
+ %E6 = extractelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 3
+ %Shuff7 = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 6, i32 0>
+ %I8 = insertelement <8 x i8> %Sl, i8 7, i32 4
+ %B9 = or <8 x i64> zeroinitializer, zeroinitializer
+ %Sl10 = select i1 false, i64 116376, i64 380809
+ %Cmp = icmp sgt i32 394647, 17081
+ br label %CF
+
+CF: ; preds = %CF, %BB
+ %L11 = load i8* %0
+ store i8 -87, i8* %0
+ %E12 = extractelement <4 x i64> zeroinitializer, i32 0
+ %Shuff13 = shufflevector <8 x i64> zeroinitializer, <8 x i64> zeroinitializer, <8 x i32> <i32 7, i32 9, i32 11, i32 13, i32 undef, i32 1, i32 3, i32 5>
+ %I14 = insertelement <4 x i64> zeroinitializer, i64 380809, i32 1
+ %B15 = srem i64 %Sl10, 380809
+ %FC16 = sitofp i64 57247 to float
+ %Sl17 = select i1 false, double 0x87A9374869A78EC6, double 0.000000e+00
+ %Cmp18 = icmp uge i8 %L, %5
+ br i1 %Cmp18, label %CF, label %CF80
+
+CF80: ; preds = %CF80, %CF88, %CF
+ %L19 = load i8* %0
+ store i8 -101, i8* %0
+ %E20 = extractelement <4 x i64> zeroinitializer, i32 0
+ %Shuff21 = shufflevector <4 x i64> zeroinitializer, <4 x i64> %Shuff7, <4 x i32> <i32 7, i32 1, i32 3, i32 5>
+ %I22 = insertelement <4 x i64> zeroinitializer, i64 127438, i32 1
+ %B23 = fdiv double %Sl17, 0.000000e+00
+ %Sl24 = select i1 %Cmp18, i32 420510, i32 492085
+ %Cmp25 = icmp ugt i1 %Cmp18, false
+ br i1 %Cmp25, label %CF80, label %CF83
+
+CF83: ; preds = %CF83, %CF80
+ %L26 = load i8* %0
+ store i8 -87, i8* %0
+ %E27 = extractelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 0
+ %Shuff28 = shufflevector <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 7, i32 1, i32 3, i32 5>
+ %I29 = insertelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 492085, i32 1
+ %B30 = lshr <8 x i8> %I8, %I8
+ %FC31 = sitofp <4 x i32> %Shuff28 to <4 x double>
+ %Sl32 = select i1 false, <8 x i8> %I8, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %Cmp33 = icmp eq i64 %B, 116376
+ br i1 %Cmp33, label %CF83, label %CF88
+
+CF88: ; preds = %CF83
+ %L34 = load i8* %0
+ store i8 -87, i8* %0
+ %E35 = extractelement <8 x i64> %Shuff, i32 7
+ %Shuff36 = shufflevector <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %Shuff28, <4 x i32> <i32 2, i32 undef, i32 undef, i32 0>
+ %I37 = insertelement <4 x i64> zeroinitializer, i64 380809, i32 0
+ %B38 = xor <8 x i64> %B9, %B9
+ %ZE = zext i32 0 to i64
+ %Sl39 = select i1 %Cmp33, i8 %L11, i8 %L5
+ %Cmp40 = icmp sgt i1 %Cmp, false
+ br i1 %Cmp40, label %CF80, label %CF81
+
+CF81: ; preds = %CF81, %CF85, %CF87, %CF88
+ %L41 = load i8* %0
+ store i8 %L34, i8* %0
+ %E42 = extractelement <8 x i64> %Shuff13, i32 6
+ %Shuff43 = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 7>
+ %I44 = insertelement <4 x i64> zeroinitializer, i64 116376, i32 3
+ %B45 = fsub float %FC16, 0x3AC86DCC40000000
+ %Tr = trunc <4 x i64> %I14 to <4 x i32>
+ %Sl46 = select i1 false, <8 x i64> %B38, <8 x i64> zeroinitializer
+ %Cmp47 = icmp sgt i1 %Cmp18, %Cmp18
+ br i1 %Cmp47, label %CF81, label %CF85
+
+CF85: ; preds = %CF81
+ %L48 = load i8* %0
+ store i8 -101, i8* %0
+ %E49 = extractelement <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, i32 2
+ %Shuff50 = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> <i32 5, i32 7, i32 1, i32 3>
+ %I51 = insertelement <4 x i64> zeroinitializer, i64 %E20, i32 3
+ %B52 = or i32 336955, %Sl24
+ %FC53 = uitofp i8 %L48 to double
+ %Sl54 = select i1 %Cmp47, i32 %3, i32 %Sl24
+ %Cmp55 = icmp ne <8 x i64> %Shuff13, zeroinitializer
+ %L56 = load i8* %0
+ store i8 %L11, i8* %0
+ %E57 = extractelement <4 x i64> %Shuff21, i32 1
+ %Shuff58 = shufflevector <8 x i64> %Shuff, <8 x i64> zeroinitializer, <8 x i32> <i32 4, i32 6, i32 undef, i32 10, i32 12, i32 undef, i32 0, i32 2>
+ %I59 = insertelement <4 x i64> zeroinitializer, i64 %E42, i32 2
+ %B60 = udiv <8 x i8> %Sl, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %Tr61 = trunc i8 49 to i1
+ br i1 %Tr61, label %CF81, label %CF84
+
+CF84: ; preds = %CF84, %CF85
+ %Sl62 = select i1 false, i8 %L, i8 %L48
+ %Cmp63 = icmp ne <8 x i64> %I, zeroinitializer
+ %L64 = load i8* %0
+ store i8 %5, i8* %0
+ %E65 = extractelement <8 x i1> %Cmp55, i32 0
+ br i1 %E65, label %CF84, label %CF87
+
+CF87: ; preds = %CF84
+ %Shuff66 = shufflevector <4 x i64> %Shuff21, <4 x i64> %I14, <4 x i32> <i32 3, i32 undef, i32 7, i32 1>
+ %I67 = insertelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 %Sl54, i32 1
+ %B68 = frem double %B23, %Sl17
+ %ZE69 = zext <8 x i8> %Sl32 to <8 x i64>
+ %Sl70 = select i1 %Tr61, i64 %E20, i64 %E12
+ %Cmp71 = icmp slt <8 x i64> %I, %Shuff
+ %L72 = load i8* %0
+ store i8 %L72, i8* %0
+ %E73 = extractelement <8 x i1> %Cmp55, i32 6
+ br i1 %E73, label %CF81, label %CF82
+
+CF82: ; preds = %CF82, %CF87
+ %Shuff74 = shufflevector <4 x i32> %I67, <4 x i32> %I29, <4 x i32> <i32 1, i32 3, i32 undef, i32 7>
+ %I75 = insertelement <4 x i64> zeroinitializer, i64 380809, i32 3
+ %B76 = fsub double 0.000000e+00, %FC53
+ %Tr77 = trunc i32 %E to i8
+ %Sl78 = select i1 %Cmp18, i64* %A2, i64* %2
+ %Cmp79 = icmp eq i32 394647, 492085
+ br i1 %Cmp79, label %CF82, label %CF86
+
+CF86: ; preds = %CF82
+ store i64 %Sl70, i64* %Sl78
+ store i64 %E57, i64* %Sl78
+ store i64 %Sl70, i64* %Sl78
+ store i64 %B, i64* %Sl78
+ store i64 %Sl10, i64* %Sl78
+ ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s3926023935.ll b/test/CodeGen/Mips/msa/llvm-stress-s3926023935.ll
new file mode 100644
index 0000000..96547d9
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-s3926023935.ll
@@ -0,0 +1,143 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test originally failed for MSA with a
+; "Type for zero vector elements is not legal" assertion.
+; It should at least successfully build.
+
+define void @autogen_SD3926023935(i8*, i32*, i64*, i32, i64, i8) {
+BB:
+ %A4 = alloca i1
+ %A3 = alloca float
+ %A2 = alloca double
+ %A1 = alloca float
+ %A = alloca double
+ %L = load i8* %0
+ store i8 -123, i8* %0
+ %E = extractelement <4 x i64> zeroinitializer, i32 1
+ %Shuff = shufflevector <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %I = insertelement <2 x i1> zeroinitializer, i1 false, i32 0
+ %BC = bitcast i64 181325 to double
+ %Sl = select i1 false, <2 x i32> zeroinitializer, <2 x i32> zeroinitializer
+ %Cmp = icmp ne <4 x i64> zeroinitializer, zeroinitializer
+ %L5 = load i8* %0
+ store i8 %L, i8* %0
+ %E6 = extractelement <4 x i64> zeroinitializer, i32 3
+ %Shuff7 = shufflevector <2 x i16> zeroinitializer, <2 x i16> zeroinitializer, <2 x i32> <i32 2, i32 0>
+ %I8 = insertelement <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, i64 498254, i32 4
+ %B = shl i32 0, 364464
+ %Sl9 = select i1 false, i64 %E, i64 498254
+ %Cmp10 = icmp sge i8 -123, %5
+ br label %CF80
+
+CF80: ; preds = %BB
+ %L11 = load i8* %0
+ store i8 -123, i8* %0
+ %E12 = extractelement <2 x i16> zeroinitializer, i32 1
+ %Shuff13 = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %I14 = insertelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 %B, i32 2
+ %B15 = sdiv i64 334618, -1
+ %PC = bitcast i1* %A4 to i64*
+ %Sl16 = select i1 %Cmp10, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+ %Cmp17 = icmp ule <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %Sl16
+ %L18 = load double* %A2
+ store i64 498254, i64* %PC
+ %E19 = extractelement <4 x i64> zeroinitializer, i32 0
+ %Shuff20 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %I, <2 x i32> <i32 3, i32 1>
+ %I21 = insertelement <2 x i1> zeroinitializer, i1 false, i32 1
+ %B22 = fadd double 0.000000e+00, %BC
+ %ZE = zext <2 x i1> %Shuff20 to <2 x i32>
+ %Sl23 = select i1 %Cmp10, <2 x i1> %Shuff20, <2 x i1> zeroinitializer
+ %Cmp24 = icmp ult <2 x i32> zeroinitializer, zeroinitializer
+ %L25 = load i8* %0
+ store i8 %L25, i8* %0
+ %E26 = extractelement <4 x i8> <i8 -1, i8 -1, i8 -1, i8 -1>, i32 3
+ %Shuff27 = shufflevector <4 x i32> %Shuff, <4 x i32> %I14, <4 x i32> <i32 6, i32 0, i32 undef, i32 4>
+ %I28 = insertelement <4 x i32> zeroinitializer, i32 %3, i32 0
+ %B29 = lshr i8 %E26, -43
+ %Tr = trunc i8 %L5 to i1
+ br label %CF79
+
+CF79: ; preds = %CF80
+ %Sl30 = select i1 false, i8 %B29, i8 -123
+ %Cmp31 = icmp sge <2 x i1> %I, %I
+ %L32 = load i64* %PC
+ store i8 -123, i8* %0
+ %E33 = extractelement <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, i32 2
+ %Shuff34 = shufflevector <4 x i64> zeroinitializer, <4 x i64> %Shuff13, <4 x i32> <i32 5, i32 7, i32 1, i32 3>
+ %I35 = insertelement <4 x i64> zeroinitializer, i64 498254, i32 3
+ %B36 = sub <8 x i64> %I8, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+ %PC37 = bitcast i8* %0 to i1*
+ %Sl38 = select i1 %Cmp10, i8 -43, i8 %L5
+ %Cmp39 = icmp eq i64 498254, %B15
+ br label %CF
+
+CF: ; preds = %CF, %CF79
+ %L40 = load double* %A
+ store i1 %Cmp39, i1* %PC37
+ %E41 = extractelement <4 x i64> zeroinitializer, i32 3
+ %Shuff42 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %ZE, <2 x i32> <i32 2, i32 undef>
+ %I43 = insertelement <4 x i32> %Shuff, i32 %3, i32 0
+ %B44 = shl i64 %E41, -1
+ %Se = sext <2 x i1> %I to <2 x i32>
+ %Sl45 = select i1 %Cmp10, i1 false, i1 false
+ br i1 %Sl45, label %CF, label %CF77
+
+CF77: ; preds = %CF77, %CF
+ %Cmp46 = fcmp uno double 0.000000e+00, 0.000000e+00
+ br i1 %Cmp46, label %CF77, label %CF78
+
+CF78: ; preds = %CF78, %CF83, %CF82, %CF77
+ %L47 = load i64* %PC
+ store i8 -123, i8* %0
+ %E48 = extractelement <4 x i64> zeroinitializer, i32 3
+ %Shuff49 = shufflevector <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 6, i32 undef>
+ %I50 = insertelement <2 x i1> zeroinitializer, i1 %Cmp10, i32 0
+ %B51 = sdiv i64 %E19, 463132
+ %Tr52 = trunc i64 %E48 to i32
+ %Sl53 = select i1 %Tr, i1 %Cmp46, i1 %Cmp10
+ br i1 %Sl53, label %CF78, label %CF83
+
+CF83: ; preds = %CF78
+ %Cmp54 = fcmp uge double %L40, %L40
+ br i1 %Cmp54, label %CF78, label %CF82
+
+CF82: ; preds = %CF83
+ %L55 = load i64* %PC
+ store i64 %L32, i64* %PC
+ %E56 = extractelement <2 x i16> %Shuff7, i32 1
+ %Shuff57 = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 6, i32 0>
+ %I58 = insertelement <2 x i32> %Sl, i32 %Tr52, i32 0
+ %B59 = or i32 %B, %3
+ %FC = sitofp i64 498254 to double
+ %Sl60 = select i1 false, i64 %E6, i64 -1
+ %Cmp61 = icmp sgt <4 x i32> %Shuff27, %I43
+ %L62 = load i64* %PC
+ store i64 %Sl9, i64* %PC
+ %E63 = extractelement <2 x i32> %ZE, i32 0
+ %Shuff64 = shufflevector <4 x i64> zeroinitializer, <4 x i64> %Shuff13, <4 x i32> <i32 1, i32 3, i32 undef, i32 7>
+ %I65 = insertelement <4 x i32> %Shuff, i32 %3, i32 3
+ %B66 = sub i64 %L47, 53612
+ %Tr67 = trunc i64 %4 to i32
+ %Sl68 = select i1 %Cmp39, i1 %Cmp39, i1 false
+ br i1 %Sl68, label %CF78, label %CF81
+
+CF81: ; preds = %CF82
+ %Cmp69 = icmp ne <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, %B36
+ %L70 = load i8* %0
+ store i64 %L55, i64* %PC
+ %E71 = extractelement <4 x i32> %Shuff49, i32 1
+ %Shuff72 = shufflevector <4 x i64> zeroinitializer, <4 x i64> %Shuff34, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %I73 = insertelement <4 x i64> %Shuff64, i64 %E, i32 2
+ %B74 = lshr <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, %B36
+ %Sl75 = select i1 %Sl68, i64 %B51, i64 %L55
+ %Cmp76 = icmp sgt <8 x i64> %B74, %B36
+ store i1 %Cmp39, i1* %PC37
+ store i64 %E41, i64* %PC
+ store i64 %L32, i64* %PC
+ store i64 %Sl75, i64* %2
+ store i64 %L32, i64* %PC
+ ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s3997499501.ll b/test/CodeGen/Mips/msa/llvm-stress-s3997499501.ll
new file mode 100644
index 0000000..bef75f3
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-s3997499501.ll
@@ -0,0 +1,152 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test originally failed to select instructions for extract_vector_elt for
+; v4f32 on MSA.
+; It should at least successfully build.
+
+define void @autogen_SD3997499501(i8*, i32*, i64*, i32, i64, i8) {
+BB:
+ %A4 = alloca <1 x double>
+ %A3 = alloca double
+ %A2 = alloca float
+ %A1 = alloca double
+ %A = alloca double
+ %L = load i8* %0
+ store i8 97, i8* %0
+ %E = extractelement <16 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, i32 14
+ %Shuff = shufflevector <2 x i1> zeroinitializer, <2 x i1> zeroinitializer, <2 x i32> <i32 1, i32 3>
+ %I = insertelement <4 x i64> zeroinitializer, i64 0, i32 3
+ %Tr = trunc <1 x i64> zeroinitializer to <1 x i8>
+ %Sl = select i1 false, double* %A1, double* %A
+ %Cmp = icmp ne <2 x i64> zeroinitializer, zeroinitializer
+ %L5 = load double* %Sl
+ store float -4.374162e+06, float* %A2
+ %E6 = extractelement <4 x i64> zeroinitializer, i32 3
+ %Shuff7 = shufflevector <4 x i64> zeroinitializer, <4 x i64> %I, <4 x i32> <i32 2, i32 4, i32 6, i32 undef>
+ %I8 = insertelement <2 x i1> %Shuff, i1 false, i32 0
+ %B = ashr <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %PC = bitcast float* %A2 to float*
+ %Sl9 = select i1 false, i32 82299, i32 0
+ %Cmp10 = icmp slt i8 97, %5
+ br label %CF72
+
+CF72: ; preds = %CF72, %CF80, %CF78, %BB
+ %L11 = load double* %Sl
+ store double 0.000000e+00, double* %Sl
+ %E12 = extractelement <2 x i1> zeroinitializer, i32 0
+ br i1 %E12, label %CF72, label %CF80
+
+CF80: ; preds = %CF72
+ %Shuff13 = shufflevector <2 x i1> zeroinitializer, <2 x i1> zeroinitializer, <2 x i32> <i32 3, i32 1>
+ %I14 = insertelement <2 x i64> zeroinitializer, i64 %4, i32 1
+ %B15 = fadd double %L5, 0.000000e+00
+ %BC = bitcast i32 0 to float
+ %Sl16 = select i1 %E12, float 0xC7957ED940000000, float %BC
+ %Cmp17 = icmp eq i32 136082, 471909
+ br i1 %Cmp17, label %CF72, label %CF77
+
+CF77: ; preds = %CF77, %CF80
+ %L18 = load double* %Sl
+ store double 0.000000e+00, double* %Sl
+ %E19 = extractelement <2 x i1> zeroinitializer, i32 0
+ br i1 %E19, label %CF77, label %CF78
+
+CF78: ; preds = %CF77
+ %Shuff20 = shufflevector <2 x i1> zeroinitializer, <2 x i1> zeroinitializer, <2 x i32> <i32 1, i32 3>
+ %I21 = insertelement <8 x i1> zeroinitializer, i1 %Cmp10, i32 7
+ %B22 = sdiv <4 x i64> %Shuff7, zeroinitializer
+ %FC = uitofp i8 97 to double
+ %Sl23 = select i1 %Cmp10, <2 x i1> zeroinitializer, <2 x i1> zeroinitializer
+ %L24 = load double* %Sl
+ store float %Sl16, float* %PC
+ %E25 = extractelement <2 x i1> %Shuff, i32 1
+ br i1 %E25, label %CF72, label %CF76
+
+CF76: ; preds = %CF78
+ %Shuff26 = shufflevector <4 x i64> zeroinitializer, <4 x i64> %B22, <4 x i32> <i32 undef, i32 undef, i32 0, i32 undef>
+ %I27 = insertelement <4 x i64> zeroinitializer, i64 %E, i32 2
+ %B28 = mul <4 x i64> %I27, zeroinitializer
+ %ZE = zext <8 x i1> zeroinitializer to <8 x i64>
+ %Sl29 = select i1 %Cmp17, float -4.374162e+06, float -4.374162e+06
+ %L30 = load i8* %0
+ store double %L5, double* %Sl
+ %E31 = extractelement <8 x i1> zeroinitializer, i32 5
+ br label %CF
+
+CF: ; preds = %CF, %CF81, %CF76
+ %Shuff32 = shufflevector <16 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <16 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <16 x i32> <i32 8, i32 undef, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 undef, i32 26, i32 28, i32 30, i32 undef, i32 2, i32 4, i32 6>
+ %I33 = insertelement <8 x i1> zeroinitializer, i1 false, i32 2
+ %BC34 = bitcast <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1> to <4 x float>
+ %Sl35 = select i1 %E12, <2 x i1> %I8, <2 x i1> zeroinitializer
+ %Cmp36 = fcmp oge double 0xC2C3BAE2D5C18360, 0xC2C3BAE2D5C18360
+ br i1 %Cmp36, label %CF, label %CF74
+
+CF74: ; preds = %CF74, %CF
+ %L37 = load float* %PC
+ store double 0.000000e+00, double* %Sl
+ %E38 = extractelement <2 x i1> %Sl23, i32 1
+ br i1 %E38, label %CF74, label %CF75
+
+CF75: ; preds = %CF75, %CF82, %CF74
+ %Shuff39 = shufflevector <2 x i1> %Shuff13, <2 x i1> zeroinitializer, <2 x i32> <i32 undef, i32 2>
+ %I40 = insertelement <4 x i64> zeroinitializer, i64 %4, i32 2
+ %Sl41 = select i1 %Cmp10, i32 0, i32 %3
+ %Cmp42 = icmp ne <1 x i64> zeroinitializer, zeroinitializer
+ %L43 = load double* %Sl
+ store i64 %4, i64* %2
+ %E44 = extractelement <2 x i1> %Shuff20, i32 1
+ br i1 %E44, label %CF75, label %CF82
+
+CF82: ; preds = %CF75
+ %Shuff45 = shufflevector <2 x i1> %Sl23, <2 x i1> %Sl23, <2 x i32> <i32 2, i32 0>
+ %I46 = insertelement <4 x i64> zeroinitializer, i64 0, i32 0
+ %B47 = sub i64 %E, %E6
+ %Sl48 = select i1 %Cmp10, double %L5, double %L43
+ %Cmp49 = icmp uge i64 %4, %B47
+ br i1 %Cmp49, label %CF75, label %CF81
+
+CF81: ; preds = %CF82
+ %L50 = load i8* %0
+ store double %L43, double* %Sl
+ %E51 = extractelement <4 x i64> %Shuff7, i32 3
+ %Shuff52 = shufflevector <4 x float> %BC34, <4 x float> %BC34, <4 x i32> <i32 2, i32 4, i32 6, i32 0>
+ %I53 = insertelement <2 x i1> %Cmp, i1 %E25, i32 0
+ %B54 = fdiv double %L24, %L43
+ %BC55 = bitcast <4 x i64> zeroinitializer to <4 x double>
+ %Sl56 = select i1 false, i8 %5, i8 97
+ %L57 = load i8* %0
+ store i8 %L50, i8* %0
+ %E58 = extractelement <2 x i1> %Shuff20, i32 1
+ br i1 %E58, label %CF, label %CF73
+
+CF73: ; preds = %CF73, %CF81
+ %Shuff59 = shufflevector <2 x i1> %Shuff13, <2 x i1> %Shuff45, <2 x i32> <i32 undef, i32 0>
+ %I60 = insertelement <4 x float> %Shuff52, float -4.374162e+06, i32 0
+ %B61 = mul <4 x i64> %I46, zeroinitializer
+ %PC62 = bitcast double* %A3 to float*
+ %Sl63 = select i1 %Cmp10, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer
+ %Cmp64 = icmp ne <2 x i1> %Cmp, %Shuff
+ %L65 = load double* %A1
+ store float -4.374162e+06, float* %PC62
+ %E66 = extractelement <8 x i1> %I21, i32 3
+ br i1 %E66, label %CF73, label %CF79
+
+CF79: ; preds = %CF79, %CF73
+ %Shuff67 = shufflevector <8 x i1> %I21, <8 x i1> %I21, <8 x i32> <i32 6, i32 8, i32 10, i32 12, i32 14, i32 0, i32 undef, i32 4>
+ %I68 = insertelement <1 x i1> %Cmp42, i1 %E25, i32 0
+ %B69 = sdiv <16 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+ %Sl70 = select i1 %Cmp49, <2 x i1> %Sl23, <2 x i1> %Shuff45
+ %Cmp71 = icmp ne i1 false, false
+ br i1 %Cmp71, label %CF79, label %CF83
+
+CF83: ; preds = %CF79
+ store double 0.000000e+00, double* %Sl
+ store float %BC, float* %PC62
+ store double %Sl48, double* %Sl
+ store double %FC, double* %Sl
+ store float %BC, float* %PC62
+ ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s449609655-simplified.ll b/test/CodeGen/Mips/msa/llvm-stress-s449609655-simplified.ll
new file mode 100644
index 0000000..24e27cb
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-s449609655-simplified.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test is based on an llvm-stress generated test case with seed=449609655
+
+; This test originally failed for MSA with a
+; "Comparison requires equal bit widths" assertion.
+; The legalizer legalized the <4 x i8>'s into <4 x i32>'s, then a call to
+; isVSplat() returned the splat value for <i8 -1, i8 -1, ...> as a 32-bit APInt
+; (255), but the zeroinitializer splat value as an 8-bit APInt (0). The
+; assertion occurred when trying to check that the values were bitwise inverses
+; of each other.
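+;
+; A rough sketch of the mismatch (illustrative values only, not taken verbatim
+; from the test below): after legalization the two compared splat values were
+; conceptually
+;   splat(<i8 -1, i8 -1, i8 -1, i8 -1>)  ->  32-bit APInt 255
+;   splat(zeroinitializer)               ->  8-bit APInt 0
+; and asking whether a 32-bit APInt is the bitwise inverse of an 8-bit APInt is
+; what triggered the assertion.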
+;
+; It should at least successfully build.
+
+define void @autogen_SD449609655(i8) {
+BB:
+ %Cmp = icmp ult i8 -3, %0
+ br label %CF78
+
+CF78: ; preds = %CF81, %CF78, %BB
+ %Sl31 = select i1 %Cmp, <4 x i8> <i8 -1, i8 -1, i8 -1, i8 -1>, <4 x i8> zeroinitializer
+ br i1 undef, label %CF78, label %CF81
+
+CF81: ; preds = %CF78
+ br i1 undef, label %CF78, label %CF80
+
+CF80: ; preds = %CF81
+ %I59 = insertelement <4 x i8> %Sl31, i8 undef, i32 1
+ ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s525530439.ll b/test/CodeGen/Mips/msa/llvm-stress-s525530439.ll
new file mode 100644
index 0000000..697871d
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-s525530439.ll
@@ -0,0 +1,139 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test originally failed for MSA with a
+; `Num < NumOperands && "Invalid child # of SDNode!"' assertion.
+; It should at least successfully build.
+
+define void @autogen_SD525530439(i8*, i32*, i64*, i32, i64, i8) {
+BB:
+ %A4 = alloca i32
+ %A3 = alloca double
+ %A2 = alloca <1 x double>
+ %A1 = alloca <8 x double>
+ %A = alloca i64
+ %L = load i8* %0
+ store i64 33695, i64* %A
+ %E = extractelement <4 x i32> zeroinitializer, i32 3
+ %Shuff = shufflevector <2 x i32> <i32 -1, i32 -1>, <2 x i32> <i32 -1, i32 -1>, <2 x i32> <i32 2, i32 0>
+ %I = insertelement <4 x i16> zeroinitializer, i16 -11642, i32 0
+ %B = lshr <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %ZE = fpext float 0x3B64A2B880000000 to double
+ %Sl = select i1 true, i16 -1, i16 -11642
+ %L5 = load i8* %0
+ store i8 0, i8* %0
+ %E6 = extractelement <4 x i32> zeroinitializer, i32 2
+ %Shuff7 = shufflevector <8 x i1> zeroinitializer, <8 x i1> zeroinitializer, <8 x i32> <i32 undef, i32 7, i32 9, i32 11, i32 13, i32 15, i32 1, i32 undef>
+ %I8 = insertelement <4 x i32> zeroinitializer, i32 %3, i32 3
+ %B9 = sub i32 71140, 439732
+ %BC = bitcast <2 x i32> <i32 -1, i32 -1> to <2 x float>
+ %Sl10 = select i1 true, i32* %1, i32* %1
+ %Cmp = icmp sge <8 x i64> zeroinitializer, zeroinitializer
+ %L11 = load i32* %Sl10
+ store <1 x double> zeroinitializer, <1 x double>* %A2
+ %E12 = extractelement <4 x i16> zeroinitializer, i32 0
+ %Shuff13 = shufflevector <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i32> undef
+ %I14 = insertelement <1 x i16> zeroinitializer, i16 %Sl, i32 0
+ %B15 = or i16 -1, %E12
+ %BC16 = bitcast <4 x i32> zeroinitializer to <4 x float>
+ %Sl17 = select i1 true, i64 %4, i64 %4
+ %Cmp18 = fcmp ugt float 0xC5ABB1BF80000000, 0x3EEF3D6300000000
+ br label %CF75
+
+CF75: ; preds = %CF75, %BB
+ %L19 = load i32* %Sl10
+ store i32 %L11, i32* %Sl10
+ %E20 = extractelement <4 x i32> zeroinitializer, i32 1
+ %Shuff21 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %I8, <4 x i32> <i32 undef, i32 2, i32 4, i32 6>
+ %I22 = insertelement <4 x float> %BC16, float 0x3EEF3D6300000000, i32 2
+ %B23 = shl i32 71140, 439732
+ %ZE24 = fpext <4 x float> %I22 to <4 x double>
+ %Sl25 = select i1 %Cmp18, i32 %L11, i32 %L11
+ %Cmp26 = icmp ne i32 %E20, %L19
+ br i1 %Cmp26, label %CF75, label %CF76
+
+CF76: ; preds = %CF75
+ %L27 = load i32* %Sl10
+ store i32 439732, i32* %Sl10
+ %E28 = extractelement <4 x i32> %Shuff21, i32 3
+ %Shuff29 = shufflevector <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> <i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 0>
+ %I30 = insertelement <8 x i1> %Shuff7, i1 %Cmp18, i32 4
+ %Sl31 = select i1 %Cmp18, i32 %3, i32 %B23
+ %Cmp32 = icmp ugt i32 0, %3
+ br label %CF74
+
+CF74: ; preds = %CF74, %CF80, %CF78, %CF76
+ %L33 = load i64* %2
+ store i32 71140, i32* %Sl10
+ %E34 = extractelement <4 x i32> zeroinitializer, i32 1
+ %Shuff35 = shufflevector <1 x i16> zeroinitializer, <1 x i16> zeroinitializer, <1 x i32> undef
+ %I36 = insertelement <4 x i16> zeroinitializer, i16 -11642, i32 0
+ %B37 = mul <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %Shuff29
+ %Sl38 = select i1 %Cmp18, double 0.000000e+00, double 0x2BA9DB480DA732C6
+ %Cmp39 = icmp sgt i16 -11642, %Sl
+ br i1 %Cmp39, label %CF74, label %CF80
+
+CF80: ; preds = %CF74
+ %L40 = load i8* %0
+ store i32 0, i32* %Sl10
+ %E41 = extractelement <8 x i64> zeroinitializer, i32 1
+ %Shuff42 = shufflevector <1 x i16> %I14, <1 x i16> %I14, <1 x i32> undef
+ %I43 = insertelement <4 x i16> %I36, i16 -11642, i32 0
+ %FC = fptoui float 0x455CA2B080000000 to i16
+ %Sl44 = select i1 %Cmp18, i1 %Cmp18, i1 %Cmp39
+ br i1 %Sl44, label %CF74, label %CF78
+
+CF78: ; preds = %CF80
+ %L45 = load i32* %Sl10
+ store i8 %L5, i8* %0
+ %E46 = extractelement <8 x i1> %Shuff7, i32 2
+ br i1 %E46, label %CF74, label %CF77
+
+CF77: ; preds = %CF77, %CF78
+ %Shuff47 = shufflevector <4 x i16> %I43, <4 x i16> zeroinitializer, <4 x i32> <i32 5, i32 undef, i32 1, i32 3>
+ %I48 = insertelement <1 x i16> %Shuff42, i16 %Sl, i32 0
+ %B49 = mul i8 0, %L40
+ %FC50 = uitofp i32 %3 to double
+ %Sl51 = select i1 %Sl44, i32 %L27, i32 0
+ %Cmp52 = icmp sge i8 %B49, 0
+ br i1 %Cmp52, label %CF77, label %CF79
+
+CF79: ; preds = %CF77
+ %L53 = load i32* %Sl10
+ store i8 %L40, i8* %0
+ %E54 = extractelement <4 x i32> zeroinitializer, i32 1
+ %Shuff55 = shufflevector <4 x i32> %Shuff21, <4 x i32> %I8, <4 x i32> <i32 4, i32 6, i32 undef, i32 2>
+ %I56 = insertelement <4 x i32> zeroinitializer, i32 %Sl51, i32 2
+ %Tr = trunc <1 x i64> %Shuff13 to <1 x i16>
+ %Sl57 = select i1 %Cmp18, <2 x i32> <i32 -1, i32 -1>, <2 x i32> <i32 -1, i32 -1>
+ %Cmp58 = icmp uge <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %I56
+ %L59 = load i8* %0
+ store <1 x double> zeroinitializer, <1 x double>* %A2
+ %E60 = extractelement <4 x i32> zeroinitializer, i32 0
+ %Shuff61 = shufflevector <4 x i32> %I8, <4 x i32> %I8, <4 x i32> <i32 undef, i32 1, i32 undef, i32 undef>
+ %I62 = insertelement <4 x i16> zeroinitializer, i16 %E12, i32 1
+ %B63 = and <4 x i32> %Shuff61, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %PC = bitcast double* %A3 to i32*
+ %Sl64 = select i1 %Cmp18, <4 x i32> %Shuff61, <4 x i32> %Shuff55
+ %Cmp65 = icmp sgt i32 439732, %3
+ br label %CF
+
+CF: ; preds = %CF79
+ %L66 = load i32* %Sl10
+ store i32 %E6, i32* %PC
+ %E67 = extractelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 2
+ %Shuff68 = shufflevector <4 x i32> %Sl64, <4 x i32> %I8, <4 x i32> <i32 5, i32 undef, i32 1, i32 undef>
+ %I69 = insertelement <4 x i16> %Shuff47, i16 %Sl, i32 3
+ %B70 = sdiv <4 x i64> zeroinitializer, zeroinitializer
+ %FC71 = sitofp i32 %L66 to double
+ %Sl72 = select i1 %Cmp18, i64 %4, i64 %4
+ %Cmp73 = icmp eq <4 x i64> zeroinitializer, %B70
+ store i32 %B23, i32* %PC
+ store i32 %3, i32* %PC
+ store i32 %3, i32* %Sl10
+ store i32 %L27, i32* %1
+ store i32 0, i32* %PC
+ ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-s997348632.ll b/test/CodeGen/Mips/msa/llvm-stress-s997348632.ll
new file mode 100644
index 0000000..dc4200a
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-s997348632.ll
@@ -0,0 +1,143 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test originally failed to select instructions for extract_vector_elt for
+; v2f64 on MSA.
+; It should at least successfully build.
+
+define void @autogen_SD997348632(i8*, i32*, i64*, i32, i64, i8) {
+BB:
+ %A4 = alloca <2 x i32>
+ %A3 = alloca <16 x i16>
+ %A2 = alloca <4 x i1>
+ %A1 = alloca <4 x i16>
+ %A = alloca <2 x i32>
+ %L = load i8* %0
+ store i8 %L, i8* %0
+ %E = extractelement <4 x i32> zeroinitializer, i32 0
+ %Shuff = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> <i32 undef, i32 1, i32 3, i32 5>
+ %I = insertelement <2 x i1> zeroinitializer, i1 false, i32 1
+ %FC = sitofp <4 x i32> zeroinitializer to <4 x double>
+ %Sl = select i1 false, <4 x i64> %Shuff, <4 x i64> %Shuff
+ %L5 = load i8* %0
+ store i8 %5, i8* %0
+ %E6 = extractelement <1 x i16> zeroinitializer, i32 0
+ %Shuff7 = shufflevector <2 x i1> %I, <2 x i1> %I, <2 x i32> <i32 1, i32 undef>
+ %I8 = insertelement <1 x i16> zeroinitializer, i16 0, i32 0
+ %B = xor i32 376034, %3
+ %FC9 = fptoui float 0x406DB70180000000 to i64
+ %Sl10 = select i1 false, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %Cmp = icmp ult <4 x i64> zeroinitializer, zeroinitializer
+ %L11 = load i8* %0
+ store i8 %L, i8* %0
+ %E12 = extractelement <4 x i64> zeroinitializer, i32 2
+ %Shuff13 = shufflevector <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> <i32 5, i32 7, i32 undef, i32 3>
+ %I14 = insertelement <8 x i32> zeroinitializer, i32 -1, i32 7
+ %B15 = fdiv <4 x double> %FC, %FC
+ %Tr = trunc i32 376034 to i16
+ %Sl16 = select i1 false, <8 x i32> %Sl10, <8 x i32> zeroinitializer
+ %Cmp17 = icmp uge i32 233658, %E
+ br label %CF
+
+CF: ; preds = %CF, %CF79, %CF84, %BB
+ %L18 = load i8* %0
+ store i8 %L, i8* %0
+ %E19 = extractelement <4 x i64> %Sl, i32 3
+ %Shuff20 = shufflevector <2 x i1> %Shuff7, <2 x i1> %I, <2 x i32> <i32 2, i32 0>
+ %I21 = insertelement <4 x i64> zeroinitializer, i64 %FC9, i32 0
+ %B22 = xor <8 x i32> %I14, %I14
+ %Tr23 = trunc i16 0 to i8
+ %Sl24 = select i1 false, <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <8 x i32> zeroinitializer
+ %Cmp25 = icmp eq i1 false, false
+ br i1 %Cmp25, label %CF, label %CF79
+
+CF79: ; preds = %CF
+ %L26 = load i8* %0
+ store i8 %L26, i8* %0
+ %E27 = extractelement <1 x i16> zeroinitializer, i32 0
+ %Shuff28 = shufflevector <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, <16 x i32> <i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
+ %I29 = insertelement <16 x i32> %Shuff28, i32 %B, i32 15
+ %B30 = fdiv float 0.000000e+00, -6.749110e+06
+ %Sl31 = select i1 false, i32 %3, i32 %3
+ %Cmp32 = fcmp uno float 0.000000e+00, 0x406DB70180000000
+ br i1 %Cmp32, label %CF, label %CF78
+
+CF78: ; preds = %CF78, %CF79
+ %L33 = load i8* %0
+ store i8 %L, i8* %0
+ %E34 = extractelement <16 x i32> %Shuff28, i32 1
+ %Shuff35 = shufflevector <4 x i64> zeroinitializer, <4 x i64> %I21, <4 x i32> <i32 undef, i32 6, i32 0, i32 2>
+ %I36 = insertelement <4 x double> %FC, double 0xA4A57F449CA36CC2, i32 2
+ %Se = sext <4 x i1> %Cmp to <4 x i32>
+ %Sl37 = select i1 %Cmp17, i32 0, i32 0
+ %Cmp38 = icmp ne i32 440284, 376034
+ br i1 %Cmp38, label %CF78, label %CF80
+
+CF80: ; preds = %CF80, %CF82, %CF78
+ %L39 = load i8* %0
+ store i8 %L, i8* %0
+ %E40 = extractelement <2 x i1> %Shuff20, i32 1
+ br i1 %E40, label %CF80, label %CF82
+
+CF82: ; preds = %CF80
+ %Shuff41 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %Shuff20, <2 x i32> <i32 2, i32 0>
+ %I42 = insertelement <2 x i1> %Shuff41, i1 false, i32 0
+ %B43 = sub i32 %E, 0
+ %Sl44 = select i1 %Cmp32, <16 x i32> %Shuff28, <16 x i32> %Shuff28
+ %Cmp45 = icmp sgt <4 x i64> zeroinitializer, %I21
+ %L46 = load i8* %0
+ store i8 %L11, i8* %0
+ %E47 = extractelement <8 x i32> %Sl16, i32 4
+ %Shuff48 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %Shuff7, <2 x i32> <i32 undef, i32 1>
+ %I49 = insertelement <2 x i1> %Shuff48, i1 %Cmp17, i32 1
+ %B50 = and <8 x i32> %I14, %Sl10
+ %FC51 = fptoui float -6.749110e+06 to i1
+ br i1 %FC51, label %CF80, label %CF81
+
+CF81: ; preds = %CF81, %CF82
+ %Sl52 = select i1 false, float -6.749110e+06, float 0x406DB70180000000
+ %Cmp53 = icmp uge <2 x i32> <i32 -1, i32 -1>, <i32 -1, i32 -1>
+ %L54 = load i8* %0
+ store i8 %L5, i8* %0
+ %E55 = extractelement <8 x i32> zeroinitializer, i32 7
+ %Shuff56 = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> <i32 undef, i32 4, i32 6, i32 0>
+ %I57 = insertelement <2 x i1> %Shuff7, i1 false, i32 0
+ %B58 = fmul <4 x double> %FC, %FC
+ %FC59 = fptoui <4 x double> %I36 to <4 x i16>
+ %Sl60 = select i1 %Cmp17, <2 x i1> %I, <2 x i1> %I57
+ %Cmp61 = icmp ule <8 x i32> %B50, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %L62 = load i8* %0
+ store i8 %L33, i8* %0
+ %E63 = extractelement <4 x i64> %Shuff, i32 2
+ %Shuff64 = shufflevector <4 x i64> %Shuff56, <4 x i64> %Shuff56, <4 x i32> <i32 5, i32 7, i32 1, i32 undef>
+ %I65 = insertelement <2 x i1> zeroinitializer, i1 false, i32 1
+ %B66 = sdiv i32 %B, %E55
+ %Tr67 = trunc i8 %L54 to i1
+ br i1 %Tr67, label %CF81, label %CF83
+
+CF83: ; preds = %CF83, %CF81
+ %Sl68 = select i1 %Cmp17, i1 %Cmp25, i1 %Tr67
+ br i1 %Sl68, label %CF83, label %CF84
+
+CF84: ; preds = %CF83
+ %Cmp69 = icmp uge i32 %E, %E34
+ br i1 %Cmp69, label %CF, label %CF77
+
+CF77: ; preds = %CF84
+ %L70 = load i8* %0
+ store i8 %L, i8* %0
+ %E71 = extractelement <4 x i64> %Shuff, i32 0
+ %Shuff72 = shufflevector <2 x i1> zeroinitializer, <2 x i1> %I, <2 x i32> <i32 3, i32 1>
+ %I73 = insertelement <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, i32 %B66, i32 1
+ %FC74 = uitofp i1 %Cmp32 to double
+ %Sl75 = select i1 %FC51, i16 9704, i16 0
+ %Cmp76 = icmp ugt <1 x i16> %I8, %I8
+ store i8 %L39, i8* %0
+ store i8 %5, i8* %0
+ store i8 %Tr23, i8* %0
+ store i8 %L, i8* %0
+ store i8 %5, i8* %0
+ ret void
+}
diff --git a/test/CodeGen/Mips/msa/llvm-stress-sz1-s742806235.ll b/test/CodeGen/Mips/msa/llvm-stress-sz1-s742806235.ll
new file mode 100644
index 0000000..8c4fcba
--- /dev/null
+++ b/test/CodeGen/Mips/msa/llvm-stress-sz1-s742806235.ll
@@ -0,0 +1,23 @@
+; RUN: llc -march=mips < %s
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s
+; RUN: llc -march=mipsel < %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s
+
+; This test originally failed to select code for a truncstore of a
+; build_vector.
+; It should at least successfully build.
+
+define void @autogen_SD742806235(i8*, i32*, i64*, i32, i64, i8) {
+BB:
+ %A4 = alloca double
+ %A3 = alloca double
+ %A2 = alloca <8 x i8>
+ %A1 = alloca <4 x float>
+ %A = alloca i1
+ store i8 %5, i8* %0
+ store i8 %5, i8* %0
+ store i8 %5, i8* %0
+ store <8 x i8> <i8 0, i8 -1, i8 0, i8 -1, i8 0, i8 -1, i8 0, i8 -1>, <8 x i8>* %A2
+ store i8 %5, i8* %0
+ ret void
+}
diff --git a/test/CodeGen/Mips/msa/shift-dagcombine.ll b/test/CodeGen/Mips/msa/shift-dagcombine.ll
new file mode 100644
index 0000000..0d809fb
--- /dev/null
+++ b/test/CodeGen/Mips/msa/shift-dagcombine.ll
@@ -0,0 +1,70 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+
+define void @ashr_v4i32(<4 x i32>* %c) nounwind {
+ ; CHECK-LABEL: ashr_v4i32:
+
+ %1 = ashr <4 x i32> <i32 1, i32 2, i32 4, i32 8>,
+ <i32 0, i32 1, i32 2, i32 3>
+ ; CHECK-NOT: sra
+ ; CHECK-DAG: ldi.w [[R1:\$w[0-9]+]], 1
+ ; CHECK-NOT: sra
+ store volatile <4 x i32> %1, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R1]], 0($4)
+
+ %2 = ashr <4 x i32> <i32 -2, i32 -4, i32 -8, i32 -16>,
+ <i32 0, i32 1, i32 2, i32 3>
+ ; CHECK-NOT: sra
+ ; CHECK-DAG: ldi.w [[R1:\$w[0-9]+]], -2
+ ; CHECK-NOT: sra
+ store volatile <4 x i32> %2, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R1]], 0($4)
+
+ ret void
+ ; CHECK-LABEL: .size ashr_v4i32
+}
+
+define void @lshr_v4i32(<4 x i32>* %c) nounwind {
+ ; CHECK-LABEL: lshr_v4i32:
+
+ %1 = lshr <4 x i32> <i32 1, i32 2, i32 4, i32 8>,
+ <i32 0, i32 1, i32 2, i32 3>
+ ; CHECK-NOT: srl
+ ; CHECK-DAG: ldi.w [[R1:\$w[0-9]+]], 1
+ ; CHECK-NOT: srl
+ store volatile <4 x i32> %1, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R1]], 0($4)
+
+ %2 = lshr <4 x i32> <i32 -2, i32 -4, i32 -8, i32 -16>,
+ <i32 0, i32 1, i32 2, i32 3>
+ ; CHECK-NOT: srl
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], %lo
+ ; CHECK-NOT: srl
+ store volatile <4 x i32> %2, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R1]], 0($4)
+
+ ret void
+ ; CHECK-LABEL: .size lshr_v4i32
+}
+
+define void @shl_v4i32(<4 x i32>* %c) nounwind {
+ ; CHECK-LABEL: shl_v4i32:
+
+ %1 = shl <4 x i32> <i32 8, i32 4, i32 2, i32 1>,
+ <i32 0, i32 1, i32 2, i32 3>
+ ; CHECK-NOT: sll
+ ; CHECK-DAG: ldi.w [[R1:\$w[0-9]+]], 8
+ ; CHECK-NOT: sll
+ store volatile <4 x i32> %1, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R1]], 0($4)
+
+ %2 = shl <4 x i32> <i32 -8, i32 -4, i32 -2, i32 -1>,
+ <i32 0, i32 1, i32 2, i32 3>
+ ; CHECK-NOT: sll
+ ; CHECK-DAG: ldi.w [[R1:\$w[0-9]+]], -8
+ ; CHECK-NOT: sll
+ store volatile <4 x i32> %2, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R1]], 0($4)
+
+ ret void
+ ; CHECK-LABEL: .size shl_v4i32
+}
diff --git a/test/CodeGen/Mips/msa/shuffle.ll b/test/CodeGen/Mips/msa/shuffle.ll
new file mode 100644
index 0000000..316c669
--- /dev/null
+++ b/test/CodeGen/Mips/msa/shuffle.ll
@@ -0,0 +1,803 @@
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+define void @vshf_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: vshf_v16i8_0:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], %lo
+ ; CHECK-DAG: vshf.b [[R3]], [[R1]], [[R1]]
+ store <16 x i8> %2, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size vshf_v16i8_0
+}
+
+define void @vshf_v16i8_1(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: vshf_v16i8_1:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ; CHECK-DAG: splati.b [[R3:\$w[0-9]+]], [[R1]][1]
+ store <16 x i8> %2, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size vshf_v16i8_1
+}
+
+define void @vshf_v16i8_2(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: vshf_v16i8_2:
+
+ %1 = load <16 x i8>* %a
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 16>
+ ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], %lo
+ ; CHECK-DAG: vshf.b [[R3]], [[R2]], [[R2]]
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size vshf_v16i8_2
+}
+
+define void @vshf_v16i8_3(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: vshf_v16i8_3:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 17, i32 24, i32 25, i32 18, i32 19, i32 20, i32 28, i32 19, i32 1, i32 8, i32 9, i32 2, i32 3, i32 4, i32 12, i32 3>
+ ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], %lo
+ ; CHECK-DAG: vshf.b [[R3]], [[R1]], [[R2]]
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size vshf_v16i8_3
+}
+
+define void @vshf_v16i8_4(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: vshf_v16i8_4:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = shufflevector <16 x i8> %1, <16 x i8> %1, <16 x i32> <i32 1, i32 17, i32 1, i32 17, i32 1, i32 17, i32 1, i32 17, i32 1, i32 17, i32 1, i32 17, i32 1, i32 17, i32 1, i32 17>
+ ; CHECK-DAG: splati.b [[R3:\$w[0-9]+]], [[R1]][1]
+ store <16 x i8> %2, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size vshf_v16i8_4
+}
+
+define void @vshf_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: vshf_v8i16_0:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+ ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], %lo
+ ; CHECK-DAG: vshf.h [[R3]], [[R1]], [[R1]]
+ store <8 x i16> %2, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size vshf_v8i16_0
+}
+
+define void @vshf_v8i16_1(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: vshf_v8i16_1:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ; CHECK-DAG: splati.h [[R3:\$w[0-9]+]], [[R1]][1]
+ store <8 x i16> %2, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size vshf_v8i16_1
+}
+
+define void @vshf_v8i16_2(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: vshf_v8i16_2:
+
+ %1 = load <8 x i16>* %a
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 8>
+ ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], %lo
+ ; CHECK-DAG: vshf.h [[R3]], [[R2]], [[R2]]
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size vshf_v8i16_2
+}
+
+define void @vshf_v8i16_3(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: vshf_v8i16_3:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 8, i32 9, i32 2, i32 3, i32 4, i32 12, i32 3>
+ ; CHECK-DAG: ld.h [[R3:\$w[0-9]+]], %lo
+ ; CHECK-DAG: vshf.h [[R3]], [[R1]], [[R2]]
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size vshf_v8i16_3
+}
+
+define void @vshf_v8i16_4(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: vshf_v8i16_4:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = shufflevector <8 x i16> %1, <8 x i16> %1, <8 x i32> <i32 1, i32 9, i32 1, i32 9, i32 1, i32 9, i32 1, i32 9>
+ ; CHECK-DAG: splati.h [[R3:\$w[0-9]+]], [[R1]][1]
+ store <8 x i16> %2, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size vshf_v8i16_4
+}
+
+; Note: v4i32 only has one 4-element set, so any single-vector shuffle can be
+; encoded in the shf.w immediate and it's impossible to get a vshf.w
+; instruction when using a single vector.
+
+define void @vshf_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: vshf_v4i32_0:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 27
+ store <4 x i32> %2, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size vshf_v4i32_0
+}
+
+define void @vshf_v4i32_1(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: vshf_v4i32_1:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 85
+ store <4 x i32> %2, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size vshf_v4i32_1
+}
+
+define void @vshf_v4i32_2(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: vshf_v4i32_2:
+
+ %1 = load <4 x i32>* %a
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 4, i32 5, i32 6, i32 4>
+ ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R2]], 36
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size vshf_v4i32_2
+}
+
+define void @vshf_v4i32_3(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: vshf_v4i32_3:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 1, i32 5, i32 6, i32 4>
+ ; CHECK-DAG: ld.w [[R3:\$w[0-9]+]], %lo
+ ; CHECK-DAG: vshf.w [[R3]], [[R1]], [[R2]]
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size vshf_v4i32_3
+}
+
+define void @vshf_v4i32_4(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: vshf_v4i32_4:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = shufflevector <4 x i32> %1, <4 x i32> %1, <4 x i32> <i32 1, i32 5, i32 5, i32 1>
+ ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 85
+ store <4 x i32> %2, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size vshf_v4i32_4
+}
+
+define void @vshf_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: vshf_v2i64_0:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+ ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], %lo
+ ; CHECK-DAG: vshf.d [[R3]], [[R1]], [[R1]]
+ store <2 x i64> %2, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size vshf_v2i64_0
+}
+
+define void @vshf_v2i64_1(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: vshf_v2i64_1:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
+ ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][1]
+ store <2 x i64> %2, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size vshf_v2i64_1
+}
+
+define void @vshf_v2i64_2(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: vshf_v2i64_2:
+
+ %1 = load <2 x i64>* %a
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 3, i32 2>
+ ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], %lo
+ ; CHECK-DAG: vshf.d [[R3]], [[R2]], [[R2]]
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size vshf_v2i64_2
+}
+
+define void @vshf_v2i64_3(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: vshf_v2i64_3:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 2>
+ ; CHECK-DAG: ld.d [[R3:\$w[0-9]+]], %lo
+ ; CHECK-DAG: vshf.d [[R3]], [[R1]], [[R2]]
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size vshf_v2i64_3
+}
+
+define void @vshf_v2i64_4(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: vshf_v2i64_4:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = shufflevector <2 x i64> %1, <2 x i64> %1, <2 x i32> <i32 1, i32 3>
+ ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][1]
+ store <2 x i64> %2, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size vshf_v2i64_4
+}
+
+define void @shf_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: shf_v16i8_0:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 2, i32 0, i32 5, i32 7, i32 6, i32 4, i32 9, i32 11, i32 10, i32 8, i32 13, i32 15, i32 14, i32 12>
+ ; CHECK-DAG: shf.b [[R3:\$w[0-9]+]], [[R1]], 45
+ store <16 x i8> %2, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size shf_v16i8_0
+}
+
+define void @shf_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: shf_v8i16_0:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+ ; CHECK-DAG: shf.h [[R3:\$w[0-9]+]], [[R1]], 27
+ store <8 x i16> %2, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size shf_v8i16_0
+}
+
+define void @shf_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: shf_v4i32_0:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 27
+ store <4 x i32> %2, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size shf_v4i32_0
+}
+
+; shf.d does not exist
+
+define void @ilvev_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: ilvev_v16i8_0:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+ <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+ ; CHECK-DAG: ilvev.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ilvev_v16i8_0
+}
+
+define void @ilvev_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: ilvev_v8i16_0:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+ ; CHECK-DAG: ilvev.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ilvev_v8i16_0
+}
+
+define void @ilvev_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: ilvev_v4i32_0:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+ ; CHECK-DAG: ilvev.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ilvev_v4i32_0
+}
+
+define void @ilvev_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: ilvev_v2i64_0:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
+ ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ilvev_v2i64_0
+}
+
+define void @ilvod_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: ilvod_v16i8_0:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+ <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+ ; CHECK-DAG: ilvod.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ilvod_v16i8_0
+}
+
+define void @ilvod_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: ilvod_v8i16_0:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+ ; CHECK-DAG: ilvod.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ilvod_v8i16_0
+}
+
+define void @ilvod_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: ilvod_v4i32_0:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+ ; CHECK-DAG: ilvod.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ilvod_v4i32_0
+}
+
+define void @ilvod_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: ilvod_v2i64_0:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 3>
+ ; CHECK-DAG: ilvod.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ilvod_v2i64_0
+}
+
+define void @ilvl_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: ilvl_v16i8_0:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+ <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+ ; CHECK-DAG: ilvl.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ilvl_v16i8_0
+}
+
+define void @ilvl_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: ilvl_v8i16_0:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+ ; CHECK-DAG: ilvl.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ilvl_v8i16_0
+}
+
+define void @ilvl_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: ilvl_v4i32_0:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+ ; CHECK-DAG: ilvl.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ilvl_v4i32_0
+}
+
+define void @ilvl_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: ilvl_v2i64_0:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
+ ; ilvl.d and ilvev.d are equivalent for v2i64
+ ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ilvl_v2i64_0
+}
+
+define void @ilvr_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: ilvr_v16i8_0:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+ <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ ; CHECK-DAG: ilvr.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ilvr_v16i8_0
+}
+
+define void @ilvr_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: ilvr_v8i16_0:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ; CHECK-DAG: ilvr.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ilvr_v8i16_0
+}
+
+define void @ilvr_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: ilvr_v4i32_0:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+ ; CHECK-DAG: ilvr.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ilvr_v4i32_0
+}
+
+define void @ilvr_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: ilvr_v2i64_0:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 3>
+ ; ilvr.d and ilvod.d are equivalent for v2i64
+ ; CHECK-DAG: ilvod.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size ilvr_v2i64_0
+}
+
+define void @pckev_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: pckev_v16i8_0:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+ <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ ; CHECK-DAG: pckev.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size pckev_v16i8_0
+}
+
+define void @pckev_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: pckev_v8i16_0:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ ; CHECK-DAG: pckev.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size pckev_v8i16_0
+}
+
+define void @pckev_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: pckev_v4i32_0:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ ; CHECK-DAG: pckev.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size pckev_v4i32_0
+}
+
+define void @pckev_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: pckev_v2i64_0:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 0, i32 2>
+ ; pckev.d and ilvev.d are equivalent for v2i64
+ ; CHECK-DAG: ilvev.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size pckev_v2i64_0
+}
+
+define void @pckod_v16i8_0(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind {
+ ; CHECK: pckod_v16i8_0:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <16 x i8>* %b
+ ; CHECK-DAG: ld.b [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <16 x i8> %1, <16 x i8> %2,
+ <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ ; CHECK-DAG: pckod.b [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <16 x i8> %3, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size pckod_v16i8_0
+}
+
+define void @pckod_v8i16_0(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind {
+ ; CHECK: pckod_v8i16_0:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <8 x i16>* %b
+ ; CHECK-DAG: ld.h [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ ; CHECK-DAG: pckod.h [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <8 x i16> %3, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size pckod_v8i16_0
+}
+
+define void @pckod_v4i32_0(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind {
+ ; CHECK: pckod_v4i32_0:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <4 x i32>* %b
+ ; CHECK-DAG: ld.w [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <4 x i32> %1, <4 x i32> %2, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ ; CHECK-DAG: pckod.w [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <4 x i32> %3, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size pckod_v4i32_0
+}
+
+define void @pckod_v2i64_0(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+ ; CHECK: pckod_v2i64_0:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = load <2 x i64>* %b
+ ; CHECK-DAG: ld.d [[R2:\$w[0-9]+]], 0($6)
+ %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <2 x i32> <i32 1, i32 3>
+ ; pckod.d and ilvod.d are equivalent for v2i64
+ ; CHECK-DAG: ilvod.d [[R3:\$w[0-9]+]], [[R1]], [[R2]]
+ store <2 x i64> %3, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size pckod_v2i64_0
+}
+
+define void @splati_v16i8_0(<16 x i8>* %c, <16 x i8>* %a) nounwind {
+ ; CHECK: splati_v16i8_0:
+
+ %1 = load <16 x i8>* %a
+ ; CHECK-DAG: ld.b [[R1:\$w[0-9]+]], 0($5)
+ %2 = shufflevector <16 x i8> %1, <16 x i8> undef,
+ <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+ ; CHECK-DAG: splati.b [[R3:\$w[0-9]+]], [[R1]][4]
+ store <16 x i8> %2, <16 x i8>* %c
+ ; CHECK-DAG: st.b [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size splati_v16i8_0
+}
+
+define void @splati_v8i16_0(<8 x i16>* %c, <8 x i16>* %a) nounwind {
+ ; CHECK: splati_v8i16_0:
+
+ %1 = load <8 x i16>* %a
+ ; CHECK-DAG: ld.h [[R1:\$w[0-9]+]], 0($5)
+ %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+ ; CHECK-DAG: splati.h [[R3:\$w[0-9]+]], [[R1]][4]
+ store <8 x i16> %2, <8 x i16>* %c
+ ; CHECK-DAG: st.h [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size splati_v8i16_0
+}
+
+define void @splati_v4i32_0(<4 x i32>* %c, <4 x i32>* %a) nounwind {
+ ; CHECK: splati_v4i32_0:
+
+ %1 = load <4 x i32>* %a
+ ; CHECK-DAG: ld.w [[R1:\$w[0-9]+]], 0($5)
+ %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ ; shf.w and splati.w are equivalent
+ ; CHECK-DAG: shf.w [[R3:\$w[0-9]+]], [[R1]], 255
+ store <4 x i32> %2, <4 x i32>* %c
+ ; CHECK-DAG: st.w [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size splati_v4i32_0
+}
+
+define void @splati_v2i64_0(<2 x i64>* %c, <2 x i64>* %a) nounwind {
+ ; CHECK: splati_v2i64_0:
+
+ %1 = load <2 x i64>* %a
+ ; CHECK-DAG: ld.d [[R1:\$w[0-9]+]], 0($5)
+ %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
+ ; CHECK-DAG: splati.d [[R3:\$w[0-9]+]], [[R1]][1]
+ store <2 x i64> %2, <2 x i64>* %c
+ ; CHECK-DAG: st.d [[R3]], 0($4)
+
+ ret void
+ ; CHECK: .size splati_v2i64_0
+}
diff --git a/test/CodeGen/Mips/msa/special.ll b/test/CodeGen/Mips/msa/special.ll
new file mode 100644
index 0000000..60a4369
--- /dev/null
+++ b/test/CodeGen/Mips/msa/special.ll
@@ -0,0 +1,26 @@
+; Test the MSA intrinsics that are encoded with the SPECIAL instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+
+define i32 @llvm_mips_lsa_test(i32 %a, i32 %b) nounwind {
+entry:
+ %0 = tail call i32 @llvm.mips.lsa(i32 %a, i32 %b, i32 2)
+ ret i32 %0
+}
+
+declare i32 @llvm.mips.lsa(i32, i32, i32) nounwind
+
+; CHECK: llvm_mips_lsa_test:
+; CHECK: lsa {{\$[0-9]+}}, {{\$[0-9]+}}, {{\$[0-9]+}}, 2
+; CHECK: .size llvm_mips_lsa_test
+
+define i32 @lsa_test(i32 %a, i32 %b) nounwind {
+entry:
+ %0 = shl i32 %b, 2
+ %1 = add i32 %a, %0
+ ret i32 %1
+}
+
+; CHECK: lsa_test:
+; CHECK: lsa {{\$[0-9]+}}, {{\$[0-9]+}}, {{\$[0-9]+}}, 2
+; CHECK: .size lsa_test
diff --git a/test/CodeGen/Mips/msa/spill.ll b/test/CodeGen/Mips/msa/spill.ll
new file mode 100644
index 0000000..66f896a
--- /dev/null
+++ b/test/CodeGen/Mips/msa/spill.ll
@@ -0,0 +1,601 @@
+; Test that the correct instruction is chosen for spill and reload by trying
+; to have 33 MSA registers live simultaneously (one more than the 32 available
+; $w registers), which forces at least one value to be spilled.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+define i32 @test_i8(<16 x i8>* %p0, <16 x i8>* %q1) nounwind {
+entry:
+ %p1 = getelementptr <16 x i8>* %p0, i32 1
+ %p2 = getelementptr <16 x i8>* %p0, i32 2
+ %p3 = getelementptr <16 x i8>* %p0, i32 3
+ %p4 = getelementptr <16 x i8>* %p0, i32 4
+ %p5 = getelementptr <16 x i8>* %p0, i32 5
+ %p6 = getelementptr <16 x i8>* %p0, i32 6
+ %p7 = getelementptr <16 x i8>* %p0, i32 7
+ %p8 = getelementptr <16 x i8>* %p0, i32 8
+ %p9 = getelementptr <16 x i8>* %p0, i32 9
+ %p10 = getelementptr <16 x i8>* %p0, i32 10
+ %p11 = getelementptr <16 x i8>* %p0, i32 11
+ %p12 = getelementptr <16 x i8>* %p0, i32 12
+ %p13 = getelementptr <16 x i8>* %p0, i32 13
+ %p14 = getelementptr <16 x i8>* %p0, i32 14
+ %p15 = getelementptr <16 x i8>* %p0, i32 15
+ %p16 = getelementptr <16 x i8>* %p0, i32 16
+ %p17 = getelementptr <16 x i8>* %p0, i32 17
+ %p18 = getelementptr <16 x i8>* %p0, i32 18
+ %p19 = getelementptr <16 x i8>* %p0, i32 19
+ %p20 = getelementptr <16 x i8>* %p0, i32 20
+ %p21 = getelementptr <16 x i8>* %p0, i32 21
+ %p22 = getelementptr <16 x i8>* %p0, i32 22
+ %p23 = getelementptr <16 x i8>* %p0, i32 23
+ %p24 = getelementptr <16 x i8>* %p0, i32 24
+ %p25 = getelementptr <16 x i8>* %p0, i32 25
+ %p26 = getelementptr <16 x i8>* %p0, i32 26
+ %p27 = getelementptr <16 x i8>* %p0, i32 27
+ %p28 = getelementptr <16 x i8>* %p0, i32 28
+ %p29 = getelementptr <16 x i8>* %p0, i32 29
+ %p30 = getelementptr <16 x i8>* %p0, i32 30
+ %p31 = getelementptr <16 x i8>* %p0, i32 31
+ %p32 = getelementptr <16 x i8>* %p0, i32 32
+ %p33 = getelementptr <16 x i8>* %p0, i32 33
+ %0 = load <16 x i8>* %p0, align 16
+ %1 = load <16 x i8>* %p1, align 16
+ %2 = load <16 x i8>* %p2, align 16
+ %3 = load <16 x i8>* %p3, align 16
+ %4 = load <16 x i8>* %p4, align 16
+ %5 = load <16 x i8>* %p5, align 16
+ %6 = load <16 x i8>* %p6, align 16
+ %7 = load <16 x i8>* %p7, align 16
+ %8 = load <16 x i8>* %p8, align 16
+ %9 = load <16 x i8>* %p9, align 16
+ %10 = load <16 x i8>* %p10, align 16
+ %11 = load <16 x i8>* %p11, align 16
+ %12 = load <16 x i8>* %p12, align 16
+ %13 = load <16 x i8>* %p13, align 16
+ %14 = load <16 x i8>* %p14, align 16
+ %15 = load <16 x i8>* %p15, align 16
+ %16 = load <16 x i8>* %p16, align 16
+ %17 = load <16 x i8>* %p17, align 16
+ %18 = load <16 x i8>* %p18, align 16
+ %19 = load <16 x i8>* %p19, align 16
+ %20 = load <16 x i8>* %p20, align 16
+ %21 = load <16 x i8>* %p21, align 16
+ %22 = load <16 x i8>* %p22, align 16
+ %23 = load <16 x i8>* %p23, align 16
+ %24 = load <16 x i8>* %p24, align 16
+ %25 = load <16 x i8>* %p25, align 16
+ %26 = load <16 x i8>* %p26, align 16
+ %27 = load <16 x i8>* %p27, align 16
+ %28 = load <16 x i8>* %p28, align 16
+ %29 = load <16 x i8>* %p29, align 16
+ %30 = load <16 x i8>* %p30, align 16
+ %31 = load <16 x i8>* %p31, align 16
+ %32 = load <16 x i8>* %p32, align 16
+ %33 = load <16 x i8>* %p33, align 16
+ %r1 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %1)
+ %r2 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r1, <16 x i8> %2)
+ %r3 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r2, <16 x i8> %3)
+ %r4 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r3, <16 x i8> %4)
+ %r5 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r4, <16 x i8> %5)
+ %r6 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r5, <16 x i8> %6)
+ %r7 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r6, <16 x i8> %7)
+ %r8 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r7, <16 x i8> %8)
+ %r9 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r8, <16 x i8> %9)
+ %r10 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r9, <16 x i8> %10)
+ %r11 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r10, <16 x i8> %11)
+ %r12 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r11, <16 x i8> %12)
+ %r13 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r12, <16 x i8> %13)
+ %r14 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r13, <16 x i8> %14)
+ %r15 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r14, <16 x i8> %15)
+ %r16 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r15, <16 x i8> %16)
+ %r17 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r16, <16 x i8> %17)
+ %r18 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r17, <16 x i8> %18)
+ %r19 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r18, <16 x i8> %19)
+ %r20 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r19, <16 x i8> %20)
+ %r21 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r20, <16 x i8> %21)
+ %r22 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r21, <16 x i8> %22)
+ %r23 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r22, <16 x i8> %23)
+ %r24 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r23, <16 x i8> %24)
+ %r25 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r24, <16 x i8> %25)
+ %r26 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r25, <16 x i8> %26)
+ %r27 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r26, <16 x i8> %27)
+ %r28 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r27, <16 x i8> %28)
+ %r29 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r28, <16 x i8> %29)
+ %r30 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r29, <16 x i8> %30)
+ %r31 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r30, <16 x i8> %31)
+ %r32 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r31, <16 x i8> %32)
+ %r33 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r32, <16 x i8> %33)
+ %rx1 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %r33, <16 x i8> %1)
+ %rx2 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx1, <16 x i8> %2)
+ %rx3 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx2, <16 x i8> %3)
+ %rx4 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx3, <16 x i8> %4)
+ %rx5 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx4, <16 x i8> %5)
+ %rx6 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx5, <16 x i8> %6)
+ %rx7 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx6, <16 x i8> %7)
+ %rx8 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx7, <16 x i8> %8)
+ %rx9 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx8, <16 x i8> %9)
+ %rx10 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx9, <16 x i8> %10)
+ %rx11 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx10, <16 x i8> %11)
+ %rx12 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx11, <16 x i8> %12)
+ %rx13 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx12, <16 x i8> %13)
+ %rx14 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx13, <16 x i8> %14)
+ %rx15 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx14, <16 x i8> %15)
+ %rx16 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx15, <16 x i8> %16)
+ %rx17 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx16, <16 x i8> %17)
+ %rx18 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx17, <16 x i8> %18)
+ %rx19 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx18, <16 x i8> %19)
+ %rx20 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx19, <16 x i8> %20)
+ %rx21 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx20, <16 x i8> %21)
+ %rx22 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx21, <16 x i8> %22)
+ %rx23 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx22, <16 x i8> %23)
+ %rx24 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx23, <16 x i8> %24)
+ %rx25 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx24, <16 x i8> %25)
+ %rx26 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx25, <16 x i8> %26)
+ %rx27 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx26, <16 x i8> %27)
+ %rx28 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx27, <16 x i8> %28)
+ %rx29 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx28, <16 x i8> %29)
+ %rx30 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx29, <16 x i8> %30)
+ %rx31 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx30, <16 x i8> %31)
+ %rx32 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx31, <16 x i8> %32)
+ %rx33 = call <16 x i8> @llvm.mips.addv.b(<16 x i8> %rx32, <16 x i8> %33)
+ %res = call i32 @llvm.mips.copy.s.b(<16 x i8> %rx33, i32 0)
+ ret i32 %res
+}
+
+declare <16 x i8> @llvm.mips.addv.b(<16 x i8>, <16 x i8>) nounwind
+declare i32 @llvm.mips.copy.s.b(<16 x i8>, i32) nounwind
+
+; CHECK: test_i8:
+; CHECK: st.b {{.*}} Spill
+; CHECK: st.b {{.*}} Spill
+; CHECK: ld.b {{.*}} Reload
+; CHECK: ld.b {{.*}} Reload
+; CHECK: .size
+
+define i32 @test_i16(<8 x i16>* %p0, <8 x i16>* %q1) nounwind {
+entry:
+ %p1 = getelementptr <8 x i16>* %p0, i32 1
+ %p2 = getelementptr <8 x i16>* %p0, i32 2
+ %p3 = getelementptr <8 x i16>* %p0, i32 3
+ %p4 = getelementptr <8 x i16>* %p0, i32 4
+ %p5 = getelementptr <8 x i16>* %p0, i32 5
+ %p6 = getelementptr <8 x i16>* %p0, i32 6
+ %p7 = getelementptr <8 x i16>* %p0, i32 7
+ %p8 = getelementptr <8 x i16>* %p0, i32 8
+ %p9 = getelementptr <8 x i16>* %p0, i32 9
+ %p10 = getelementptr <8 x i16>* %p0, i32 10
+ %p11 = getelementptr <8 x i16>* %p0, i32 11
+ %p12 = getelementptr <8 x i16>* %p0, i32 12
+ %p13 = getelementptr <8 x i16>* %p0, i32 13
+ %p14 = getelementptr <8 x i16>* %p0, i32 14
+ %p15 = getelementptr <8 x i16>* %p0, i32 15
+ %p16 = getelementptr <8 x i16>* %p0, i32 16
+ %p17 = getelementptr <8 x i16>* %p0, i32 17
+ %p18 = getelementptr <8 x i16>* %p0, i32 18
+ %p19 = getelementptr <8 x i16>* %p0, i32 19
+ %p20 = getelementptr <8 x i16>* %p0, i32 20
+ %p21 = getelementptr <8 x i16>* %p0, i32 21
+ %p22 = getelementptr <8 x i16>* %p0, i32 22
+ %p23 = getelementptr <8 x i16>* %p0, i32 23
+ %p24 = getelementptr <8 x i16>* %p0, i32 24
+ %p25 = getelementptr <8 x i16>* %p0, i32 25
+ %p26 = getelementptr <8 x i16>* %p0, i32 26
+ %p27 = getelementptr <8 x i16>* %p0, i32 27
+ %p28 = getelementptr <8 x i16>* %p0, i32 28
+ %p29 = getelementptr <8 x i16>* %p0, i32 29
+ %p30 = getelementptr <8 x i16>* %p0, i32 30
+ %p31 = getelementptr <8 x i16>* %p0, i32 31
+ %p32 = getelementptr <8 x i16>* %p0, i32 32
+ %p33 = getelementptr <8 x i16>* %p0, i32 33
+ %0 = load <8 x i16>* %p0, align 16
+ %1 = load <8 x i16>* %p1, align 16
+ %2 = load <8 x i16>* %p2, align 16
+ %3 = load <8 x i16>* %p3, align 16
+ %4 = load <8 x i16>* %p4, align 16
+ %5 = load <8 x i16>* %p5, align 16
+ %6 = load <8 x i16>* %p6, align 16
+ %7 = load <8 x i16>* %p7, align 16
+ %8 = load <8 x i16>* %p8, align 16
+ %9 = load <8 x i16>* %p9, align 16
+ %10 = load <8 x i16>* %p10, align 16
+ %11 = load <8 x i16>* %p11, align 16
+ %12 = load <8 x i16>* %p12, align 16
+ %13 = load <8 x i16>* %p13, align 16
+ %14 = load <8 x i16>* %p14, align 16
+ %15 = load <8 x i16>* %p15, align 16
+ %16 = load <8 x i16>* %p16, align 16
+ %17 = load <8 x i16>* %p17, align 16
+ %18 = load <8 x i16>* %p18, align 16
+ %19 = load <8 x i16>* %p19, align 16
+ %20 = load <8 x i16>* %p20, align 16
+ %21 = load <8 x i16>* %p21, align 16
+ %22 = load <8 x i16>* %p22, align 16
+ %23 = load <8 x i16>* %p23, align 16
+ %24 = load <8 x i16>* %p24, align 16
+ %25 = load <8 x i16>* %p25, align 16
+ %26 = load <8 x i16>* %p26, align 16
+ %27 = load <8 x i16>* %p27, align 16
+ %28 = load <8 x i16>* %p28, align 16
+ %29 = load <8 x i16>* %p29, align 16
+ %30 = load <8 x i16>* %p30, align 16
+ %31 = load <8 x i16>* %p31, align 16
+ %32 = load <8 x i16>* %p32, align 16
+ %33 = load <8 x i16>* %p33, align 16
+ %r1 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %1)
+ %r2 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r1, <8 x i16> %2)
+ %r3 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r2, <8 x i16> %3)
+ %r4 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r3, <8 x i16> %4)
+ %r5 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r4, <8 x i16> %5)
+ %r6 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r5, <8 x i16> %6)
+ %r7 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r6, <8 x i16> %7)
+ %r8 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r7, <8 x i16> %8)
+ %r9 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r8, <8 x i16> %9)
+ %r10 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r9, <8 x i16> %10)
+ %r11 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r10, <8 x i16> %11)
+ %r12 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r11, <8 x i16> %12)
+ %r13 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r12, <8 x i16> %13)
+ %r14 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r13, <8 x i16> %14)
+ %r15 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r14, <8 x i16> %15)
+ %r16 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r15, <8 x i16> %16)
+ %r17 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r16, <8 x i16> %17)
+ %r18 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r17, <8 x i16> %18)
+ %r19 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r18, <8 x i16> %19)
+ %r20 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r19, <8 x i16> %20)
+ %r21 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r20, <8 x i16> %21)
+ %r22 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r21, <8 x i16> %22)
+ %r23 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r22, <8 x i16> %23)
+ %r24 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r23, <8 x i16> %24)
+ %r25 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r24, <8 x i16> %25)
+ %r26 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r25, <8 x i16> %26)
+ %r27 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r26, <8 x i16> %27)
+ %r28 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r27, <8 x i16> %28)
+ %r29 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r28, <8 x i16> %29)
+ %r30 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r29, <8 x i16> %30)
+ %r31 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r30, <8 x i16> %31)
+ %r32 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r31, <8 x i16> %32)
+ %r33 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r32, <8 x i16> %33)
+ %rx1 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %r33, <8 x i16> %1)
+ %rx2 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx1, <8 x i16> %2)
+ %rx3 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx2, <8 x i16> %3)
+ %rx4 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx3, <8 x i16> %4)
+ %rx5 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx4, <8 x i16> %5)
+ %rx6 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx5, <8 x i16> %6)
+ %rx7 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx6, <8 x i16> %7)
+ %rx8 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx7, <8 x i16> %8)
+ %rx9 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx8, <8 x i16> %9)
+ %rx10 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx9, <8 x i16> %10)
+ %rx11 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx10, <8 x i16> %11)
+ %rx12 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx11, <8 x i16> %12)
+ %rx13 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx12, <8 x i16> %13)
+ %rx14 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx13, <8 x i16> %14)
+ %rx15 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx14, <8 x i16> %15)
+ %rx16 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx15, <8 x i16> %16)
+ %rx17 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx16, <8 x i16> %17)
+ %rx18 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx17, <8 x i16> %18)
+ %rx19 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx18, <8 x i16> %19)
+ %rx20 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx19, <8 x i16> %20)
+ %rx21 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx20, <8 x i16> %21)
+ %rx22 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx21, <8 x i16> %22)
+ %rx23 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx22, <8 x i16> %23)
+ %rx24 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx23, <8 x i16> %24)
+ %rx25 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx24, <8 x i16> %25)
+ %rx26 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx25, <8 x i16> %26)
+ %rx27 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx26, <8 x i16> %27)
+ %rx28 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx27, <8 x i16> %28)
+ %rx29 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx28, <8 x i16> %29)
+ %rx30 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx29, <8 x i16> %30)
+ %rx31 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx30, <8 x i16> %31)
+ %rx32 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx31, <8 x i16> %32)
+ %rx33 = call <8 x i16> @llvm.mips.addv.h(<8 x i16> %rx32, <8 x i16> %33)
+ %res = call i32 @llvm.mips.copy.s.h(<8 x i16> %rx33, i32 0)
+ ret i32 %res
+}
+
+declare <8 x i16> @llvm.mips.addv.h(<8 x i16>, <8 x i16>) nounwind
+declare i32 @llvm.mips.copy.s.h(<8 x i16>, i32) nounwind
+
+; CHECK: test_i16:
+; CHECK: st.h {{.*}} Spill
+; CHECK: st.h {{.*}} Spill
+; CHECK: ld.h {{.*}} Reload
+; CHECK: ld.h {{.*}} Reload
+; CHECK: .size
+
+define i32 @test_i32(<4 x i32>* %p0, <4 x i32>* %q1) nounwind {
+entry:
+ %p1 = getelementptr <4 x i32>* %p0, i32 1
+ %p2 = getelementptr <4 x i32>* %p0, i32 2
+ %p3 = getelementptr <4 x i32>* %p0, i32 3
+ %p4 = getelementptr <4 x i32>* %p0, i32 4
+ %p5 = getelementptr <4 x i32>* %p0, i32 5
+ %p6 = getelementptr <4 x i32>* %p0, i32 6
+ %p7 = getelementptr <4 x i32>* %p0, i32 7
+ %p8 = getelementptr <4 x i32>* %p0, i32 8
+ %p9 = getelementptr <4 x i32>* %p0, i32 9
+ %p10 = getelementptr <4 x i32>* %p0, i32 10
+ %p11 = getelementptr <4 x i32>* %p0, i32 11
+ %p12 = getelementptr <4 x i32>* %p0, i32 12
+ %p13 = getelementptr <4 x i32>* %p0, i32 13
+ %p14 = getelementptr <4 x i32>* %p0, i32 14
+ %p15 = getelementptr <4 x i32>* %p0, i32 15
+ %p16 = getelementptr <4 x i32>* %p0, i32 16
+ %p17 = getelementptr <4 x i32>* %p0, i32 17
+ %p18 = getelementptr <4 x i32>* %p0, i32 18
+ %p19 = getelementptr <4 x i32>* %p0, i32 19
+ %p20 = getelementptr <4 x i32>* %p0, i32 20
+ %p21 = getelementptr <4 x i32>* %p0, i32 21
+ %p22 = getelementptr <4 x i32>* %p0, i32 22
+ %p23 = getelementptr <4 x i32>* %p0, i32 23
+ %p24 = getelementptr <4 x i32>* %p0, i32 24
+ %p25 = getelementptr <4 x i32>* %p0, i32 25
+ %p26 = getelementptr <4 x i32>* %p0, i32 26
+ %p27 = getelementptr <4 x i32>* %p0, i32 27
+ %p28 = getelementptr <4 x i32>* %p0, i32 28
+ %p29 = getelementptr <4 x i32>* %p0, i32 29
+ %p30 = getelementptr <4 x i32>* %p0, i32 30
+ %p31 = getelementptr <4 x i32>* %p0, i32 31
+ %p32 = getelementptr <4 x i32>* %p0, i32 32
+ %p33 = getelementptr <4 x i32>* %p0, i32 33
+ %0 = load <4 x i32>* %p0, align 16
+ %1 = load <4 x i32>* %p1, align 16
+ %2 = load <4 x i32>* %p2, align 16
+ %3 = load <4 x i32>* %p3, align 16
+ %4 = load <4 x i32>* %p4, align 16
+ %5 = load <4 x i32>* %p5, align 16
+ %6 = load <4 x i32>* %p6, align 16
+ %7 = load <4 x i32>* %p7, align 16
+ %8 = load <4 x i32>* %p8, align 16
+ %9 = load <4 x i32>* %p9, align 16
+ %10 = load <4 x i32>* %p10, align 16
+ %11 = load <4 x i32>* %p11, align 16
+ %12 = load <4 x i32>* %p12, align 16
+ %13 = load <4 x i32>* %p13, align 16
+ %14 = load <4 x i32>* %p14, align 16
+ %15 = load <4 x i32>* %p15, align 16
+ %16 = load <4 x i32>* %p16, align 16
+ %17 = load <4 x i32>* %p17, align 16
+ %18 = load <4 x i32>* %p18, align 16
+ %19 = load <4 x i32>* %p19, align 16
+ %20 = load <4 x i32>* %p20, align 16
+ %21 = load <4 x i32>* %p21, align 16
+ %22 = load <4 x i32>* %p22, align 16
+ %23 = load <4 x i32>* %p23, align 16
+ %24 = load <4 x i32>* %p24, align 16
+ %25 = load <4 x i32>* %p25, align 16
+ %26 = load <4 x i32>* %p26, align 16
+ %27 = load <4 x i32>* %p27, align 16
+ %28 = load <4 x i32>* %p28, align 16
+ %29 = load <4 x i32>* %p29, align 16
+ %30 = load <4 x i32>* %p30, align 16
+ %31 = load <4 x i32>* %p31, align 16
+ %32 = load <4 x i32>* %p32, align 16
+ %33 = load <4 x i32>* %p33, align 16
+ %r1 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %1)
+ %r2 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r1, <4 x i32> %2)
+ %r3 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r2, <4 x i32> %3)
+ %r4 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r3, <4 x i32> %4)
+ %r5 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r4, <4 x i32> %5)
+ %r6 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r5, <4 x i32> %6)
+ %r7 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r6, <4 x i32> %7)
+ %r8 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r7, <4 x i32> %8)
+ %r9 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r8, <4 x i32> %9)
+ %r10 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r9, <4 x i32> %10)
+ %r11 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r10, <4 x i32> %11)
+ %r12 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r11, <4 x i32> %12)
+ %r13 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r12, <4 x i32> %13)
+ %r14 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r13, <4 x i32> %14)
+ %r15 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r14, <4 x i32> %15)
+ %r16 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r15, <4 x i32> %16)
+ %r17 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r16, <4 x i32> %17)
+ %r18 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r17, <4 x i32> %18)
+ %r19 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r18, <4 x i32> %19)
+ %r20 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r19, <4 x i32> %20)
+ %r21 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r20, <4 x i32> %21)
+ %r22 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r21, <4 x i32> %22)
+ %r23 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r22, <4 x i32> %23)
+ %r24 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r23, <4 x i32> %24)
+ %r25 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r24, <4 x i32> %25)
+ %r26 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r25, <4 x i32> %26)
+ %r27 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r26, <4 x i32> %27)
+ %r28 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r27, <4 x i32> %28)
+ %r29 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r28, <4 x i32> %29)
+ %r30 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r29, <4 x i32> %30)
+ %r31 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r30, <4 x i32> %31)
+ %r32 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r31, <4 x i32> %32)
+ %r33 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r32, <4 x i32> %33)
+ %rx1 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %r33, <4 x i32> %1)
+ %rx2 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx1, <4 x i32> %2)
+ %rx3 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx2, <4 x i32> %3)
+ %rx4 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx3, <4 x i32> %4)
+ %rx5 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx4, <4 x i32> %5)
+ %rx6 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx5, <4 x i32> %6)
+ %rx7 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx6, <4 x i32> %7)
+ %rx8 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx7, <4 x i32> %8)
+ %rx9 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx8, <4 x i32> %9)
+ %rx10 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx9, <4 x i32> %10)
+ %rx11 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx10, <4 x i32> %11)
+ %rx12 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx11, <4 x i32> %12)
+ %rx13 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx12, <4 x i32> %13)
+ %rx14 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx13, <4 x i32> %14)
+ %rx15 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx14, <4 x i32> %15)
+ %rx16 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx15, <4 x i32> %16)
+ %rx17 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx16, <4 x i32> %17)
+ %rx18 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx17, <4 x i32> %18)
+ %rx19 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx18, <4 x i32> %19)
+ %rx20 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx19, <4 x i32> %20)
+ %rx21 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx20, <4 x i32> %21)
+ %rx22 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx21, <4 x i32> %22)
+ %rx23 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx22, <4 x i32> %23)
+ %rx24 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx23, <4 x i32> %24)
+ %rx25 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx24, <4 x i32> %25)
+ %rx26 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx25, <4 x i32> %26)
+ %rx27 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx26, <4 x i32> %27)
+ %rx28 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx27, <4 x i32> %28)
+ %rx29 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx28, <4 x i32> %29)
+ %rx30 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx29, <4 x i32> %30)
+ %rx31 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx30, <4 x i32> %31)
+ %rx32 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx31, <4 x i32> %32)
+ %rx33 = call <4 x i32> @llvm.mips.addv.w(<4 x i32> %rx32, <4 x i32> %33)
+ %res = call i32 @llvm.mips.copy.s.w(<4 x i32> %rx33, i32 0)
+ ret i32 %res
+}
+
+declare <4 x i32> @llvm.mips.addv.w(<4 x i32>, <4 x i32>) nounwind
+declare i32 @llvm.mips.copy.s.w(<4 x i32>, i32) nounwind
+
+; CHECK: test_i32:
+; CHECK: st.w {{.*}} Spill
+; CHECK: st.w {{.*}} Spill
+; CHECK: ld.w {{.*}} Reload
+; CHECK: ld.w {{.*}} Reload
+; CHECK: .size
+
+define i32 @test_i64(<2 x i64>* %p0, <2 x i64>* %q1) nounwind {
+entry:
+ %p1 = getelementptr <2 x i64>* %p0, i32 1
+ %p2 = getelementptr <2 x i64>* %p0, i32 2
+ %p3 = getelementptr <2 x i64>* %p0, i32 3
+ %p4 = getelementptr <2 x i64>* %p0, i32 4
+ %p5 = getelementptr <2 x i64>* %p0, i32 5
+ %p6 = getelementptr <2 x i64>* %p0, i32 6
+ %p7 = getelementptr <2 x i64>* %p0, i32 7
+ %p8 = getelementptr <2 x i64>* %p0, i32 8
+ %p9 = getelementptr <2 x i64>* %p0, i32 9
+ %p10 = getelementptr <2 x i64>* %p0, i32 10
+ %p11 = getelementptr <2 x i64>* %p0, i32 11
+ %p12 = getelementptr <2 x i64>* %p0, i32 12
+ %p13 = getelementptr <2 x i64>* %p0, i32 13
+ %p14 = getelementptr <2 x i64>* %p0, i32 14
+ %p15 = getelementptr <2 x i64>* %p0, i32 15
+ %p16 = getelementptr <2 x i64>* %p0, i32 16
+ %p17 = getelementptr <2 x i64>* %p0, i32 17
+ %p18 = getelementptr <2 x i64>* %p0, i32 18
+ %p19 = getelementptr <2 x i64>* %p0, i32 19
+ %p20 = getelementptr <2 x i64>* %p0, i32 20
+ %p21 = getelementptr <2 x i64>* %p0, i32 21
+ %p22 = getelementptr <2 x i64>* %p0, i32 22
+ %p23 = getelementptr <2 x i64>* %p0, i32 23
+ %p24 = getelementptr <2 x i64>* %p0, i32 24
+ %p25 = getelementptr <2 x i64>* %p0, i32 25
+ %p26 = getelementptr <2 x i64>* %p0, i32 26
+ %p27 = getelementptr <2 x i64>* %p0, i32 27
+ %p28 = getelementptr <2 x i64>* %p0, i32 28
+ %p29 = getelementptr <2 x i64>* %p0, i32 29
+ %p30 = getelementptr <2 x i64>* %p0, i32 30
+ %p31 = getelementptr <2 x i64>* %p0, i32 31
+ %p32 = getelementptr <2 x i64>* %p0, i32 32
+ %p33 = getelementptr <2 x i64>* %p0, i32 33
+ %0 = load <2 x i64>* %p0, align 16
+ %1 = load <2 x i64>* %p1, align 16
+ %2 = load <2 x i64>* %p2, align 16
+ %3 = load <2 x i64>* %p3, align 16
+ %4 = load <2 x i64>* %p4, align 16
+ %5 = load <2 x i64>* %p5, align 16
+ %6 = load <2 x i64>* %p6, align 16
+ %7 = load <2 x i64>* %p7, align 16
+ %8 = load <2 x i64>* %p8, align 16
+ %9 = load <2 x i64>* %p9, align 16
+ %10 = load <2 x i64>* %p10, align 16
+ %11 = load <2 x i64>* %p11, align 16
+ %12 = load <2 x i64>* %p12, align 16
+ %13 = load <2 x i64>* %p13, align 16
+ %14 = load <2 x i64>* %p14, align 16
+ %15 = load <2 x i64>* %p15, align 16
+ %16 = load <2 x i64>* %p16, align 16
+ %17 = load <2 x i64>* %p17, align 16
+ %18 = load <2 x i64>* %p18, align 16
+ %19 = load <2 x i64>* %p19, align 16
+ %20 = load <2 x i64>* %p20, align 16
+ %21 = load <2 x i64>* %p21, align 16
+ %22 = load <2 x i64>* %p22, align 16
+ %23 = load <2 x i64>* %p23, align 16
+ %24 = load <2 x i64>* %p24, align 16
+ %25 = load <2 x i64>* %p25, align 16
+ %26 = load <2 x i64>* %p26, align 16
+ %27 = load <2 x i64>* %p27, align 16
+ %28 = load <2 x i64>* %p28, align 16
+ %29 = load <2 x i64>* %p29, align 16
+ %30 = load <2 x i64>* %p30, align 16
+ %31 = load <2 x i64>* %p31, align 16
+ %32 = load <2 x i64>* %p32, align 16
+ %33 = load <2 x i64>* %p33, align 16
+ %r1 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0, <2 x i64> %1)
+ %r2 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r1, <2 x i64> %2)
+ %r3 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r2, <2 x i64> %3)
+ %r4 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r3, <2 x i64> %4)
+ %r5 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r4, <2 x i64> %5)
+ %r6 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r5, <2 x i64> %6)
+ %r7 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r6, <2 x i64> %7)
+ %r8 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r7, <2 x i64> %8)
+ %r9 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r8, <2 x i64> %9)
+ %r10 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r9, <2 x i64> %10)
+ %r11 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r10, <2 x i64> %11)
+ %r12 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r11, <2 x i64> %12)
+ %r13 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r12, <2 x i64> %13)
+ %r14 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r13, <2 x i64> %14)
+ %r15 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r14, <2 x i64> %15)
+ %r16 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r15, <2 x i64> %16)
+ %r17 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r16, <2 x i64> %17)
+ %r18 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r17, <2 x i64> %18)
+ %r19 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r18, <2 x i64> %19)
+ %r20 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r19, <2 x i64> %20)
+ %r21 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r20, <2 x i64> %21)
+ %r22 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r21, <2 x i64> %22)
+ %r23 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r22, <2 x i64> %23)
+ %r24 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r23, <2 x i64> %24)
+ %r25 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r24, <2 x i64> %25)
+ %r26 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r25, <2 x i64> %26)
+ %r27 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r26, <2 x i64> %27)
+ %r28 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r27, <2 x i64> %28)
+ %r29 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r28, <2 x i64> %29)
+ %r30 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r29, <2 x i64> %30)
+ %r31 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r30, <2 x i64> %31)
+ %r32 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r31, <2 x i64> %32)
+ %r33 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r32, <2 x i64> %33)
+ %rx1 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %r33, <2 x i64> %1)
+ %rx2 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx1, <2 x i64> %2)
+ %rx3 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx2, <2 x i64> %3)
+ %rx4 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx3, <2 x i64> %4)
+ %rx5 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx4, <2 x i64> %5)
+ %rx6 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx5, <2 x i64> %6)
+ %rx7 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx6, <2 x i64> %7)
+ %rx8 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx7, <2 x i64> %8)
+ %rx9 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx8, <2 x i64> %9)
+ %rx10 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx9, <2 x i64> %10)
+ %rx11 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx10, <2 x i64> %11)
+ %rx12 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx11, <2 x i64> %12)
+ %rx13 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx12, <2 x i64> %13)
+ %rx14 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx13, <2 x i64> %14)
+ %rx15 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx14, <2 x i64> %15)
+ %rx16 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx15, <2 x i64> %16)
+ %rx17 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx16, <2 x i64> %17)
+ %rx18 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx17, <2 x i64> %18)
+ %rx19 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx18, <2 x i64> %19)
+ %rx20 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx19, <2 x i64> %20)
+ %rx21 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx20, <2 x i64> %21)
+ %rx22 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx21, <2 x i64> %22)
+ %rx23 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx22, <2 x i64> %23)
+ %rx24 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx23, <2 x i64> %24)
+ %rx25 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx24, <2 x i64> %25)
+ %rx26 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx25, <2 x i64> %26)
+ %rx27 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx26, <2 x i64> %27)
+ %rx28 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx27, <2 x i64> %28)
+ %rx29 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx28, <2 x i64> %29)
+ %rx30 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx29, <2 x i64> %30)
+ %rx31 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx30, <2 x i64> %31)
+ %rx32 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx31, <2 x i64> %32)
+ %rx33 = call <2 x i64> @llvm.mips.addv.d(<2 x i64> %rx32, <2 x i64> %33)
+ %res1 = bitcast <2 x i64> %rx33 to <4 x i32>
+ %res = call i32 @llvm.mips.copy.s.w(<4 x i32> %res1, i32 0)
+ ret i32 %res
+}
+
+declare <2 x i64> @llvm.mips.addv.d(<2 x i64>, <2 x i64>) nounwind
+
+; CHECK: test_i64:
+; CHECK: st.d {{.*}} Spill
+; CHECK: st.d {{.*}} Spill
+; CHECK: ld.d {{.*}} Reload
+; CHECK: ld.d {{.*}} Reload
+; CHECK: .size
diff --git a/test/CodeGen/Mips/msa/vec.ll b/test/CodeGen/Mips/msa/vec.ll
new file mode 100644
index 0000000..5bddf5a
--- /dev/null
+++ b/test/CodeGen/Mips/msa/vec.ll
@@ -0,0 +1,946 @@
+; Test the MSA intrinsics that are encoded with the VEC instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=ANYENDIAN %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck -check-prefix=ANYENDIAN %s
+
+@llvm_mips_and_v_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_and_v_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_and_v_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_and_v_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_and_v_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_and_v_b_ARG2
+ %2 = bitcast <16 x i8> %0 to <16 x i8>
+ %3 = bitcast <16 x i8> %1 to <16 x i8>
+ %4 = tail call <16 x i8> @llvm.mips.and.v(<16 x i8> %2, <16 x i8> %3)
+ %5 = bitcast <16 x i8> %4 to <16 x i8>
+ store <16 x i8> %5, <16 x i8>* @llvm_mips_and_v_b_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_and_v_b_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: and.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_and_v_b_test
+;
+@llvm_mips_and_v_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_and_v_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_and_v_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_and_v_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_and_v_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_and_v_h_ARG2
+ %2 = bitcast <8 x i16> %0 to <16 x i8>
+ %3 = bitcast <8 x i16> %1 to <16 x i8>
+ %4 = tail call <16 x i8> @llvm.mips.and.v(<16 x i8> %2, <16 x i8> %3)
+ %5 = bitcast <16 x i8> %4 to <8 x i16>
+ store <8 x i16> %5, <8 x i16>* @llvm_mips_and_v_h_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_and_v_h_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: and.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_and_v_h_test
+;
+@llvm_mips_and_v_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_and_v_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_and_v_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_and_v_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_and_v_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_and_v_w_ARG2
+ %2 = bitcast <4 x i32> %0 to <16 x i8>
+ %3 = bitcast <4 x i32> %1 to <16 x i8>
+ %4 = tail call <16 x i8> @llvm.mips.and.v(<16 x i8> %2, <16 x i8> %3)
+ %5 = bitcast <16 x i8> %4 to <4 x i32>
+ store <4 x i32> %5, <4 x i32>* @llvm_mips_and_v_w_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_and_v_w_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: and.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_and_v_w_test
+;
+@llvm_mips_and_v_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_and_v_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_and_v_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_and_v_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_and_v_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_and_v_d_ARG2
+ %2 = bitcast <2 x i64> %0 to <16 x i8>
+ %3 = bitcast <2 x i64> %1 to <16 x i8>
+ %4 = tail call <16 x i8> @llvm.mips.and.v(<16 x i8> %2, <16 x i8> %3)
+ %5 = bitcast <16 x i8> %4 to <2 x i64>
+ store <2 x i64> %5, <2 x i64>* @llvm_mips_and_v_d_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_and_v_d_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: and.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_and_v_d_test
+;
+define void @and_v_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_and_v_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_and_v_b_ARG2
+ %2 = and <16 x i8> %0, %1
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_and_v_b_RES
+ ret void
+}
+
+; ANYENDIAN: and_v_b_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: and.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size and_v_b_test
+;
+define void @and_v_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_and_v_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_and_v_h_ARG2
+ %2 = and <8 x i16> %0, %1
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_and_v_h_RES
+ ret void
+}
+
+; ANYENDIAN: and_v_h_test:
+; ANYENDIAN: ld.h
+; ANYENDIAN: ld.h
+; ANYENDIAN: and.v
+; ANYENDIAN: st.h
+; ANYENDIAN: .size and_v_h_test
+;
+
+define void @and_v_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_and_v_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_and_v_w_ARG2
+ %2 = and <4 x i32> %0, %1
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_and_v_w_RES
+ ret void
+}
+
+; ANYENDIAN: and_v_w_test:
+; ANYENDIAN: ld.w
+; ANYENDIAN: ld.w
+; ANYENDIAN: and.v
+; ANYENDIAN: st.w
+; ANYENDIAN: .size and_v_w_test
+;
+
+define void @and_v_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_and_v_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_and_v_d_ARG2
+ %2 = and <2 x i64> %0, %1
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_and_v_d_RES
+ ret void
+}
+
+; ANYENDIAN: and_v_d_test:
+; ANYENDIAN: ld.d
+; ANYENDIAN: ld.d
+; ANYENDIAN: and.v
+; ANYENDIAN: st.d
+; ANYENDIAN: .size and_v_d_test
+;
+@llvm_mips_bmnz_v_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bmnz_v_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_bmnz_v_b_ARG3 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bmnz_v_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bmnz_v_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_bmnz_v_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_bmnz_v_b_ARG2
+ %2 = load <16 x i8>* @llvm_mips_bmnz_v_b_ARG3
+ %3 = bitcast <16 x i8> %0 to <16 x i8>
+ %4 = bitcast <16 x i8> %1 to <16 x i8>
+ %5 = bitcast <16 x i8> %2 to <16 x i8>
+ %6 = tail call <16 x i8> @llvm.mips.bmnz.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+ %7 = bitcast <16 x i8> %6 to <16 x i8>
+ store <16 x i8> %7, <16 x i8>* @llvm_mips_bmnz_v_b_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_bmnz_v_b_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bmnz_v_b_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bmnz_v_b_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bmnz_v_b_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; ANYENDIAN-DAG: bmnz.v [[R4]], [[R5]], [[R6]]
+; ANYENDIAN-DAG: st.b [[R4]], 0(
+; ANYENDIAN: .size llvm_mips_bmnz_v_b_test
+
+@llvm_mips_bmnz_v_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bmnz_v_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_bmnz_v_h_ARG3 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bmnz_v_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_bmnz_v_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_bmnz_v_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_bmnz_v_h_ARG2
+ %2 = load <8 x i16>* @llvm_mips_bmnz_v_h_ARG3
+ %3 = bitcast <8 x i16> %0 to <16 x i8>
+ %4 = bitcast <8 x i16> %1 to <16 x i8>
+ %5 = bitcast <8 x i16> %2 to <16 x i8>
+ %6 = tail call <16 x i8> @llvm.mips.bmnz.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+ %7 = bitcast <16 x i8> %6 to <8 x i16>
+ store <8 x i16> %7, <8 x i16>* @llvm_mips_bmnz_v_h_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_bmnz_v_h_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bmnz_v_h_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bmnz_v_h_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bmnz_v_h_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; ANYENDIAN-DAG: bmnz.v [[R4]], [[R5]], [[R6]]
+; ANYENDIAN-DAG: st.b [[R4]], 0(
+; ANYENDIAN: .size llvm_mips_bmnz_v_h_test
+
+@llvm_mips_bmnz_v_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bmnz_v_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_bmnz_v_w_ARG3 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bmnz_v_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_bmnz_v_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_bmnz_v_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_bmnz_v_w_ARG2
+ %2 = load <4 x i32>* @llvm_mips_bmnz_v_w_ARG3
+ %3 = bitcast <4 x i32> %0 to <16 x i8>
+ %4 = bitcast <4 x i32> %1 to <16 x i8>
+ %5 = bitcast <4 x i32> %2 to <16 x i8>
+ %6 = tail call <16 x i8> @llvm.mips.bmnz.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+ %7 = bitcast <16 x i8> %6 to <4 x i32>
+ store <4 x i32> %7, <4 x i32>* @llvm_mips_bmnz_v_w_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_bmnz_v_w_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bmnz_v_w_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bmnz_v_w_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bmnz_v_w_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; ANYENDIAN-DAG: bmnz.v [[R4]], [[R5]], [[R6]]
+; ANYENDIAN-DAG: st.b [[R4]], 0(
+; ANYENDIAN: .size llvm_mips_bmnz_v_w_test
+
+@llvm_mips_bmnz_v_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bmnz_v_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_bmnz_v_d_ARG3 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bmnz_v_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_bmnz_v_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_bmnz_v_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_bmnz_v_d_ARG2
+ %2 = load <2 x i64>* @llvm_mips_bmnz_v_d_ARG3
+ %3 = bitcast <2 x i64> %0 to <16 x i8>
+ %4 = bitcast <2 x i64> %1 to <16 x i8>
+ %5 = bitcast <2 x i64> %2 to <16 x i8>
+ %6 = tail call <16 x i8> @llvm.mips.bmnz.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+ %7 = bitcast <16 x i8> %6 to <2 x i64>
+ store <2 x i64> %7, <2 x i64>* @llvm_mips_bmnz_v_d_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_bmnz_v_d_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bmnz_v_d_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bmnz_v_d_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bmnz_v_d_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; ANYENDIAN-DAG: bmnz.v [[R4]], [[R5]], [[R6]]
+; ANYENDIAN-DAG: st.b [[R4]], 0(
+; ANYENDIAN: .size llvm_mips_bmnz_v_d_test
+
+@llvm_mips_bmz_v_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bmz_v_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_bmz_v_b_ARG3 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bmz_v_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bmz_v_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_bmz_v_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_bmz_v_b_ARG2
+ %2 = load <16 x i8>* @llvm_mips_bmz_v_b_ARG3
+ %3 = bitcast <16 x i8> %0 to <16 x i8>
+ %4 = bitcast <16 x i8> %1 to <16 x i8>
+ %5 = bitcast <16 x i8> %2 to <16 x i8>
+ %6 = tail call <16 x i8> @llvm.mips.bmz.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+ %7 = bitcast <16 x i8> %6 to <16 x i8>
+ store <16 x i8> %7, <16 x i8>* @llvm_mips_bmz_v_b_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_bmz_v_b_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bmz_v_b_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bmz_v_b_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bmz_v_b_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; bmnz.v is the same as bmz.v with ws and wd_in swapped
+; ANYENDIAN-DAG: bmnz.v [[R5]], [[R4]], [[R6]]
+; ANYENDIAN-DAG: st.b [[R5]], 0(
+; ANYENDIAN: .size llvm_mips_bmz_v_b_test
+
+@llvm_mips_bmz_v_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bmz_v_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_bmz_v_h_ARG3 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bmz_v_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_bmz_v_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_bmz_v_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_bmz_v_h_ARG2
+ %2 = load <8 x i16>* @llvm_mips_bmz_v_h_ARG3
+ %3 = bitcast <8 x i16> %0 to <16 x i8>
+ %4 = bitcast <8 x i16> %1 to <16 x i8>
+ %5 = bitcast <8 x i16> %2 to <16 x i8>
+ %6 = tail call <16 x i8> @llvm.mips.bmz.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+ %7 = bitcast <16 x i8> %6 to <8 x i16>
+ store <8 x i16> %7, <8 x i16>* @llvm_mips_bmz_v_h_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_bmz_v_h_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bmz_v_h_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bmz_v_h_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bmz_v_h_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; bmnz.v is the same as bmz.v with ws and wd_in swapped
+; ANYENDIAN-DAG: bmnz.v [[R5]], [[R4]], [[R6]]
+; ANYENDIAN-DAG: st.b [[R5]], 0(
+; ANYENDIAN: .size llvm_mips_bmz_v_h_test
+
+@llvm_mips_bmz_v_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bmz_v_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_bmz_v_w_ARG3 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bmz_v_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_bmz_v_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_bmz_v_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_bmz_v_w_ARG2
+ %2 = load <4 x i32>* @llvm_mips_bmz_v_w_ARG3
+ %3 = bitcast <4 x i32> %0 to <16 x i8>
+ %4 = bitcast <4 x i32> %1 to <16 x i8>
+ %5 = bitcast <4 x i32> %2 to <16 x i8>
+ %6 = tail call <16 x i8> @llvm.mips.bmz.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+ %7 = bitcast <16 x i8> %6 to <4 x i32>
+ store <4 x i32> %7, <4 x i32>* @llvm_mips_bmz_v_w_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_bmz_v_w_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bmz_v_w_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bmz_v_w_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bmz_v_w_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; bmnz.v is the same as bmz.v with ws and wd_in swapped
+; ANYENDIAN-DAG: bmnz.v [[R5]], [[R4]], [[R6]]
+; ANYENDIAN-DAG: st.b [[R5]], 0(
+; ANYENDIAN: .size llvm_mips_bmz_v_w_test
+
+@llvm_mips_bmz_v_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bmz_v_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_bmz_v_d_ARG3 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bmz_v_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_bmz_v_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_bmz_v_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_bmz_v_d_ARG2
+ %2 = load <2 x i64>* @llvm_mips_bmz_v_d_ARG3
+ %3 = bitcast <2 x i64> %0 to <16 x i8>
+ %4 = bitcast <2 x i64> %1 to <16 x i8>
+ %5 = bitcast <2 x i64> %2 to <16 x i8>
+ %6 = tail call <16 x i8> @llvm.mips.bmz.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+ %7 = bitcast <16 x i8> %6 to <2 x i64>
+ store <2 x i64> %7, <2 x i64>* @llvm_mips_bmz_v_d_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_bmz_v_d_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bmz_v_d_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bmz_v_d_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bmz_v_d_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; bmnz.v is the same as bmz.v with ws and wd_in swapped
+; ANYENDIAN-DAG: bmnz.v [[R5]], [[R4]], [[R6]]
+; ANYENDIAN-DAG: st.b [[R5]], 0(
+; ANYENDIAN: .size llvm_mips_bmz_v_d_test
+
+@llvm_mips_bsel_v_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bsel_v_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_bsel_v_b_ARG3 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_bsel_v_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_bsel_v_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_bsel_v_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_bsel_v_b_ARG2
+ %2 = load <16 x i8>* @llvm_mips_bsel_v_b_ARG3
+ %3 = bitcast <16 x i8> %0 to <16 x i8>
+ %4 = bitcast <16 x i8> %1 to <16 x i8>
+ %5 = bitcast <16 x i8> %2 to <16 x i8>
+ %6 = tail call <16 x i8> @llvm.mips.bsel.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+ %7 = bitcast <16 x i8> %6 to <16 x i8>
+ store <16 x i8> %7, <16 x i8>* @llvm_mips_bsel_v_b_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_bsel_v_b_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bsel_v_b_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bsel_v_b_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bsel_v_b_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; bmnz.v is the same as bsel.v with wt and wd_in swapped
+; ANYENDIAN-DAG: bmnz.v [[R6]], [[R5]], [[R4]]
+; ANYENDIAN-DAG: st.b [[R6]], 0(
+; ANYENDIAN: .size llvm_mips_bsel_v_b_test
+
+@llvm_mips_bsel_v_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bsel_v_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_bsel_v_h_ARG3 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_bsel_v_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_bsel_v_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_bsel_v_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_bsel_v_h_ARG2
+ %2 = load <8 x i16>* @llvm_mips_bsel_v_h_ARG3
+ %3 = bitcast <8 x i16> %0 to <16 x i8>
+ %4 = bitcast <8 x i16> %1 to <16 x i8>
+ %5 = bitcast <8 x i16> %2 to <16 x i8>
+ %6 = tail call <16 x i8> @llvm.mips.bsel.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+ %7 = bitcast <16 x i8> %6 to <8 x i16>
+ store <8 x i16> %7, <8 x i16>* @llvm_mips_bsel_v_h_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_bsel_v_h_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bsel_v_h_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bsel_v_h_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bsel_v_h_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; bmnz.v is the same as bsel.v with wt and wd_in swapped
+; ANYENDIAN-DAG: bmnz.v [[R6]], [[R5]], [[R4]]
+; ANYENDIAN-DAG: st.b [[R6]], 0(
+; ANYENDIAN: .size llvm_mips_bsel_v_h_test
+
+@llvm_mips_bsel_v_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bsel_v_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_bsel_v_w_ARG3 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_bsel_v_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_bsel_v_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_bsel_v_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_bsel_v_w_ARG2
+ %2 = load <4 x i32>* @llvm_mips_bsel_v_w_ARG3
+ %3 = bitcast <4 x i32> %0 to <16 x i8>
+ %4 = bitcast <4 x i32> %1 to <16 x i8>
+ %5 = bitcast <4 x i32> %2 to <16 x i8>
+ %6 = tail call <16 x i8> @llvm.mips.bsel.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+ %7 = bitcast <16 x i8> %6 to <4 x i32>
+ store <4 x i32> %7, <4 x i32>* @llvm_mips_bsel_v_w_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_bsel_v_w_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bsel_v_w_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bsel_v_w_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bsel_v_w_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; bmnz.v is the same as bsel.v with wt and wd_in swapped
+; ANYENDIAN-DAG: bmnz.v [[R6]], [[R5]], [[R4]]
+; ANYENDIAN-DAG: st.b [[R6]], 0(
+; ANYENDIAN: .size llvm_mips_bsel_v_w_test
+
+@llvm_mips_bsel_v_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bsel_v_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_bsel_v_d_ARG3 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_bsel_v_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_bsel_v_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_bsel_v_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_bsel_v_d_ARG2
+ %2 = load <2 x i64>* @llvm_mips_bsel_v_d_ARG3
+ %3 = bitcast <2 x i64> %0 to <16 x i8>
+ %4 = bitcast <2 x i64> %1 to <16 x i8>
+ %5 = bitcast <2 x i64> %2 to <16 x i8>
+ %6 = tail call <16 x i8> @llvm.mips.bsel.v(<16 x i8> %3, <16 x i8> %4, <16 x i8> %5)
+ %7 = bitcast <16 x i8> %6 to <2 x i64>
+ store <2 x i64> %7, <2 x i64>* @llvm_mips_bsel_v_d_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_bsel_v_d_test:
+; ANYENDIAN-DAG: lw [[R1:\$[0-9]+]], %got(llvm_mips_bsel_v_d_ARG1)(
+; ANYENDIAN-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_bsel_v_d_ARG2)(
+; ANYENDIAN-DAG: lw [[R3:\$[0-9]+]], %got(llvm_mips_bsel_v_d_ARG3)(
+; ANYENDIAN-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; ANYENDIAN-DAG: ld.b [[R5:\$w[0-9]+]], 0([[R2]])
+; ANYENDIAN-DAG: ld.b [[R6:\$w[0-9]+]], 0([[R3]])
+; bmnz.v is the same as bsel.v with wt and wd_in swapped
+; ANYENDIAN-DAG: bmnz.v [[R6]], [[R5]], [[R4]]
+; ANYENDIAN-DAG: st.b [[R6]], 0(
+; ANYENDIAN: .size llvm_mips_bsel_v_d_test
+
+@llvm_mips_nor_v_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_nor_v_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_nor_v_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_nor_v_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_nor_v_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_nor_v_b_ARG2
+ %2 = bitcast <16 x i8> %0 to <16 x i8>
+ %3 = bitcast <16 x i8> %1 to <16 x i8>
+ %4 = tail call <16 x i8> @llvm.mips.nor.v(<16 x i8> %2, <16 x i8> %3)
+ %5 = bitcast <16 x i8> %4 to <16 x i8>
+ store <16 x i8> %5, <16 x i8>* @llvm_mips_nor_v_b_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_nor_v_b_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: nor.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_nor_v_b_test
+;
+@llvm_mips_nor_v_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_nor_v_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_nor_v_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_nor_v_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_nor_v_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_nor_v_h_ARG2
+ %2 = bitcast <8 x i16> %0 to <16 x i8>
+ %3 = bitcast <8 x i16> %1 to <16 x i8>
+ %4 = tail call <16 x i8> @llvm.mips.nor.v(<16 x i8> %2, <16 x i8> %3)
+ %5 = bitcast <16 x i8> %4 to <8 x i16>
+ store <8 x i16> %5, <8 x i16>* @llvm_mips_nor_v_h_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_nor_v_h_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: nor.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_nor_v_h_test
+;
+@llvm_mips_nor_v_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_nor_v_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_nor_v_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_nor_v_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_nor_v_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_nor_v_w_ARG2
+ %2 = bitcast <4 x i32> %0 to <16 x i8>
+ %3 = bitcast <4 x i32> %1 to <16 x i8>
+ %4 = tail call <16 x i8> @llvm.mips.nor.v(<16 x i8> %2, <16 x i8> %3)
+ %5 = bitcast <16 x i8> %4 to <4 x i32>
+ store <4 x i32> %5, <4 x i32>* @llvm_mips_nor_v_w_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_nor_v_w_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: nor.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_nor_v_w_test
+;
+@llvm_mips_nor_v_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_nor_v_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_nor_v_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_nor_v_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_nor_v_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_nor_v_d_ARG2
+ %2 = bitcast <2 x i64> %0 to <16 x i8>
+ %3 = bitcast <2 x i64> %1 to <16 x i8>
+ %4 = tail call <16 x i8> @llvm.mips.nor.v(<16 x i8> %2, <16 x i8> %3)
+ %5 = bitcast <16 x i8> %4 to <2 x i64>
+ store <2 x i64> %5, <2 x i64>* @llvm_mips_nor_v_d_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_nor_v_d_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: nor.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_nor_v_d_test
+;
+@llvm_mips_or_v_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_or_v_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_or_v_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_or_v_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_or_v_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_or_v_b_ARG2
+ %2 = bitcast <16 x i8> %0 to <16 x i8>
+ %3 = bitcast <16 x i8> %1 to <16 x i8>
+ %4 = tail call <16 x i8> @llvm.mips.or.v(<16 x i8> %2, <16 x i8> %3)
+ %5 = bitcast <16 x i8> %4 to <16 x i8>
+ store <16 x i8> %5, <16 x i8>* @llvm_mips_or_v_b_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_or_v_b_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: or.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_or_v_b_test
+;
+@llvm_mips_or_v_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_or_v_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_or_v_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_or_v_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_or_v_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_or_v_h_ARG2
+ %2 = bitcast <8 x i16> %0 to <16 x i8>
+ %3 = bitcast <8 x i16> %1 to <16 x i8>
+ %4 = tail call <16 x i8> @llvm.mips.or.v(<16 x i8> %2, <16 x i8> %3)
+ %5 = bitcast <16 x i8> %4 to <8 x i16>
+ store <8 x i16> %5, <8 x i16>* @llvm_mips_or_v_h_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_or_v_h_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: or.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_or_v_h_test
+;
+@llvm_mips_or_v_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_or_v_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_or_v_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_or_v_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_or_v_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_or_v_w_ARG2
+ %2 = bitcast <4 x i32> %0 to <16 x i8>
+ %3 = bitcast <4 x i32> %1 to <16 x i8>
+ %4 = tail call <16 x i8> @llvm.mips.or.v(<16 x i8> %2, <16 x i8> %3)
+ %5 = bitcast <16 x i8> %4 to <4 x i32>
+ store <4 x i32> %5, <4 x i32>* @llvm_mips_or_v_w_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_or_v_w_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: or.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_or_v_w_test
+;
+@llvm_mips_or_v_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_or_v_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_or_v_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_or_v_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_or_v_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_or_v_d_ARG2
+ %2 = bitcast <2 x i64> %0 to <16 x i8>
+ %3 = bitcast <2 x i64> %1 to <16 x i8>
+ %4 = tail call <16 x i8> @llvm.mips.or.v(<16 x i8> %2, <16 x i8> %3)
+ %5 = bitcast <16 x i8> %4 to <2 x i64>
+ store <2 x i64> %5, <2 x i64>* @llvm_mips_or_v_d_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_or_v_d_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: or.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_or_v_d_test
+;
+define void @or_v_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_or_v_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_or_v_b_ARG2
+ %2 = or <16 x i8> %0, %1
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_or_v_b_RES
+ ret void
+}
+
+; ANYENDIAN: or_v_b_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: or.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size or_v_b_test
+;
+define void @or_v_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_or_v_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_or_v_h_ARG2
+ %2 = or <8 x i16> %0, %1
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_or_v_h_RES
+ ret void
+}
+
+; ANYENDIAN: or_v_h_test:
+; ANYENDIAN: ld.h
+; ANYENDIAN: ld.h
+; ANYENDIAN: or.v
+; ANYENDIAN: st.h
+; ANYENDIAN: .size or_v_h_test
+;
+
+define void @or_v_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_or_v_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_or_v_w_ARG2
+ %2 = or <4 x i32> %0, %1
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_or_v_w_RES
+ ret void
+}
+
+; ANYENDIAN: or_v_w_test:
+; ANYENDIAN: ld.w
+; ANYENDIAN: ld.w
+; ANYENDIAN: or.v
+; ANYENDIAN: st.w
+; ANYENDIAN: .size or_v_w_test
+;
+
+define void @or_v_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_or_v_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_or_v_d_ARG2
+ %2 = or <2 x i64> %0, %1
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_or_v_d_RES
+ ret void
+}
+
+; ANYENDIAN: or_v_d_test:
+; ANYENDIAN: ld.d
+; ANYENDIAN: ld.d
+; ANYENDIAN: or.v
+; ANYENDIAN: st.d
+; ANYENDIAN: .size or_v_d_test
+;
+@llvm_mips_xor_v_b_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+@llvm_mips_xor_v_b_ARG2 = global <16 x i8> <i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31>, align 16
+@llvm_mips_xor_v_b_RES = global <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, align 16
+
+define void @llvm_mips_xor_v_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_xor_v_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_xor_v_b_ARG2
+ %2 = bitcast <16 x i8> %0 to <16 x i8>
+ %3 = bitcast <16 x i8> %1 to <16 x i8>
+ %4 = tail call <16 x i8> @llvm.mips.xor.v(<16 x i8> %2, <16 x i8> %3)
+ %5 = bitcast <16 x i8> %4 to <16 x i8>
+ store <16 x i8> %5, <16 x i8>* @llvm_mips_xor_v_b_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_xor_v_b_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: xor.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_xor_v_b_test
+;
+@llvm_mips_xor_v_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
+@llvm_mips_xor_v_h_ARG2 = global <8 x i16> <i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, align 16
+@llvm_mips_xor_v_h_RES = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
+
+define void @llvm_mips_xor_v_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_xor_v_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_xor_v_h_ARG2
+ %2 = bitcast <8 x i16> %0 to <16 x i8>
+ %3 = bitcast <8 x i16> %1 to <16 x i8>
+ %4 = tail call <16 x i8> @llvm.mips.xor.v(<16 x i8> %2, <16 x i8> %3)
+ %5 = bitcast <16 x i8> %4 to <8 x i16>
+ store <8 x i16> %5, <8 x i16>* @llvm_mips_xor_v_h_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_xor_v_h_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: xor.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_xor_v_h_test
+;
+@llvm_mips_xor_v_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
+@llvm_mips_xor_v_w_ARG2 = global <4 x i32> <i32 4, i32 5, i32 6, i32 7>, align 16
+@llvm_mips_xor_v_w_RES = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
+
+define void @llvm_mips_xor_v_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_xor_v_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_xor_v_w_ARG2
+ %2 = bitcast <4 x i32> %0 to <16 x i8>
+ %3 = bitcast <4 x i32> %1 to <16 x i8>
+ %4 = tail call <16 x i8> @llvm.mips.xor.v(<16 x i8> %2, <16 x i8> %3)
+ %5 = bitcast <16 x i8> %4 to <4 x i32>
+ store <4 x i32> %5, <4 x i32>* @llvm_mips_xor_v_w_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_xor_v_w_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: xor.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_xor_v_w_test
+;
+@llvm_mips_xor_v_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
+@llvm_mips_xor_v_d_ARG2 = global <2 x i64> <i64 2, i64 3>, align 16
+@llvm_mips_xor_v_d_RES = global <2 x i64> <i64 0, i64 0>, align 16
+
+define void @llvm_mips_xor_v_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_xor_v_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_xor_v_d_ARG2
+ %2 = bitcast <2 x i64> %0 to <16 x i8>
+ %3 = bitcast <2 x i64> %1 to <16 x i8>
+ %4 = tail call <16 x i8> @llvm.mips.xor.v(<16 x i8> %2, <16 x i8> %3)
+ %5 = bitcast <16 x i8> %4 to <2 x i64>
+ store <2 x i64> %5, <2 x i64>* @llvm_mips_xor_v_d_RES
+ ret void
+}
+
+; ANYENDIAN: llvm_mips_xor_v_d_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: xor.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size llvm_mips_xor_v_d_test
+;
+define void @xor_v_b_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_xor_v_b_ARG1
+ %1 = load <16 x i8>* @llvm_mips_xor_v_b_ARG2
+ %2 = xor <16 x i8> %0, %1
+ store <16 x i8> %2, <16 x i8>* @llvm_mips_xor_v_b_RES
+ ret void
+}
+
+; ANYENDIAN: xor_v_b_test:
+; ANYENDIAN: ld.b
+; ANYENDIAN: ld.b
+; ANYENDIAN: xor.v
+; ANYENDIAN: st.b
+; ANYENDIAN: .size xor_v_b_test
+;
+define void @xor_v_h_test() nounwind {
+entry:
+ %0 = load <8 x i16>* @llvm_mips_xor_v_h_ARG1
+ %1 = load <8 x i16>* @llvm_mips_xor_v_h_ARG2
+ %2 = xor <8 x i16> %0, %1
+ store <8 x i16> %2, <8 x i16>* @llvm_mips_xor_v_h_RES
+ ret void
+}
+
+; CHECK: xor_v_h_test:
+; CHECK: ld.h
+; CHECK: ld.h
+; CHECK: xor.v
+; CHECK: st.h
+; CHECK: .size xor_v_h_test
+;
+
+define void @xor_v_w_test() nounwind {
+entry:
+ %0 = load <4 x i32>* @llvm_mips_xor_v_w_ARG1
+ %1 = load <4 x i32>* @llvm_mips_xor_v_w_ARG2
+ %2 = xor <4 x i32> %0, %1
+ store <4 x i32> %2, <4 x i32>* @llvm_mips_xor_v_w_RES
+ ret void
+}
+
+; CHECK: xor_v_w_test:
+; CHECK: ld.w
+; CHECK: ld.w
+; CHECK: xor.v
+; CHECK: st.w
+; CHECK: .size xor_v_w_test
+;
+
+define void @xor_v_d_test() nounwind {
+entry:
+ %0 = load <2 x i64>* @llvm_mips_xor_v_d_ARG1
+ %1 = load <2 x i64>* @llvm_mips_xor_v_d_ARG2
+ %2 = xor <2 x i64> %0, %1
+ store <2 x i64> %2, <2 x i64>* @llvm_mips_xor_v_d_RES
+ ret void
+}
+
+; CHECK: xor_v_d_test:
+; CHECK: ld.d
+; CHECK: ld.d
+; CHECK: xor.v
+; CHECK: st.d
+; CHECK: .size xor_v_d_test
+;
+declare <16 x i8> @llvm.mips.and.v(<16 x i8>, <16 x i8>) nounwind
+declare <16 x i8> @llvm.mips.bmnz.v(<16 x i8>, <16 x i8>, <16 x i8>) nounwind
+declare <16 x i8> @llvm.mips.bmz.v(<16 x i8>, <16 x i8>, <16 x i8>) nounwind
+declare <16 x i8> @llvm.mips.bsel.v(<16 x i8>, <16 x i8>, <16 x i8>) nounwind
+declare <16 x i8> @llvm.mips.nor.v(<16 x i8>, <16 x i8>) nounwind
+declare <16 x i8> @llvm.mips.or.v(<16 x i8>, <16 x i8>) nounwind
+declare <16 x i8> @llvm.mips.xor.v(<16 x i8>, <16 x i8>) nounwind
diff --git a/test/CodeGen/Mips/msa/vecs10.ll b/test/CodeGen/Mips/msa/vecs10.ll
new file mode 100644
index 0000000..e22e075
--- /dev/null
+++ b/test/CodeGen/Mips/msa/vecs10.ll
@@ -0,0 +1,47 @@
+; Test the MSA intrinsics that are encoded with the VECS10 instruction format.
+
+; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
+
+@llvm_mips_bnz_v_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+
+define i32 @llvm_mips_bnz_v_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_bnz_v_ARG1
+ %1 = tail call i32 @llvm.mips.bnz.v(<16 x i8> %0)
+ %2 = icmp eq i32 %1, 0
+ br i1 %2, label %true, label %false
+true:
+ ret i32 2
+false:
+ ret i32 3
+}
+
+declare i32 @llvm.mips.bnz.v(<16 x i8>) nounwind
+
+; CHECK: llvm_mips_bnz_v_test:
+; CHECK-DAG: ld.b [[R0:\$w[0-9]+]]
+; CHECK-DAG: bnz.v [[R0]]
+; CHECK: .size llvm_mips_bnz_v_test
+
+@llvm_mips_bz_v_ARG1 = global <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, align 16
+
+define i32 @llvm_mips_bz_v_test() nounwind {
+entry:
+ %0 = load <16 x i8>* @llvm_mips_bz_v_ARG1
+ %1 = tail call i32 @llvm.mips.bz.v(<16 x i8> %0)
+ %2 = icmp eq i32 %1, 0
+ br i1 %2, label %true, label %false
+true:
+ ret i32 2
+false:
+ ret i32 3
+}
+
+declare i32 @llvm.mips.bz.v(<16 x i8>) nounwind
+
+; CHECK: llvm_mips_bz_v_test:
+; CHECK-DAG: ld.b [[R0:\$w[0-9]+]]
+; CHECK-DAG: bz.v [[R0]]
+; CHECK: .size llvm_mips_bz_v_test
+;
diff --git a/test/CodeGen/Mips/nomips16.ll b/test/CodeGen/Mips/nomips16.ll
new file mode 100644
index 0000000..bf7c667
--- /dev/null
+++ b/test/CodeGen/Mips/nomips16.ll
@@ -0,0 +1,38 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -mips16-hard-float -soft-float -relocation-model=static < %s | FileCheck %s
+
+@x = global float 0.000000e+00, align 4
+@.str = private unnamed_addr constant [20 x i8] c"in main: mips16 %f\0A\00", align 1
+
+; Function Attrs: nounwind
+define void @foo() #0 {
+entry:
+ %0 = load float* @x, align 4
+ %conv = fpext float %0 to double
+ %add = fadd double %conv, 1.500000e+00
+ %conv1 = fptrunc double %add to float
+ store float %conv1, float* @x, align 4
+ ret void
+}
+; CHECK: .ent foo
+; CHECK: jal __mips16_extendsfdf2
+; CHECK: .end foo
+
+; Function Attrs: nounwind
+define void @nofoo() #1 {
+entry:
+ %0 = load float* @x, align 4
+ %conv = fpext float %0 to double
+ %add = fadd double %conv, 3.900000e+00
+ %conv1 = fptrunc double %add to float
+ store float %conv1, float* @x, align 4
+ ret void
+}
+
+; CHECK: .ent nofoo
+; CHECK: cvt.d.s $f{{.+}}, $f{{.+}}
+; CHECK: .end nofoo
+
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "mips16" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "nomips16" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+
diff --git a/test/CodeGen/Mips/o32_cc.ll b/test/CodeGen/Mips/o32_cc.ll
index 70b66ef..08e5aab 100644
--- a/test/CodeGen/Mips/o32_cc.ll
+++ b/test/CodeGen/Mips/o32_cc.ll
@@ -1,11 +1,12 @@
-; RUN: llc -march=mips < %s | FileCheck %s
-
-; FIXME: Disabled because it unpredictably fails on certain platforms.
-; REQUIRES: disabled
+; RUN: llc -march=mipsel < %s | FileCheck %s
+; RUN: llc -march=mipsel -mattr=+fp64 < %s | FileCheck %s
+; RUN: llc -march=mipsel < %s | FileCheck -check-prefix=FP32EL %s
+; RUN: llc -march=mipsel -mattr=+fp64 < %s | FileCheck -check-prefix=FP64EL %s
; $f12, $f14
-; CHECK: ldc1 $f12, %lo
-; CHECK: ldc1 $f14, %lo
+; CHECK-LABEL: testlowercall0:
+; CHECK-DAG: ldc1 $f12, %lo
+; CHECK-DAG: ldc1 $f14, %lo
define void @testlowercall0() nounwind {
entry:
tail call void @f0(double 5.000000e+00, double 6.000000e+00) nounwind
@@ -15,8 +16,9 @@ entry:
declare void @f0(double, double)
; $f12, $f14
-; CHECK: lwc1 $f12, %lo
-; CHECK: lwc1 $f14, %lo
+; CHECK-LABEL: testlowercall1:
+; CHECK-DAG: lwc1 $f12, %lo
+; CHECK-DAG: lwc1 $f14, %lo
define void @testlowercall1() nounwind {
entry:
tail call void @f1(float 8.000000e+00, float 9.000000e+00) nounwind
@@ -26,8 +28,9 @@ entry:
declare void @f1(float, float)
; $f12, $f14
-; CHECK: lwc1 $f12, %lo
-; CHECK: ldc1 $f14, %lo
+; CHECK-LABEL: testlowercall2:
+; CHECK-DAG: lwc1 $f12, %lo
+; CHECK-DAG: ldc1 $f14, %lo
define void @testlowercall2() nounwind {
entry:
tail call void @f2(float 8.000000e+00, double 6.000000e+00) nounwind
@@ -37,8 +40,9 @@ entry:
declare void @f2(float, double)
; $f12, $f14
-; CHECK: ldc1 $f12, %lo
-; CHECK: lwc1 $f14, %lo
+; CHECK-LABEL: testlowercall3:
+; CHECK-DAG: ldc1 $f12, %lo
+; CHECK-DAG: lwc1 $f14, %lo
define void @testlowercall3() nounwind {
entry:
tail call void @f3(double 5.000000e+00, float 9.000000e+00) nounwind
@@ -48,10 +52,11 @@ entry:
declare void @f3(double, float)
; $4, $5, $6, $7
-; CHECK: addiu $4, $zero, 12
-; CHECK: addiu $5, $zero, 13
-; CHECK: addiu $6, $zero, 14
-; CHECK: addiu $7, $zero, 15
+; CHECK-LABEL: testlowercall4:
+; CHECK-DAG: addiu $4, $zero, 12
+; CHECK-DAG: addiu $5, $zero, 13
+; CHECK-DAG: addiu $6, $zero, 14
+; CHECK-DAG: addiu $7, $zero, 15
define void @testlowercall4() nounwind {
entry:
tail call void @f4(i32 12, i32 13, i32 14, i32 15) nounwind
@@ -61,10 +66,11 @@ entry:
declare void @f4(i32, i32, i32, i32)
; $f12, $6, stack
-; CHECK: sw
-; CHECK: sw
-; CHECK: ldc1 $f12, %lo
-; CHECK: addiu $6, $zero, 23
+; CHECK-LABEL: testlowercall5:
+; CHECK-DAG: ldc1 $f12, %lo
+; CHECK-DAG: addiu $6, $zero, 23
+; CHECK-DAG: sw ${{[a-z0-9]+}}, 16($sp)
+; CHECK-DAG: sw ${{[a-z0-9]+}}, 20($sp)
define void @testlowercall5() nounwind {
entry:
tail call void @f5(double 1.500000e+01, i32 23, double 1.700000e+01) nounwind
@@ -74,9 +80,10 @@ entry:
declare void @f5(double, i32, double)
; $f12, $6, $7
-; CHECK: ldc1 $f12, %lo
-; CHECK: addiu $6, $zero, 33
-; CHECK: addiu $7, $zero, 24
+; CHECK-LABEL: testlowercall6:
+; CHECK-DAG: ldc1 $f12, %lo
+; CHECK-DAG: addiu $6, $zero, 33
+; CHECK-DAG: addiu $7, $zero, 24
define void @testlowercall6() nounwind {
entry:
tail call void @f6(double 2.500000e+01, i32 33, i32 24) nounwind
@@ -86,9 +93,10 @@ entry:
declare void @f6(double, i32, i32)
; $f12, $5, $6
-; CHECK: lwc1 $f12, %lo
-; CHECK: addiu $5, $zero, 43
-; CHECK: addiu $6, $zero, 34
+; CHECK-LABEL: testlowercall7:
+; CHECK-DAG: lwc1 $f12, %lo
+; CHECK-DAG: addiu $5, $zero, 43
+; CHECK-DAG: addiu $6, $zero, 34
define void @testlowercall7() nounwind {
entry:
tail call void @f7(float 1.800000e+01, i32 43, i32 34) nounwind
@@ -98,11 +106,12 @@ entry:
declare void @f7(float, i32, i32)
; $4, $5, $6, stack
-; CHECK: sw
-; CHECK: sw
-; CHECK: addiu $4, $zero, 22
-; CHECK: addiu $5, $zero, 53
-; CHECK: addiu $6, $zero, 44
+; CHECK-LABEL: testlowercall8:
+; CHECK-DAG: addiu $4, $zero, 22
+; CHECK-DAG: addiu $5, $zero, 53
+; CHECK-DAG: addiu $6, $zero, 44
+; CHECK-DAG: sw ${{[a-z0-9]+}}, 16($sp)
+; CHECK-DAG: sw ${{[a-z0-9]+}}, 20($sp)
define void @testlowercall8() nounwind {
entry:
tail call void @f8(i32 22, i32 53, i32 44, double 4.000000e+00) nounwind
@@ -112,10 +121,11 @@ entry:
declare void @f8(i32, i32, i32, double)
; $4, $5, $6, $7
-; CHECK: addiu $4, $zero, 32
-; CHECK: addiu $5, $zero, 63
-; CHECK: addiu $6, $zero, 54
-; CHECK: ori $7
+; CHECK-LABEL: testlowercall9:
+; CHECK-DAG: addiu $4, $zero, 32
+; CHECK-DAG: addiu $5, $zero, 63
+; CHECK-DAG: addiu $6, $zero, 54
+; CHECK-DAG: lui $7, 16688
define void @testlowercall9() nounwind {
entry:
tail call void @f9(i32 32, i32 63, i32 54, float 1.100000e+01) nounwind
@@ -125,10 +135,15 @@ entry:
declare void @f9(i32, i32, i32, float)
; $4, $5, ($6, $7)
-; CHECK: addiu $4, $zero, 42
-; CHECK: addiu $5, $zero, 73
-; CHECK: addiu $6, $zero, 0
-; CHECK: ori $7
+; CHECK-LABEL: testlowercall10:
+; CHECK-DAG: addiu $4, $zero, 42
+; CHECK-DAG: addiu $5, $zero, 73
+; FP32EL-LABEL: testlowercall10:
+; FP32EL-DAG: mfc1 $6, $f{{[0-9]+}}
+; FP32EL-DAG: mfc1 $7, $f{{[0-9]+}}
+; FP64EL-LABEL: testlowercall10:
+; FP64EL-DAG: mfc1 $6, $f{{[0-9]+}}
+; FP64EL-DAG: mfhc1 $7, $f{{[0-9]+}}
define void @testlowercall10() nounwind {
entry:
tail call void @f10(i32 42, i32 73, double 2.700000e+01) nounwind
@@ -138,9 +153,14 @@ entry:
declare void @f10(i32, i32, double)
; $4, ($6, $7)
-; CHECK: addiu $4, $zero, 52
-; CHECK: addiu $6, $zero, 0
-; CHECK: ori $7
+; CHECK-LABEL: testlowercall11:
+; CHECK-DAG: addiu $4, $zero, 52
+; FP32EL-LABEL: testlowercall11:
+; FP32EL-DAG: mfc1 $6, $f{{[0-9]+}}
+; FP32EL-DAG: mfc1 $7, $f{{[0-9]+}}
+; FP64EL-LABEL: testlowercall11:
+; FP64EL-DAG: mfc1 $6, $f{{[0-9]+}}
+; FP64EL-DAG: mfhc1 $7, $f{{[0-9]+}}
define void @testlowercall11() nounwind {
entry:
tail call void @f11(i32 52, double 1.600000e+01) nounwind
@@ -150,10 +170,11 @@ entry:
declare void @f11(i32, double)
; $f12, $f14, $6, $7
-; CHECK: lwc1 $f12, %lo
-; CHECK: lwc1 $f14, %lo
-; CHECK: ori $6
-; CHECK: ori $7
+; CHECK-LABEL: testlowercall12:
+; CHECK-DAG: lwc1 $f12, %lo
+; CHECK-DAG: lwc1 $f14, %lo
+; CHECK-DAG: lui $6, 16672
+; CHECK-DAG: lui $7, 16808
define void @testlowercall12() nounwind {
entry:
tail call void @f12(float 2.800000e+01, float 1.900000e+01, float 1.000000e+01, float 2.100000e+01) nounwind
@@ -163,10 +184,11 @@ entry:
declare void @f12(float, float, float, float)
; $f12, $5, $6, $7
-; CHECK: lwc1 $f12, %lo
-; CHECK: addiu $5, $zero, 83
-; CHECK: ori $6
-; CHECK: addiu $7, $zero, 25
+; CHECK-LABEL: testlowercall13:
+; CHECK-DAG: lwc1 $f12, %lo
+; CHECK-DAG: addiu $5, $zero, 83
+; CHECK-DAG: lui $6, 16800
+; CHECK-DAG: addiu $7, $zero, 25
define void @testlowercall13() nounwind {
entry:
tail call void @f13(float 3.800000e+01, i32 83, float 2.000000e+01, i32 25) nounwind
@@ -177,9 +199,10 @@ entry:
declare void @f13(float, i32, float, i32)
; $f12, $f14, $7
-; CHECK: ldc1 $f12, %lo
-; CHECK: lwc1 $f14, %lo
-; CHECK: ori $7
+; CHECK-LABEL: testlowercall14:
+; CHECK-DAG: ldc1 $f12, %lo
+; CHECK-DAG: lwc1 $f14, %lo
+; CHECK-DAG: lui $7, 16880
define void @testlowercall14() nounwind {
entry:
tail call void @f14(double 3.500000e+01, float 2.900000e+01, float 3.000000e+01) nounwind
@@ -189,10 +212,15 @@ entry:
declare void @f14(double, float, float)
; $f12, $f14, ($6, $7)
-; CHECK: lwc1 $f12, %lo
-; CHECK: lwc1 $f14, %lo
-; CHECK: addiu $6, $zero, 0
-; CHECK: ori $7
+; CHECK-LABEL: testlowercall15:
+; CHECK-DAG: lwc1 $f12, %lo
+; CHECK-DAG: lwc1 $f14, %lo
+; FP32EL-LABEL: testlowercall15:
+; FP32EL-DAG: mfc1 $6, $f{{[0-9]+}}
+; FP32EL-DAG: mfc1 $7, $f{{[0-9]+}}
+; FP64EL-LABEL: testlowercall15:
+; FP64EL-DAG: mfc1 $6, $f{{[0-9]+}}
+; FP64EL-DAG: mfhc1 $7, $f{{[0-9]+}}
define void @testlowercall15() nounwind {
entry:
tail call void @f15(float 4.800000e+01, float 3.900000e+01, double 3.700000e+01) nounwind
@@ -202,10 +230,11 @@ entry:
declare void @f15(float, float, double)
; $4, $5, $6, $7
-; CHECK: addiu $4, $zero, 62
-; CHECK: ori $5
-; CHECK: addiu $6, $zero, 64
-; CHECK: ori $7
+; CHECK-LABEL: testlowercall16:
+; CHECK-DAG: addiu $4, $zero, 62
+; CHECK-DAG: lui $5, 16964
+; CHECK-DAG: addiu $6, $zero, 64
+; CHECK-DAG: lui $7, 16888
define void @testlowercall16() nounwind {
entry:
tail call void @f16(i32 62, float 4.900000e+01, i32 64, float 3.100000e+01) nounwind
@@ -215,10 +244,11 @@ entry:
declare void @f16(i32, float, i32, float)
; $4, $5, $6, $7
-; CHECK: addiu $4, $zero, 72
-; CHECK: ori $5
-; CHECK: addiu $6, $zero, 74
-; CHECK: addiu $7, $zero, 35
+; CHECK-LABEL: testlowercall17:
+; CHECK-DAG: addiu $4, $zero, 72
+; CHECK-DAG: lui $5, 17004
+; CHECK-DAG: addiu $6, $zero, 74
+; CHECK-DAG: addiu $7, $zero, 35
define void @testlowercall17() nounwind {
entry:
tail call void @f17(i32 72, float 5.900000e+01, i32 74, i32 35) nounwind
@@ -228,10 +258,11 @@ entry:
declare void @f17(i32, float, i32, i32)
; $4, $5, $6, $7
-; CHECK: addiu $4, $zero, 82
-; CHECK: addiu $5, $zero, 93
-; CHECK: ori $6
-; CHECK: addiu $7, $zero, 45
+; CHECK-LABEL: testlowercall18:
+; CHECK-DAG: addiu $4, $zero, 82
+; CHECK-DAG: addiu $5, $zero, 93
+; CHECK-DAG: lui $6, 16928
+; CHECK-DAG: addiu $7, $zero, 45
define void @testlowercall18() nounwind {
entry:
tail call void @f18(i32 82, i32 93, float 4.000000e+01, i32 45) nounwind
@@ -242,11 +273,16 @@ declare void @f18(i32, i32, float, i32)
; $4, ($6, $7), stack
-; CHECK: sw
-; CHECK: sw
-; CHECK: addiu $4, $zero, 92
-; CHECK: addiu $6, $zero, 0
-; CHECK: ori $7
+; CHECK-LABEL: testlowercall20:
+; CHECK-DAG: addiu $4, $zero, 92
+; CHECK-DAG: sw ${{[a-z0-9]+}}, 16($sp)
+; CHECK-DAG: sw ${{[a-z0-9]+}}, 20($sp)
+; FP32EL-LABEL: testlowercall20:
+; FP32EL-DAG: mfc1 $6, $f{{[0-9]+}}
+; FP32EL-DAG: mfc1 $7, $f{{[0-9]+}}
+; FP64EL-LABEL: testlowercall20:
+; FP64EL-DAG: mfc1 $6, $f{{[0-9]+}}
+; FP64EL-DAG: mfhc1 $7, $f{{[0-9]+}}
define void @testlowercall20() nounwind {
entry:
tail call void @f20(i32 92, double 2.600000e+01, double 4.700000e+01) nounwind
@@ -256,8 +292,9 @@ entry:
declare void @f20(i32, double, double)
; $f12, $5
-; CHECK: lwc1 $f12, %lo
-; CHECK: addiu $5, $zero, 103
+; CHECK-LABEL: testlowercall21:
+; CHECK-DAG: lwc1 $f12, %lo
+; CHECK-DAG: addiu $5, $zero, 103
define void @testlowercall21() nounwind {
entry:
tail call void @f21(float 5.800000e+01, i32 103) nounwind
@@ -267,10 +304,15 @@ entry:
declare void @f21(float, i32)
; $f12, $5, ($6, $7)
-; CHECK: lwc1 $f12, %lo
-; CHECK: addiu $5, $zero, 113
-; CHECK: addiu $6, $zero, 0
-; CHECK: ori $7
+; CHECK-LABEL: testlowercall22:
+; CHECK-DAG: lwc1 $f12, %lo
+; CHECK-DAG: addiu $5, $zero, 113
+; FP32EL-LABEL: testlowercall22:
+; FP32EL-DAG: mfc1 $6, $f{{[0-9]+}}
+; FP32EL-DAG: mfc1 $7, $f{{[0-9]+}}
+; FP64EL-LABEL: testlowercall22:
+; FP64EL-DAG: mfc1 $6, $f{{[0-9]+}}
+; FP64EL-DAG: mfhc1 $7, $f{{[0-9]+}}
define void @testlowercall22() nounwind {
entry:
tail call void @f22(float 6.800000e+01, i32 113, double 5.700000e+01) nounwind
@@ -280,8 +322,9 @@ entry:
declare void @f22(float, i32, double)
; $f12, $6
-; CHECK: ldc1 $f12, %lo
-; CHECK: addiu $6, $zero, 123
+; CHECK-LABEL: testlowercall23:
+; CHECK-DAG: ldc1 $f12, %lo
+; CHECK-DAG: addiu $6, $zero, 123
define void @testlowercall23() nounwind {
entry:
tail call void @f23(double 4.500000e+01, i32 123) nounwind
@@ -291,10 +334,11 @@ entry:
declare void @f23(double, i32)
; $f12, $6, stack
-; CHECK: sw
-; CHECK: sw
-; CHECK: ldc1 $f12, %lo
-; CHECK: addiu $6, $zero, 133
+; CHECK-LABEL: testlowercall24:
+; CHECK-DAG: ldc1 $f12, %lo
+; CHECK-DAG: addiu $6, $zero, 133
+; CHECK-DAG: sw ${{[a-z0-9]+}}, 16($sp)
+; CHECK-DAG: sw ${{[a-z0-9]+}}, 20($sp)
define void @testlowercall24() nounwind {
entry:
tail call void @f24(double 5.500000e+01, i32 133, double 6.700000e+01) nounwind
@@ -303,19 +347,19 @@ entry:
declare void @f24(double, i32, double)
-; CHECK: lwc1 $f12, %lo
-; lwc1 $f12, %lo
-; CHECK: lwc1 $f14, %lo
-; CHECK: ori $6
-; CHECK: ori $7
-; CHECK: lwc1 $f12, %lo
-; CHECK: addiu $5, $zero, 83
-; CHECK: ori $6
-; CHECK: addiu $7, $zero, 25
-; CHECK: addiu $4, $zero, 82
-; CHECK: addiu $5, $zero, 93
-; CHECK: ori $6
-; CHECK: addiu $7, $zero, 45
+; CHECK-LABEL: testlowercall25:
+; CHECK-DAG: lwc1 $f12, %lo
+; CHECK-DAG: lwc1 $f14, %lo
+; CHECK-DAG: lui $6
+; CHECK-DAG: lui $7
+; CHECK-DAG: lwc1 $f12, %lo
+; CHECK-DAG: addiu $5, $zero, 83
+; CHECK-DAG: lui $6
+; CHECK-DAG: addiu $7, $zero, 25
+; CHECK-DAG: addiu $4, $zero, 82
+; CHECK-DAG: addiu $5, $zero, 93
+; CHECK-DAG: lui $6
+; CHECK-DAG: addiu $7, $zero, 45
define void @testlowercall25() nounwind {
entry:
tail call void @f12(float 2.800000e+01, float 1.900000e+01, float 1.000000e+01, float 2.100000e+01) nounwind
diff --git a/test/CodeGen/Mips/o32_cc_byval.ll b/test/CodeGen/Mips/o32_cc_byval.ll
index 0a8f85f..5db47ac 100644
--- a/test/CodeGen/Mips/o32_cc_byval.ll
+++ b/test/CodeGen/Mips/o32_cc_byval.ll
@@ -10,22 +10,23 @@
define void @f1() nounwind {
entry:
-; CHECK: lw $[[R1:[0-9]+]], %got(f1.s1)
-; CHECK: addiu $[[R0:[0-9]+]], $[[R1]], %lo(f1.s1)
-; CHECK: lw $[[R7:[0-9]+]], 12($[[R0]])
-; CHECK: lw $[[R3:[0-9]+]], 16($[[R0]])
-; CHECK: lw $[[R4:[0-9]+]], 20($[[R0]])
-; CHECK: lw $[[R5:[0-9]+]], 24($[[R0]])
-; CHECK: lw $[[R6:[0-9]+]], 28($[[R0]])
-; CHECK: sw $[[R6]], 36($sp)
-; CHECK: sw $[[R5]], 32($sp)
-; CHECK: sw $[[R4]], 28($sp)
-; CHECK: sw $[[R3]], 24($sp)
-; CHECK: sw $[[R7]], 20($sp)
-; CHECK: lw $[[R2:[0-9]+]], 8($[[R0]])
-; CHECK: sw $[[R2]], 16($sp)
-; CHECK: lw $6, %lo(f1.s1)($[[R1]])
-; CHECK: lw $7, 4($[[R0]])
+; CHECK-LABEL: f1:
+; CHECK-DAG: lw $[[R1:[0-9]+]], %got(f1.s1)
+; CHECK-DAG: addiu $[[R0:[0-9]+]], $[[R1]], %lo(f1.s1)
+; CHECK-DAG: lw $[[R7:[0-9]+]], 12($[[R0]])
+; CHECK-DAG: lw $[[R3:[0-9]+]], 16($[[R0]])
+; CHECK-DAG: lw $[[R4:[0-9]+]], 20($[[R0]])
+; CHECK-DAG: lw $[[R5:[0-9]+]], 24($[[R0]])
+; CHECK-DAG: lw $[[R6:[0-9]+]], 28($[[R0]])
+; CHECK-DAG: sw $[[R6]], 36($sp)
+; CHECK-DAG: sw $[[R5]], 32($sp)
+; CHECK-DAG: sw $[[R4]], 28($sp)
+; CHECK-DAG: sw $[[R3]], 24($sp)
+; CHECK-DAG: sw $[[R7]], 20($sp)
+; CHECK-DAG: lw $[[R2:[0-9]+]], 8($[[R0]])
+; CHECK-DAG: sw $[[R2]], 16($sp)
+; CHECK-DAG: lw $6, %lo(f1.s1)($[[R1]])
+; CHECK-DAG: lw $7, 4($[[R0]])
%agg.tmp10 = alloca %struct.S3, align 4
call void @callee1(float 2.000000e+01, %struct.S1* byval bitcast (%0* @f1.s1 to %struct.S1*)) nounwind
call void @callee2(%struct.S2* byval @f1.s2) nounwind
@@ -61,17 +62,17 @@ entry:
; CHECK: mfc1 $6, $f[[F0]]
%i2 = getelementptr inbounds %struct.S1* %s1, i32 0, i32 5
- %tmp = load i32* %i2, align 4, !tbaa !0
+ %tmp = load i32* %i2, align 4
%d = getelementptr inbounds %struct.S1* %s1, i32 0, i32 4
- %tmp1 = load double* %d, align 8, !tbaa !3
+ %tmp1 = load double* %d, align 8
%ll = getelementptr inbounds %struct.S1* %s1, i32 0, i32 3
- %tmp2 = load i64* %ll, align 8, !tbaa !4
+ %tmp2 = load i64* %ll, align 8
%i = getelementptr inbounds %struct.S1* %s1, i32 0, i32 2
- %tmp3 = load i32* %i, align 4, !tbaa !0
+ %tmp3 = load i32* %i, align 4
%s = getelementptr inbounds %struct.S1* %s1, i32 0, i32 1
- %tmp4 = load i16* %s, align 2, !tbaa !5
+ %tmp4 = load i16* %s, align 2
%c = getelementptr inbounds %struct.S1* %s1, i32 0, i32 0
- %tmp5 = load i8* %c, align 1, !tbaa !1
+ %tmp5 = load i8* %c, align 1
tail call void @callee4(i32 %tmp, double %tmp1, i64 %tmp2, i32 %tmp3, i16 signext %tmp4, i8 signext %tmp5, float %f) nounwind
ret void
}
@@ -90,9 +91,9 @@ entry:
; CHECK: sw $[[R0]], 24($sp)
%arrayidx = getelementptr inbounds %struct.S2* %s2, i32 0, i32 0, i32 0
- %tmp = load i32* %arrayidx, align 4, !tbaa !0
+ %tmp = load i32* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds %struct.S2* %s2, i32 0, i32 0, i32 3
- %tmp3 = load i32* %arrayidx2, align 4, !tbaa !0
+ %tmp3 = load i32* %arrayidx2, align 4
tail call void @callee4(i32 %tmp, double 2.000000e+00, i64 3, i32 %tmp3, i16 signext 4, i8 signext 5, float 6.000000e+00) nounwind
ret void
}
@@ -110,11 +111,11 @@ entry:
; CHECK: sw $[[R1]], 24($sp)
%i = getelementptr inbounds %struct.S1* %s1, i32 0, i32 2
- %tmp = load i32* %i, align 4, !tbaa !0
+ %tmp = load i32* %i, align 4
%i2 = getelementptr inbounds %struct.S1* %s1, i32 0, i32 5
- %tmp1 = load i32* %i2, align 4, !tbaa !0
+ %tmp1 = load i32* %i2, align 4
%c = getelementptr inbounds %struct.S3* %s3, i32 0, i32 0
- %tmp2 = load i8* %c, align 1, !tbaa !1
+ %tmp2 = load i8* %c, align 1
tail call void @callee4(i32 %tmp, double 2.000000e+00, i64 3, i32 %tmp1, i16 signext 4, i8 signext %tmp2, float 6.000000e+00) nounwind
ret void
}
@@ -128,10 +129,3 @@ entry:
}
declare void @f6(%struct.S4* nocapture byval, i64)
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !"double", metadata !1}
-!4 = metadata !{metadata !"long long", metadata !1}
-!5 = metadata !{metadata !"short", metadata !1}
diff --git a/test/CodeGen/Mips/powif64_16.ll b/test/CodeGen/Mips/powif64_16.ll
new file mode 100644
index 0000000..35a7ca9
--- /dev/null
+++ b/test/CodeGen/Mips/powif64_16.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -mips16-hard-float -soft-float -relocation-model=static < %s | FileCheck %s
+
+declare float @llvm.powi.f32(float %Val, i32 %power)
+declare double @llvm.powi.f64(double %Val, i32 %power)
+
+define float @foo_pow_f32(float %y, i32 %p) {
+ %1 = tail call float @llvm.powi.f32(float %y, i32 %p)
+; CHECK-NOT: .ent __call_stub_fp_llvm.powi.f32
+; CHECK-NOT: {{.*}} jal llvm.powi.f32
+ ret float %1
+}
+
+define double @foo_pow_f64(double %y, i32 %p) {
+ %1 = tail call double @llvm.powi.f64(double %y, i32 %p)
+; CHECK-NOT: .ent __call_stub_fp_llvm.powi.f64
+; CHECK-NOT: {{.*}} jal llvm.powi.f64
+ ret double %1
+}
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { nounwind readonly }
+
+!0 = metadata !{metadata !"double", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"int", metadata !1}
diff --git a/test/CodeGen/Mips/ra-allocatable.ll b/test/CodeGen/Mips/ra-allocatable.ll
index 7621788..afc5cb0 100644
--- a/test/CodeGen/Mips/ra-allocatable.ll
+++ b/test/CodeGen/Mips/ra-allocatable.ll
@@ -98,191 +98,186 @@ entry:
; CHECK: lw $ra, {{[0-9]+}}($sp) # 4-byte Folded Reload
; CHECK: jr $ra
- %0 = load i32* @a0, align 4, !tbaa !0
- %1 = load i32** @b0, align 4, !tbaa !3
- store i32 %0, i32* %1, align 4, !tbaa !0
- %2 = load i32* @a1, align 4, !tbaa !0
- %3 = load i32** @b1, align 4, !tbaa !3
- store i32 %2, i32* %3, align 4, !tbaa !0
- %4 = load i32* @a2, align 4, !tbaa !0
- %5 = load i32** @b2, align 4, !tbaa !3
- store i32 %4, i32* %5, align 4, !tbaa !0
- %6 = load i32* @a3, align 4, !tbaa !0
- %7 = load i32** @b3, align 4, !tbaa !3
- store i32 %6, i32* %7, align 4, !tbaa !0
- %8 = load i32* @a4, align 4, !tbaa !0
- %9 = load i32** @b4, align 4, !tbaa !3
- store i32 %8, i32* %9, align 4, !tbaa !0
- %10 = load i32* @a5, align 4, !tbaa !0
- %11 = load i32** @b5, align 4, !tbaa !3
- store i32 %10, i32* %11, align 4, !tbaa !0
- %12 = load i32* @a6, align 4, !tbaa !0
- %13 = load i32** @b6, align 4, !tbaa !3
- store i32 %12, i32* %13, align 4, !tbaa !0
- %14 = load i32* @a7, align 4, !tbaa !0
- %15 = load i32** @b7, align 4, !tbaa !3
- store i32 %14, i32* %15, align 4, !tbaa !0
- %16 = load i32* @a8, align 4, !tbaa !0
- %17 = load i32** @b8, align 4, !tbaa !3
- store i32 %16, i32* %17, align 4, !tbaa !0
- %18 = load i32* @a9, align 4, !tbaa !0
- %19 = load i32** @b9, align 4, !tbaa !3
- store i32 %18, i32* %19, align 4, !tbaa !0
- %20 = load i32* @a10, align 4, !tbaa !0
- %21 = load i32** @b10, align 4, !tbaa !3
- store i32 %20, i32* %21, align 4, !tbaa !0
- %22 = load i32* @a11, align 4, !tbaa !0
- %23 = load i32** @b11, align 4, !tbaa !3
- store i32 %22, i32* %23, align 4, !tbaa !0
- %24 = load i32* @a12, align 4, !tbaa !0
- %25 = load i32** @b12, align 4, !tbaa !3
- store i32 %24, i32* %25, align 4, !tbaa !0
- %26 = load i32* @a13, align 4, !tbaa !0
- %27 = load i32** @b13, align 4, !tbaa !3
- store i32 %26, i32* %27, align 4, !tbaa !0
- %28 = load i32* @a14, align 4, !tbaa !0
- %29 = load i32** @b14, align 4, !tbaa !3
- store i32 %28, i32* %29, align 4, !tbaa !0
- %30 = load i32* @a15, align 4, !tbaa !0
- %31 = load i32** @b15, align 4, !tbaa !3
- store i32 %30, i32* %31, align 4, !tbaa !0
- %32 = load i32* @a16, align 4, !tbaa !0
- %33 = load i32** @b16, align 4, !tbaa !3
- store i32 %32, i32* %33, align 4, !tbaa !0
- %34 = load i32* @a17, align 4, !tbaa !0
- %35 = load i32** @b17, align 4, !tbaa !3
- store i32 %34, i32* %35, align 4, !tbaa !0
- %36 = load i32* @a18, align 4, !tbaa !0
- %37 = load i32** @b18, align 4, !tbaa !3
- store i32 %36, i32* %37, align 4, !tbaa !0
- %38 = load i32* @a19, align 4, !tbaa !0
- %39 = load i32** @b19, align 4, !tbaa !3
- store i32 %38, i32* %39, align 4, !tbaa !0
- %40 = load i32* @a20, align 4, !tbaa !0
- %41 = load i32** @b20, align 4, !tbaa !3
- store i32 %40, i32* %41, align 4, !tbaa !0
- %42 = load i32* @a21, align 4, !tbaa !0
- %43 = load i32** @b21, align 4, !tbaa !3
- store i32 %42, i32* %43, align 4, !tbaa !0
- %44 = load i32* @a22, align 4, !tbaa !0
- %45 = load i32** @b22, align 4, !tbaa !3
- store i32 %44, i32* %45, align 4, !tbaa !0
- %46 = load i32* @a23, align 4, !tbaa !0
- %47 = load i32** @b23, align 4, !tbaa !3
- store i32 %46, i32* %47, align 4, !tbaa !0
- %48 = load i32* @a24, align 4, !tbaa !0
- %49 = load i32** @b24, align 4, !tbaa !3
- store i32 %48, i32* %49, align 4, !tbaa !0
- %50 = load i32* @a25, align 4, !tbaa !0
- %51 = load i32** @b25, align 4, !tbaa !3
- store i32 %50, i32* %51, align 4, !tbaa !0
- %52 = load i32* @a26, align 4, !tbaa !0
- %53 = load i32** @b26, align 4, !tbaa !3
- store i32 %52, i32* %53, align 4, !tbaa !0
- %54 = load i32* @a27, align 4, !tbaa !0
- %55 = load i32** @b27, align 4, !tbaa !3
- store i32 %54, i32* %55, align 4, !tbaa !0
- %56 = load i32* @a28, align 4, !tbaa !0
- %57 = load i32** @b28, align 4, !tbaa !3
- store i32 %56, i32* %57, align 4, !tbaa !0
- %58 = load i32* @a29, align 4, !tbaa !0
- %59 = load i32** @b29, align 4, !tbaa !3
- store i32 %58, i32* %59, align 4, !tbaa !0
- %60 = load i32* @a0, align 4, !tbaa !0
- %61 = load i32** @c0, align 4, !tbaa !3
- store i32 %60, i32* %61, align 4, !tbaa !0
- %62 = load i32* @a1, align 4, !tbaa !0
- %63 = load i32** @c1, align 4, !tbaa !3
- store i32 %62, i32* %63, align 4, !tbaa !0
- %64 = load i32* @a2, align 4, !tbaa !0
- %65 = load i32** @c2, align 4, !tbaa !3
- store i32 %64, i32* %65, align 4, !tbaa !0
- %66 = load i32* @a3, align 4, !tbaa !0
- %67 = load i32** @c3, align 4, !tbaa !3
- store i32 %66, i32* %67, align 4, !tbaa !0
- %68 = load i32* @a4, align 4, !tbaa !0
- %69 = load i32** @c4, align 4, !tbaa !3
- store i32 %68, i32* %69, align 4, !tbaa !0
- %70 = load i32* @a5, align 4, !tbaa !0
- %71 = load i32** @c5, align 4, !tbaa !3
- store i32 %70, i32* %71, align 4, !tbaa !0
- %72 = load i32* @a6, align 4, !tbaa !0
- %73 = load i32** @c6, align 4, !tbaa !3
- store i32 %72, i32* %73, align 4, !tbaa !0
- %74 = load i32* @a7, align 4, !tbaa !0
- %75 = load i32** @c7, align 4, !tbaa !3
- store i32 %74, i32* %75, align 4, !tbaa !0
- %76 = load i32* @a8, align 4, !tbaa !0
- %77 = load i32** @c8, align 4, !tbaa !3
- store i32 %76, i32* %77, align 4, !tbaa !0
- %78 = load i32* @a9, align 4, !tbaa !0
- %79 = load i32** @c9, align 4, !tbaa !3
- store i32 %78, i32* %79, align 4, !tbaa !0
- %80 = load i32* @a10, align 4, !tbaa !0
- %81 = load i32** @c10, align 4, !tbaa !3
- store i32 %80, i32* %81, align 4, !tbaa !0
- %82 = load i32* @a11, align 4, !tbaa !0
- %83 = load i32** @c11, align 4, !tbaa !3
- store i32 %82, i32* %83, align 4, !tbaa !0
- %84 = load i32* @a12, align 4, !tbaa !0
- %85 = load i32** @c12, align 4, !tbaa !3
- store i32 %84, i32* %85, align 4, !tbaa !0
- %86 = load i32* @a13, align 4, !tbaa !0
- %87 = load i32** @c13, align 4, !tbaa !3
- store i32 %86, i32* %87, align 4, !tbaa !0
- %88 = load i32* @a14, align 4, !tbaa !0
- %89 = load i32** @c14, align 4, !tbaa !3
- store i32 %88, i32* %89, align 4, !tbaa !0
- %90 = load i32* @a15, align 4, !tbaa !0
- %91 = load i32** @c15, align 4, !tbaa !3
- store i32 %90, i32* %91, align 4, !tbaa !0
- %92 = load i32* @a16, align 4, !tbaa !0
- %93 = load i32** @c16, align 4, !tbaa !3
- store i32 %92, i32* %93, align 4, !tbaa !0
- %94 = load i32* @a17, align 4, !tbaa !0
- %95 = load i32** @c17, align 4, !tbaa !3
- store i32 %94, i32* %95, align 4, !tbaa !0
- %96 = load i32* @a18, align 4, !tbaa !0
- %97 = load i32** @c18, align 4, !tbaa !3
- store i32 %96, i32* %97, align 4, !tbaa !0
- %98 = load i32* @a19, align 4, !tbaa !0
- %99 = load i32** @c19, align 4, !tbaa !3
- store i32 %98, i32* %99, align 4, !tbaa !0
- %100 = load i32* @a20, align 4, !tbaa !0
- %101 = load i32** @c20, align 4, !tbaa !3
- store i32 %100, i32* %101, align 4, !tbaa !0
- %102 = load i32* @a21, align 4, !tbaa !0
- %103 = load i32** @c21, align 4, !tbaa !3
- store i32 %102, i32* %103, align 4, !tbaa !0
- %104 = load i32* @a22, align 4, !tbaa !0
- %105 = load i32** @c22, align 4, !tbaa !3
- store i32 %104, i32* %105, align 4, !tbaa !0
- %106 = load i32* @a23, align 4, !tbaa !0
- %107 = load i32** @c23, align 4, !tbaa !3
- store i32 %106, i32* %107, align 4, !tbaa !0
- %108 = load i32* @a24, align 4, !tbaa !0
- %109 = load i32** @c24, align 4, !tbaa !3
- store i32 %108, i32* %109, align 4, !tbaa !0
- %110 = load i32* @a25, align 4, !tbaa !0
- %111 = load i32** @c25, align 4, !tbaa !3
- store i32 %110, i32* %111, align 4, !tbaa !0
- %112 = load i32* @a26, align 4, !tbaa !0
- %113 = load i32** @c26, align 4, !tbaa !3
- store i32 %112, i32* %113, align 4, !tbaa !0
- %114 = load i32* @a27, align 4, !tbaa !0
- %115 = load i32** @c27, align 4, !tbaa !3
- store i32 %114, i32* %115, align 4, !tbaa !0
- %116 = load i32* @a28, align 4, !tbaa !0
- %117 = load i32** @c28, align 4, !tbaa !3
- store i32 %116, i32* %117, align 4, !tbaa !0
- %118 = load i32* @a29, align 4, !tbaa !0
- %119 = load i32** @c29, align 4, !tbaa !3
- store i32 %118, i32* %119, align 4, !tbaa !0
- %120 = load i32* @a0, align 4, !tbaa !0
+ %0 = load i32* @a0, align 4
+ %1 = load i32** @b0, align 4
+ store i32 %0, i32* %1, align 4
+ %2 = load i32* @a1, align 4
+ %3 = load i32** @b1, align 4
+ store i32 %2, i32* %3, align 4
+ %4 = load i32* @a2, align 4
+ %5 = load i32** @b2, align 4
+ store i32 %4, i32* %5, align 4
+ %6 = load i32* @a3, align 4
+ %7 = load i32** @b3, align 4
+ store i32 %6, i32* %7, align 4
+ %8 = load i32* @a4, align 4
+ %9 = load i32** @b4, align 4
+ store i32 %8, i32* %9, align 4
+ %10 = load i32* @a5, align 4
+ %11 = load i32** @b5, align 4
+ store i32 %10, i32* %11, align 4
+ %12 = load i32* @a6, align 4
+ %13 = load i32** @b6, align 4
+ store i32 %12, i32* %13, align 4
+ %14 = load i32* @a7, align 4
+ %15 = load i32** @b7, align 4
+ store i32 %14, i32* %15, align 4
+ %16 = load i32* @a8, align 4
+ %17 = load i32** @b8, align 4
+ store i32 %16, i32* %17, align 4
+ %18 = load i32* @a9, align 4
+ %19 = load i32** @b9, align 4
+ store i32 %18, i32* %19, align 4
+ %20 = load i32* @a10, align 4
+ %21 = load i32** @b10, align 4
+ store i32 %20, i32* %21, align 4
+ %22 = load i32* @a11, align 4
+ %23 = load i32** @b11, align 4
+ store i32 %22, i32* %23, align 4
+ %24 = load i32* @a12, align 4
+ %25 = load i32** @b12, align 4
+ store i32 %24, i32* %25, align 4
+ %26 = load i32* @a13, align 4
+ %27 = load i32** @b13, align 4
+ store i32 %26, i32* %27, align 4
+ %28 = load i32* @a14, align 4
+ %29 = load i32** @b14, align 4
+ store i32 %28, i32* %29, align 4
+ %30 = load i32* @a15, align 4
+ %31 = load i32** @b15, align 4
+ store i32 %30, i32* %31, align 4
+ %32 = load i32* @a16, align 4
+ %33 = load i32** @b16, align 4
+ store i32 %32, i32* %33, align 4
+ %34 = load i32* @a17, align 4
+ %35 = load i32** @b17, align 4
+ store i32 %34, i32* %35, align 4
+ %36 = load i32* @a18, align 4
+ %37 = load i32** @b18, align 4
+ store i32 %36, i32* %37, align 4
+ %38 = load i32* @a19, align 4
+ %39 = load i32** @b19, align 4
+ store i32 %38, i32* %39, align 4
+ %40 = load i32* @a20, align 4
+ %41 = load i32** @b20, align 4
+ store i32 %40, i32* %41, align 4
+ %42 = load i32* @a21, align 4
+ %43 = load i32** @b21, align 4
+ store i32 %42, i32* %43, align 4
+ %44 = load i32* @a22, align 4
+ %45 = load i32** @b22, align 4
+ store i32 %44, i32* %45, align 4
+ %46 = load i32* @a23, align 4
+ %47 = load i32** @b23, align 4
+ store i32 %46, i32* %47, align 4
+ %48 = load i32* @a24, align 4
+ %49 = load i32** @b24, align 4
+ store i32 %48, i32* %49, align 4
+ %50 = load i32* @a25, align 4
+ %51 = load i32** @b25, align 4
+ store i32 %50, i32* %51, align 4
+ %52 = load i32* @a26, align 4
+ %53 = load i32** @b26, align 4
+ store i32 %52, i32* %53, align 4
+ %54 = load i32* @a27, align 4
+ %55 = load i32** @b27, align 4
+ store i32 %54, i32* %55, align 4
+ %56 = load i32* @a28, align 4
+ %57 = load i32** @b28, align 4
+ store i32 %56, i32* %57, align 4
+ %58 = load i32* @a29, align 4
+ %59 = load i32** @b29, align 4
+ store i32 %58, i32* %59, align 4
+ %60 = load i32* @a0, align 4
+ %61 = load i32** @c0, align 4
+ store i32 %60, i32* %61, align 4
+ %62 = load i32* @a1, align 4
+ %63 = load i32** @c1, align 4
+ store i32 %62, i32* %63, align 4
+ %64 = load i32* @a2, align 4
+ %65 = load i32** @c2, align 4
+ store i32 %64, i32* %65, align 4
+ %66 = load i32* @a3, align 4
+ %67 = load i32** @c3, align 4
+ store i32 %66, i32* %67, align 4
+ %68 = load i32* @a4, align 4
+ %69 = load i32** @c4, align 4
+ store i32 %68, i32* %69, align 4
+ %70 = load i32* @a5, align 4
+ %71 = load i32** @c5, align 4
+ store i32 %70, i32* %71, align 4
+ %72 = load i32* @a6, align 4
+ %73 = load i32** @c6, align 4
+ store i32 %72, i32* %73, align 4
+ %74 = load i32* @a7, align 4
+ %75 = load i32** @c7, align 4
+ store i32 %74, i32* %75, align 4
+ %76 = load i32* @a8, align 4
+ %77 = load i32** @c8, align 4
+ store i32 %76, i32* %77, align 4
+ %78 = load i32* @a9, align 4
+ %79 = load i32** @c9, align 4
+ store i32 %78, i32* %79, align 4
+ %80 = load i32* @a10, align 4
+ %81 = load i32** @c10, align 4
+ store i32 %80, i32* %81, align 4
+ %82 = load i32* @a11, align 4
+ %83 = load i32** @c11, align 4
+ store i32 %82, i32* %83, align 4
+ %84 = load i32* @a12, align 4
+ %85 = load i32** @c12, align 4
+ store i32 %84, i32* %85, align 4
+ %86 = load i32* @a13, align 4
+ %87 = load i32** @c13, align 4
+ store i32 %86, i32* %87, align 4
+ %88 = load i32* @a14, align 4
+ %89 = load i32** @c14, align 4
+ store i32 %88, i32* %89, align 4
+ %90 = load i32* @a15, align 4
+ %91 = load i32** @c15, align 4
+ store i32 %90, i32* %91, align 4
+ %92 = load i32* @a16, align 4
+ %93 = load i32** @c16, align 4
+ store i32 %92, i32* %93, align 4
+ %94 = load i32* @a17, align 4
+ %95 = load i32** @c17, align 4
+ store i32 %94, i32* %95, align 4
+ %96 = load i32* @a18, align 4
+ %97 = load i32** @c18, align 4
+ store i32 %96, i32* %97, align 4
+ %98 = load i32* @a19, align 4
+ %99 = load i32** @c19, align 4
+ store i32 %98, i32* %99, align 4
+ %100 = load i32* @a20, align 4
+ %101 = load i32** @c20, align 4
+ store i32 %100, i32* %101, align 4
+ %102 = load i32* @a21, align 4
+ %103 = load i32** @c21, align 4
+ store i32 %102, i32* %103, align 4
+ %104 = load i32* @a22, align 4
+ %105 = load i32** @c22, align 4
+ store i32 %104, i32* %105, align 4
+ %106 = load i32* @a23, align 4
+ %107 = load i32** @c23, align 4
+ store i32 %106, i32* %107, align 4
+ %108 = load i32* @a24, align 4
+ %109 = load i32** @c24, align 4
+ store i32 %108, i32* %109, align 4
+ %110 = load i32* @a25, align 4
+ %111 = load i32** @c25, align 4
+ store i32 %110, i32* %111, align 4
+ %112 = load i32* @a26, align 4
+ %113 = load i32** @c26, align 4
+ store i32 %112, i32* %113, align 4
+ %114 = load i32* @a27, align 4
+ %115 = load i32** @c27, align 4
+ store i32 %114, i32* %115, align 4
+ %116 = load i32* @a28, align 4
+ %117 = load i32** @c28, align 4
+ store i32 %116, i32* %117, align 4
+ %118 = load i32* @a29, align 4
+ %119 = load i32** @c29, align 4
+ store i32 %118, i32* %119, align 4
+ %120 = load i32* @a0, align 4
ret i32 %120
}
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"any pointer", metadata !1}
diff --git a/test/CodeGen/Mips/rotate.ll b/test/CodeGen/Mips/rotate.ll
index 4f3cfb7..813bbdf 100644
--- a/test/CodeGen/Mips/rotate.ll
+++ b/test/CodeGen/Mips/rotate.ll
@@ -1,6 +1,8 @@
; RUN: llc -march=mips -mcpu=mips32r2 < %s | FileCheck %s
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips32r2 -mattr=+mips16 -soft-float -mips16-hard-float < %s | FileCheck %s -check-prefix=mips16
; CHECK: rotrv $2, $4
+; mips16: .ent rot0
define i32 @rot0(i32 %a, i32 %b) nounwind readnone {
entry:
%shl = shl i32 %a, %b
@@ -11,6 +13,7 @@ entry:
}
; CHECK: rotr $2, $4, 22
+; mips16: .ent rot1
define i32 @rot1(i32 %a) nounwind readnone {
entry:
%shl = shl i32 %a, 10
@@ -20,6 +23,7 @@ entry:
}
; CHECK: rotrv $2, $4, $5
+; mips16: .ent rot2
define i32 @rot2(i32 %a, i32 %b) nounwind readnone {
entry:
%shr = lshr i32 %a, %b
@@ -30,6 +34,7 @@ entry:
}
; CHECK: rotr $2, $4, 10
+; mips16: .ent rot3
define i32 @rot3(i32 %a) nounwind readnone {
entry:
%shr = lshr i32 %a, 10
diff --git a/test/CodeGen/Mips/sel1c.ll b/test/CodeGen/Mips/sel1c.ll
new file mode 100644
index 0000000..4c4784d
--- /dev/null
+++ b/test/CodeGen/Mips/sel1c.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands < %s | FileCheck %s -check-prefix=cond-b-short
+
+@i = global i32 1, align 4
+@j = global i32 2, align 4
+@k = common global i32 0, align 4
+
+; Function Attrs: nounwind optsize
+define void @t() #0 {
+entry:
+ %0 = load i32* @i, align 4
+ %1 = load i32* @j, align 4
+ %cmp = icmp eq i32 %0, %1
+ %cond = select i1 %cmp, i32 1, i32 2
+ store i32 %cond, i32* @k, align 4
+ ret void
+; cond-b-short: bteqz $BB0_{{[0-9]+}} # 16 bit inst
+}
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+
+
diff --git a/test/CodeGen/Mips/sel2c.ll b/test/CodeGen/Mips/sel2c.ll
new file mode 100644
index 0000000..25dfaa9
--- /dev/null
+++ b/test/CodeGen/Mips/sel2c.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -soft-float -mips16-hard-float -relocation-model=pic -mips16-constant-islands < %s | FileCheck %s -check-prefix=cond-b-short
+
+@i = global i32 1, align 4
+@j = global i32 2, align 4
+@k = common global i32 0, align 4
+
+; Function Attrs: nounwind optsize
+define void @t() #0 {
+entry:
+ %0 = load i32* @i, align 4
+ %1 = load i32* @j, align 4
+ %cmp = icmp ne i32 %0, %1
+ %cond = select i1 %cmp, i32 1, i32 2
+ store i32 %cond, i32* @k, align 4
+; cond-b-short: btnez $BB0_{{[0-9]+}} # 16 bit inst
+ ret void
+}
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+
+
diff --git a/test/CodeGen/Mips/simplebr.ll b/test/CodeGen/Mips/simplebr.ll
new file mode 100644
index 0000000..a1d6367
--- /dev/null
+++ b/test/CodeGen/Mips/simplebr.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=mipsel-linux-gnu -march=mipsel -mcpu=mips16 -mips16-hard-float -soft-float -relocation-model=static < %s | FileCheck %s -check-prefix=CHECK-STATIC16
+
+; ModuleID = 'simplebr.c'
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-n32-S64"
+target triple = "mips--linux-gnu"
+
+@i = common global i32 0, align 4
+
+; Function Attrs: nounwind
+define void @foo() #0 {
+entry:
+ %0 = load i32* @i, align 4
+ %tobool = icmp ne i32 %0, 0
+ br i1 %tobool, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ call void bitcast (void (...)* @goo to void ()*)()
+ br label %if.end
+
+if.else: ; preds = %entry
+ call void bitcast (void (...)* @hoo to void ()*)()
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+; CHECK-STATIC16: b $BB{{[0-9]+}}_{{[0-9]+}} # 16 bit inst
+
+declare void @goo(...) #1
+
+declare void @hoo(...) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="true" }
+
+
diff --git a/test/CodeGen/Mips/stack-alignment.ll b/test/CodeGen/Mips/stack-alignment.ll
new file mode 100644
index 0000000..b18f966
--- /dev/null
+++ b/test/CodeGen/Mips/stack-alignment.ll
@@ -0,0 +1,13 @@
+; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=32
+; RUN: llc -march=mipsel -mattr=+fp64 < %s | FileCheck %s -check-prefix=32
+; RUN: llc -march=mips64el -mcpu=mips64 < %s | FileCheck %s -check-prefix=64
+
+; 32: addiu $sp, $sp, -8
+; 64: addiu $sp, $sp, -16
+
+define i32 @foo1() #0 {
+entry:
+ ret i32 14
+}
+
+attributes #0 = { "no-frame-pointer-elim"="true" }
diff --git a/test/CodeGen/Mips/tailcall.ll b/test/CodeGen/Mips/tailcall.ll
index bcd33fc..30f47ab 100644
--- a/test/CodeGen/Mips/tailcall.ll
+++ b/test/CodeGen/Mips/tailcall.ll
@@ -243,3 +243,16 @@ entry:
ret i32 %call
}
+; Check that there is a chain edge between the load and store nodes.
+;
+; PIC32-LABEL: caller14:
+; PIC32: lw ${{[0-9]+}}, 16($sp)
+; PIC32: sw $4, 16($sp)
+
+define void @caller14(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+entry:
+ tail call void @callee14(i32 %e, i32 %b, i32 %c, i32 %d, i32 %a)
+ ret void
+}
+
+declare void @callee14(i32, i32, i32, i32, i32)
diff --git a/test/CodeGen/Mips/tnaked.ll b/test/CodeGen/Mips/tnaked.ll
index edf1ecf..08f1ab5 100644
--- a/test/CodeGen/Mips/tnaked.ll
+++ b/test/CodeGen/Mips/tnaked.ll
@@ -25,5 +25,5 @@ entry:
; CHECK: .fmask 0x00000000,0
; CHECK: addiu $sp, $sp, -8
-attributes #0 = { naked noinline nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { naked noinline nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/NVPTX/bug17709.ll b/test/CodeGen/NVPTX/bug17709.ll
new file mode 100644
index 0000000..92f0fcb1
--- /dev/null
+++ b/test/CodeGen/NVPTX/bug17709.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+; ModuleID = '__kernelgen_main_module'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+define linker_private ptx_device { double, double } @__utils1_MOD_trace(%"struct.array2_complex(kind=8).43.5.57"* noalias %m) {
+entry:
+ ;unreachable
+ %t0 = insertvalue {double, double} undef, double 1.0, 0
+ %t1 = insertvalue {double, double} %t0, double 1.0, 1
+ ret { double, double } %t1
+}
+
+%struct.descriptor_dimension.0.52 = type { i64, i64, i64 }
+%"struct.array2_complex(kind=8).37.18.70" = type { i8*, i64, i64, [2 x %struct.descriptor_dimension.0.52] }
+%"struct.array2_complex(kind=8).43.5.57" = type { i8*, i64, i64, [2 x %struct.descriptor_dimension.0.52] }
+@replacementOfAlloca8 = private global %"struct.array2_complex(kind=8).37.18.70" zeroinitializer, align 4096
+
+; CHECK: .visible .entry __kernelgen_main
+define ptx_kernel void @__kernelgen_main(i32* nocapture %args, i32*) {
+entry:
+ %1 = tail call ptx_device { double, double } bitcast ({ double, double } (%"struct.array2_complex(kind=8).43.5.57"*)* @__utils1_MOD_trace to { double, double } (%"struct.array2_complex(kind=8).37.18.70"*)*)(%"struct.array2_complex(kind=8).37.18.70"* noalias @replacementOfAlloca8)
+ ret void
+}
+
diff --git a/test/CodeGen/NVPTX/callchain.ll b/test/CodeGen/NVPTX/callchain.ll
new file mode 100644
index 0000000..60b118b
--- /dev/null
+++ b/test/CodeGen/NVPTX/callchain.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target triple = "nvptx"
+
+define void @foo(i8* %ptr) {
+ %fnptr = bitcast i8* %ptr to void ()*
+; CHECK: prototype_0 : .callprototype ()_ ()
+ tail call void %fnptr()
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/constant-vectors.ll b/test/CodeGen/NVPTX/constant-vectors.ll
new file mode 100644
index 0000000..208c2d9
--- /dev/null
+++ b/test/CodeGen/NVPTX/constant-vectors.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target triple = "nvptx-nvidia-cuda"
+
+; CHECK: .visible .global .align 16 .b8 testArray[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+@testArray = constant [2 x <4 x i8>] [<4 x i8> <i8 0, i8 1, i8 2, i8 3>, <4 x i8> <i8 4, i8 5, i8 6, i8 7>], align 16
diff --git a/test/CodeGen/NVPTX/implicit-def.ll b/test/CodeGen/NVPTX/implicit-def.ll
new file mode 100644
index 0000000..06d3d56
--- /dev/null
+++ b/test/CodeGen/NVPTX/implicit-def.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -O0 -march=nvptx -mcpu=sm_20 -asm-verbose=1 | FileCheck %s
+
+; CHECK: // implicit-def: %f[[F0:[0-9]+]]
+; CHECK: add.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f[[F0]];
+define float @foo(float %a) {
+ %ret = fadd float %a, undef
+ ret float %ret
+}
+
diff --git a/test/CodeGen/NVPTX/inline-asm.ll b/test/CodeGen/NVPTX/inline-asm.ll
new file mode 100644
index 0000000..d76eb42
--- /dev/null
+++ b/test/CodeGen/NVPTX/inline-asm.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+
+define float @test(float %x) {
+entry:
+; CHECK: ex2.approx.ftz.f32 %f{{[0-9]+}}, %f{{[0-9]+}}
+ %0 = call float asm "ex2.approx.ftz.f32 $0, $1;", "=f,f"(float %x)
+ ret float %0
+}
diff --git a/test/CodeGen/NVPTX/lit.local.cfg b/test/CodeGen/NVPTX/lit.local.cfg
index 7180c84..85cf8c2 100644
--- a/test/CodeGen/NVPTX/lit.local.cfg
+++ b/test/CodeGen/NVPTX/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
targets = set(config.root.targets_to_build.split())
if not 'NVPTX' in targets:
config.unsupported = True
diff --git a/test/CodeGen/NVPTX/pr17529.ll b/test/CodeGen/NVPTX/pr17529.ll
new file mode 100644
index 0000000..a162142
--- /dev/null
+++ b/test/CodeGen/NVPTX/pr17529.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; Function Attrs: nounwind
+; CHECK: .func kernelgen_memcpy
+define ptx_device void @kernelgen_memcpy(i8* nocapture %dst) #0 {
+entry:
+ br i1 undef, label %for.end, label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+ %scevgep9 = getelementptr i8* %dst, i64 %index
+ %scevgep910 = bitcast i8* %scevgep9 to <4 x i8>*
+ store <4 x i8> undef, <4 x i8>* %scevgep910, align 1
+ %index.next = add i64 %index, 4
+ %0 = icmp eq i64 undef, %index.next
+ br i1 %0, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ br i1 undef, label %for.end, label %for.body.preheader1
+
+for.body.preheader1: ; preds = %middle.block
+ %scevgep2 = getelementptr i8* %dst, i64 0
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.preheader1
+ %lsr.iv3 = phi i8* [ %scevgep2, %for.body.preheader1 ], [ %scevgep4, %for.body ]
+ store i8 undef, i8* %lsr.iv3, align 1
+ %scevgep4 = getelementptr i8* %lsr.iv3, i64 1
+ br label %for.body
+
+for.end: ; preds = %middle.block, %entry
+ ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/NVPTX/vec8.ll b/test/CodeGen/NVPTX/vec8.ll
new file mode 100644
index 0000000..03f5cfc
--- /dev/null
+++ b/test/CodeGen/NVPTX/vec8.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+target triple = "nvptx-unknown-cuda"
+
+; CHECK: .visible .func foo
+define void @foo(<8 x i8> %a, i8* %b) {
+ %t0 = extractelement <8 x i8> %a, i32 0
+; CHECK-DAG: ld.param.v4.u8
+; CHECK-DAG: ld.param.u32
+ store i8 %t0, i8* %b
+ ret void
+}
+
diff --git a/test/CodeGen/PowerPC/2010-02-12-saveCR.ll b/test/CodeGen/PowerPC/2010-02-12-saveCR.ll
index 097611a..b0c37b8 100644
--- a/test/CodeGen/PowerPC/2010-02-12-saveCR.ll
+++ b/test/CodeGen/PowerPC/2010-02-12-saveCR.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=powerpc-apple-darwin -mcpu=g4 | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc-apple-darwin -mcpu=g4 -break-anti-dependencies=none | FileCheck %s
; ModuleID = 'hh.c'
target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128-n32"
target triple = "powerpc-apple-darwin9.6"
diff --git a/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.ll b/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.ll
index 635062b..9bf25c8 100644
--- a/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.ll
+++ b/test/CodeGen/PowerPC/2013-07-01-PHIElimBug.ll
@@ -25,4 +25,4 @@ if.end1018: ; preds = %for.end957, %for.en
ret void
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/PowerPC/Frames-alloca.ll b/test/CodeGen/PowerPC/Frames-alloca.ll
index 28dd08c..4588bc0 100644
--- a/test/CodeGen/PowerPC/Frames-alloca.ll
+++ b/test/CodeGen/PowerPC/Frames-alloca.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=PPC32
-; RUN: llc < %s -march=ppc64 -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=PPC64
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -disable-fp-elim | FileCheck %s -check-prefix=PPC32-NOFP
-; RUN: llc < %s -march=ppc64 -mtriple=powerpc-apple-darwin8 -disable-fp-elim | FileCheck %s -check-prefix=PPC64-NOFP
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=PPC32
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=PPC32-RS
-; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -disable-fp-elim | FileCheck %s -check-prefix=PPC32-RS-NOFP
+; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=CHECK-PPC32
+; RUN: llc < %s -march=ppc64 -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=CHECK-PPC64
+; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -disable-fp-elim | FileCheck %s -check-prefix=CHECK-PPC32-NOFP
+; RUN: llc < %s -march=ppc64 -mtriple=powerpc-apple-darwin8 -disable-fp-elim | FileCheck %s -check-prefix=CHECK-PPC64-NOFP
+; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=CHECK-PPC32
+; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 | FileCheck %s -check-prefix=CHECK-PPC32-RS
+; RUN: llc < %s -march=ppc32 -mtriple=powerpc-apple-darwin8 -disable-fp-elim | FileCheck %s -check-prefix=CHECK-PPC32-RS-NOFP
; CHECK-PPC32: stw r31, -4(r1)
; CHECK-PPC32: lwz r1, 0(r1)
diff --git a/test/CodeGen/PowerPC/addrfuncstr.ll b/test/CodeGen/PowerPC/addrfuncstr.ll
index 60c02d4..6750b5c 100644
--- a/test/CodeGen/PowerPC/addrfuncstr.ll
+++ b/test/CodeGen/PowerPC/addrfuncstr.ll
@@ -23,5 +23,5 @@ declare i64 @fread(i8*, i64, i64, %struct._IO_FILE*) #1
; CHECK: .section .data.rel.ro
; CHECK: .quad fread
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/PowerPC/asym-regclass-copy.ll b/test/CodeGen/PowerPC/asym-regclass-copy.ll
index d04a6c9..b19125b 100644
--- a/test/CodeGen/PowerPC/asym-regclass-copy.ll
+++ b/test/CodeGen/PowerPC/asym-regclass-copy.ll
@@ -52,5 +52,5 @@ declare void @free(i8* nocapture) #0
declare i64 @strtol(i8*, i8** nocapture, i32 signext) #0
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind }
diff --git a/test/CodeGen/PowerPC/bdzlr.ll b/test/CodeGen/PowerPC/bdzlr.ll
index 656a858..e487558 100644
--- a/test/CodeGen/PowerPC/bdzlr.ll
+++ b/test/CodeGen/PowerPC/bdzlr.ll
@@ -35,15 +35,15 @@ for.body: ; preds = %for.body.for.body_c
%0 = phi %struct.lua_TValue.17.692* [ undef, %for.body.lr.ph ], [ %.pre, %for.body.for.body_crit_edge ]
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body.for.body_crit_edge ]
%tt = getelementptr inbounds %struct.lua_TValue.17.692* %0, i64 %indvars.iv, i32 1
- %1 = load i32* %tt, align 4, !tbaa !0
- store i32 %1, i32* undef, align 4, !tbaa !0
+ %1 = load i32* %tt, align 4
+ store i32 %1, i32* undef, align 4
%indvars.iv.next = add i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %n
br i1 %exitcond, label %for.end, label %for.body.for.body_crit_edge
for.body.for.body_crit_edge: ; preds = %for.body
- %.pre = load %struct.lua_TValue.17.692** undef, align 8, !tbaa !3
+ %.pre = load %struct.lua_TValue.17.692** undef, align 8
br label %for.body
for.end: ; preds = %for.body, %if.end, %entry
@@ -57,8 +57,3 @@ for.end: ; preds = %for.body, %if.end,
}
attributes #0 = { nounwind }
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"any pointer", metadata !1}
diff --git a/test/CodeGen/PowerPC/copysignl.ll b/test/CodeGen/PowerPC/copysignl.ll
new file mode 100644
index 0000000..4b801b7
--- /dev/null
+++ b/test/CodeGen/PowerPC/copysignl.ll
@@ -0,0 +1,67 @@
+; RUN: llc -mcpu=pwr7 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define double @foo_d_ll(ppc_fp128 %a, ppc_fp128 %b) #0 {
+entry:
+ %call = tail call ppc_fp128 @copysignl(ppc_fp128 %a, ppc_fp128 %b) #0
+ %conv = fptrunc ppc_fp128 %call to double
+ ret double %conv
+
+; CHECK-LABEL: @foo_d_ll
+; CHECK: fcpsgn 1, 3, 1
+; CHECK: blr
+}
+
+declare ppc_fp128 @copysignl(ppc_fp128, ppc_fp128) #0
+
+define double @foo_dl(double %a, ppc_fp128 %b) #0 {
+entry:
+ %conv = fptrunc ppc_fp128 %b to double
+ %call = tail call double @copysign(double %a, double %conv) #0
+ ret double %call
+
+; CHECK-LABEL: @foo_dl
+; CHECK: fcpsgn 1, 2, 1
+; CHECK: blr
+}
+
+declare double @copysign(double, double) #0
+
+define ppc_fp128 @foo_ll(double %a, ppc_fp128 %b) #0 {
+entry:
+ %conv = fpext double %a to ppc_fp128
+ %call = tail call ppc_fp128 @copysignl(ppc_fp128 %conv, ppc_fp128 %b) #0
+ ret ppc_fp128 %call
+
+; CHECK-LABEL: @foo_ll
+; CHECK: bl copysignl
+; CHECK: blr
+}
+
+define ppc_fp128 @foo_ld(double %a, double %b) #0 {
+entry:
+ %conv = fpext double %a to ppc_fp128
+ %conv1 = fpext double %b to ppc_fp128
+ %call = tail call ppc_fp128 @copysignl(ppc_fp128 %conv, ppc_fp128 %conv1) #0
+ ret ppc_fp128 %call
+
+; CHECK-LABEL: @foo_ld
+; CHECK: bl copysignl
+; CHECK: blr
+}
+
+define ppc_fp128 @foo_lf(double %a, float %b) #0 {
+entry:
+ %conv = fpext double %a to ppc_fp128
+ %conv1 = fpext float %b to ppc_fp128
+ %call = tail call ppc_fp128 @copysignl(ppc_fp128 %conv, ppc_fp128 %conv1) #0
+ ret ppc_fp128 %call
+
+; CHECK-LABEL: @foo_lf
+; CHECK: bl copysignl
+; CHECK: blr
+}
+
+attributes #0 = { nounwind readnone }
+
diff --git a/test/CodeGen/PowerPC/cr-spills.ll b/test/CodeGen/PowerPC/cr-spills.ll
index d6df7a2..be0dbad 100644
--- a/test/CodeGen/PowerPC/cr-spills.ll
+++ b/test/CodeGen/PowerPC/cr-spills.ll
@@ -53,11 +53,11 @@ for.cond286.preheader: ; preds = %for.body252
for.cond290.preheader: ; preds = %for.end520, %for.cond286.preheader
%srcptr.31595 = phi i16* [ getelementptr inbounds ([768 x i16]* @SetupFastFullPelSearch.orig_pels, i64 0, i64 0), %for.cond286.preheader ], [ null, %for.end520 ]
- %1 = load i32* undef, align 4, !tbaa !0
- %2 = load i32* @weight_luma, align 4, !tbaa !0
- %3 = load i32* @wp_luma_round, align 4, !tbaa !0
- %4 = load i32* @luma_log_weight_denom, align 4, !tbaa !0
- %5 = load i32* @offset_luma, align 4, !tbaa !0
+ %1 = load i32* undef, align 4
+ %2 = load i32* @weight_luma, align 4
+ %3 = load i32* @wp_luma_round, align 4
+ %4 = load i32* @luma_log_weight_denom, align 4
+ %5 = load i32* @offset_luma, align 4
%incdec.ptr502.sum = add i64 undef, 16
br label %for.body293
@@ -68,7 +68,7 @@ for.body293: ; preds = %for.body293, %for.c
%LineSadBlk1.01587 = phi i32 [ 0, %for.cond290.preheader ], [ %add402, %for.body293 ]
%LineSadBlk3.01586 = phi i32 [ 0, %for.cond290.preheader ], [ %add514, %for.body293 ]
%LineSadBlk2.01585 = phi i32 [ 0, %for.cond290.preheader ], [ %add458, %for.body293 ]
- %6 = load i16* %refptr.11590, align 2, !tbaa !3
+ %6 = load i16* %refptr.11590, align 2
%conv294 = zext i16 %6 to i32
%mul295 = mul nsw i32 %conv294, %2
%add296 = add nsw i32 %mul295, %3
@@ -78,16 +78,16 @@ for.body293: ; preds = %for.body293, %for.c
%cond.i.i1514 = select i1 %cmp.i.i1513, i32 %add297, i32 0
%cmp.i4.i1515 = icmp slt i32 %cond.i.i1514, %1
%cond.i5.i1516 = select i1 %cmp.i4.i1515, i32 %cond.i.i1514, i32 %1
- %7 = load i16* %srcptr.41591, align 2, !tbaa !3
+ %7 = load i16* %srcptr.41591, align 2
%conv300 = zext i16 %7 to i32
%sub301 = sub nsw i32 %cond.i5.i1516, %conv300
%idxprom302 = sext i32 %sub301 to i64
%arrayidx303 = getelementptr inbounds i32* %cond, i64 %idxprom302
- %8 = load i32* %arrayidx303, align 4, !tbaa !0
+ %8 = load i32* %arrayidx303, align 4
%add304 = add nsw i32 %8, %LineSadBlk0.01588
- %9 = load i32* undef, align 4, !tbaa !0
+ %9 = load i32* undef, align 4
%add318 = add nsw i32 %add304, %9
- %10 = load i16* undef, align 2, !tbaa !3
+ %10 = load i16* undef, align 2
%conv321 = zext i16 %10 to i32
%mul322 = mul nsw i32 %conv321, %2
%add323 = add nsw i32 %mul322, %3
@@ -100,22 +100,22 @@ for.body293: ; preds = %for.body293, %for.c
%sub329 = sub nsw i32 %cond.i5.i1508, 0
%idxprom330 = sext i32 %sub329 to i64
%arrayidx331 = getelementptr inbounds i32* %cond, i64 %idxprom330
- %11 = load i32* %arrayidx331, align 4, !tbaa !0
+ %11 = load i32* %arrayidx331, align 4
%add332 = add nsw i32 %add318, %11
%cmp.i.i1501 = icmp sgt i32 undef, 0
%cond.i.i1502 = select i1 %cmp.i.i1501, i32 undef, i32 0
%cmp.i4.i1503 = icmp slt i32 %cond.i.i1502, %1
%cond.i5.i1504 = select i1 %cmp.i4.i1503, i32 %cond.i.i1502, i32 %1
%incdec.ptr341 = getelementptr inbounds i16* %srcptr.41591, i64 4
- %12 = load i16* null, align 2, !tbaa !3
+ %12 = load i16* null, align 2
%conv342 = zext i16 %12 to i32
%sub343 = sub nsw i32 %cond.i5.i1504, %conv342
%idxprom344 = sext i32 %sub343 to i64
%arrayidx345 = getelementptr inbounds i32* %cond, i64 %idxprom344
- %13 = load i32* %arrayidx345, align 4, !tbaa !0
+ %13 = load i32* %arrayidx345, align 4
%add346 = add nsw i32 %add332, %13
%incdec.ptr348 = getelementptr inbounds i16* %refptr.11590, i64 5
- %14 = load i16* null, align 2, !tbaa !3
+ %14 = load i16* null, align 2
%conv349 = zext i16 %14 to i32
%mul350 = mul nsw i32 %conv349, %2
%add351 = add nsw i32 %mul350, %3
@@ -126,15 +126,15 @@ for.body293: ; preds = %for.body293, %for.c
%cmp.i4.i1499 = icmp slt i32 %cond.i.i1498, %1
%cond.i5.i1500 = select i1 %cmp.i4.i1499, i32 %cond.i.i1498, i32 %1
%incdec.ptr355 = getelementptr inbounds i16* %srcptr.41591, i64 5
- %15 = load i16* %incdec.ptr341, align 2, !tbaa !3
+ %15 = load i16* %incdec.ptr341, align 2
%conv356 = zext i16 %15 to i32
%sub357 = sub nsw i32 %cond.i5.i1500, %conv356
%idxprom358 = sext i32 %sub357 to i64
%arrayidx359 = getelementptr inbounds i32* %cond, i64 %idxprom358
- %16 = load i32* %arrayidx359, align 4, !tbaa !0
+ %16 = load i32* %arrayidx359, align 4
%add360 = add nsw i32 %16, %LineSadBlk1.01587
%incdec.ptr362 = getelementptr inbounds i16* %refptr.11590, i64 6
- %17 = load i16* %incdec.ptr348, align 2, !tbaa !3
+ %17 = load i16* %incdec.ptr348, align 2
%conv363 = zext i16 %17 to i32
%mul364 = mul nsw i32 %conv363, %2
%add365 = add nsw i32 %mul364, %3
@@ -145,15 +145,15 @@ for.body293: ; preds = %for.body293, %for.c
%cmp.i4.i1495 = icmp slt i32 %cond.i.i1494, %1
%cond.i5.i1496 = select i1 %cmp.i4.i1495, i32 %cond.i.i1494, i32 %1
%incdec.ptr369 = getelementptr inbounds i16* %srcptr.41591, i64 6
- %18 = load i16* %incdec.ptr355, align 2, !tbaa !3
+ %18 = load i16* %incdec.ptr355, align 2
%conv370 = zext i16 %18 to i32
%sub371 = sub nsw i32 %cond.i5.i1496, %conv370
%idxprom372 = sext i32 %sub371 to i64
%arrayidx373 = getelementptr inbounds i32* %cond, i64 %idxprom372
- %19 = load i32* %arrayidx373, align 4, !tbaa !0
+ %19 = load i32* %arrayidx373, align 4
%add374 = add nsw i32 %add360, %19
%incdec.ptr376 = getelementptr inbounds i16* %refptr.11590, i64 7
- %20 = load i16* %incdec.ptr362, align 2, !tbaa !3
+ %20 = load i16* %incdec.ptr362, align 2
%conv377 = zext i16 %20 to i32
%mul378 = mul nsw i32 %conv377, %2
%add379 = add nsw i32 %mul378, %3
@@ -164,14 +164,14 @@ for.body293: ; preds = %for.body293, %for.c
%cmp.i4.i1491 = icmp slt i32 %cond.i.i1490, %1
%cond.i5.i1492 = select i1 %cmp.i4.i1491, i32 %cond.i.i1490, i32 %1
%incdec.ptr383 = getelementptr inbounds i16* %srcptr.41591, i64 7
- %21 = load i16* %incdec.ptr369, align 2, !tbaa !3
+ %21 = load i16* %incdec.ptr369, align 2
%conv384 = zext i16 %21 to i32
%sub385 = sub nsw i32 %cond.i5.i1492, %conv384
%idxprom386 = sext i32 %sub385 to i64
%arrayidx387 = getelementptr inbounds i32* %cond, i64 %idxprom386
- %22 = load i32* %arrayidx387, align 4, !tbaa !0
+ %22 = load i32* %arrayidx387, align 4
%add388 = add nsw i32 %add374, %22
- %23 = load i16* %incdec.ptr376, align 2, !tbaa !3
+ %23 = load i16* %incdec.ptr376, align 2
%conv391 = zext i16 %23 to i32
%mul392 = mul nsw i32 %conv391, %2
%add395 = add nsw i32 0, %5
@@ -180,25 +180,25 @@ for.body293: ; preds = %for.body293, %for.c
%cmp.i4.i1487 = icmp slt i32 %cond.i.i1486, %1
%cond.i5.i1488 = select i1 %cmp.i4.i1487, i32 %cond.i.i1486, i32 %1
%incdec.ptr397 = getelementptr inbounds i16* %srcptr.41591, i64 8
- %24 = load i16* %incdec.ptr383, align 2, !tbaa !3
+ %24 = load i16* %incdec.ptr383, align 2
%conv398 = zext i16 %24 to i32
%sub399 = sub nsw i32 %cond.i5.i1488, %conv398
%idxprom400 = sext i32 %sub399 to i64
%arrayidx401 = getelementptr inbounds i32* %cond, i64 %idxprom400
- %25 = load i32* %arrayidx401, align 4, !tbaa !0
+ %25 = load i32* %arrayidx401, align 4
%add402 = add nsw i32 %add388, %25
%incdec.ptr404 = getelementptr inbounds i16* %refptr.11590, i64 9
%cmp.i4.i1483 = icmp slt i32 undef, %1
%cond.i5.i1484 = select i1 %cmp.i4.i1483, i32 undef, i32 %1
- %26 = load i16* %incdec.ptr397, align 2, !tbaa !3
+ %26 = load i16* %incdec.ptr397, align 2
%conv412 = zext i16 %26 to i32
%sub413 = sub nsw i32 %cond.i5.i1484, %conv412
%idxprom414 = sext i32 %sub413 to i64
%arrayidx415 = getelementptr inbounds i32* %cond, i64 %idxprom414
- %27 = load i32* %arrayidx415, align 4, !tbaa !0
+ %27 = load i32* %arrayidx415, align 4
%add416 = add nsw i32 %27, %LineSadBlk2.01585
%incdec.ptr418 = getelementptr inbounds i16* %refptr.11590, i64 10
- %28 = load i16* %incdec.ptr404, align 2, !tbaa !3
+ %28 = load i16* %incdec.ptr404, align 2
%conv419 = zext i16 %28 to i32
%mul420 = mul nsw i32 %conv419, %2
%add421 = add nsw i32 %mul420, %3
@@ -212,10 +212,10 @@ for.body293: ; preds = %for.body293, %for.c
%sub427 = sub nsw i32 %cond.i5.i1480, 0
%idxprom428 = sext i32 %sub427 to i64
%arrayidx429 = getelementptr inbounds i32* %cond, i64 %idxprom428
- %29 = load i32* %arrayidx429, align 4, !tbaa !0
+ %29 = load i32* %arrayidx429, align 4
%add430 = add nsw i32 %add416, %29
%incdec.ptr432 = getelementptr inbounds i16* %refptr.11590, i64 11
- %30 = load i16* %incdec.ptr418, align 2, !tbaa !3
+ %30 = load i16* %incdec.ptr418, align 2
%conv433 = zext i16 %30 to i32
%mul434 = mul nsw i32 %conv433, %2
%add435 = add nsw i32 %mul434, %3
@@ -225,15 +225,15 @@ for.body293: ; preds = %for.body293, %for.c
%cond.i.i1474 = select i1 %cmp.i.i1473, i32 %add437, i32 0
%cmp.i4.i1475 = icmp slt i32 %cond.i.i1474, %1
%cond.i5.i1476 = select i1 %cmp.i4.i1475, i32 %cond.i.i1474, i32 %1
- %31 = load i16* %incdec.ptr425, align 2, !tbaa !3
+ %31 = load i16* %incdec.ptr425, align 2
%conv440 = zext i16 %31 to i32
%sub441 = sub nsw i32 %cond.i5.i1476, %conv440
%idxprom442 = sext i32 %sub441 to i64
%arrayidx443 = getelementptr inbounds i32* %cond, i64 %idxprom442
- %32 = load i32* %arrayidx443, align 4, !tbaa !0
+ %32 = load i32* %arrayidx443, align 4
%add444 = add nsw i32 %add430, %32
%incdec.ptr446 = getelementptr inbounds i16* %refptr.11590, i64 12
- %33 = load i16* %incdec.ptr432, align 2, !tbaa !3
+ %33 = load i16* %incdec.ptr432, align 2
%conv447 = zext i16 %33 to i32
%mul448 = mul nsw i32 %conv447, %2
%add449 = add nsw i32 %mul448, %3
@@ -244,15 +244,15 @@ for.body293: ; preds = %for.body293, %for.c
%cmp.i4.i1471 = icmp slt i32 %cond.i.i1470, %1
%cond.i5.i1472 = select i1 %cmp.i4.i1471, i32 %cond.i.i1470, i32 %1
%incdec.ptr453 = getelementptr inbounds i16* %srcptr.41591, i64 12
- %34 = load i16* undef, align 2, !tbaa !3
+ %34 = load i16* undef, align 2
%conv454 = zext i16 %34 to i32
%sub455 = sub nsw i32 %cond.i5.i1472, %conv454
%idxprom456 = sext i32 %sub455 to i64
%arrayidx457 = getelementptr inbounds i32* %cond, i64 %idxprom456
- %35 = load i32* %arrayidx457, align 4, !tbaa !0
+ %35 = load i32* %arrayidx457, align 4
%add458 = add nsw i32 %add444, %35
%incdec.ptr460 = getelementptr inbounds i16* %refptr.11590, i64 13
- %36 = load i16* %incdec.ptr446, align 2, !tbaa !3
+ %36 = load i16* %incdec.ptr446, align 2
%conv461 = zext i16 %36 to i32
%mul462 = mul nsw i32 %conv461, %2
%add463 = add nsw i32 %mul462, %3
@@ -263,12 +263,12 @@ for.body293: ; preds = %for.body293, %for.c
%cmp.i4.i1467 = icmp slt i32 %cond.i.i1466, %1
%cond.i5.i1468 = select i1 %cmp.i4.i1467, i32 %cond.i.i1466, i32 %1
%incdec.ptr467 = getelementptr inbounds i16* %srcptr.41591, i64 13
- %37 = load i16* %incdec.ptr453, align 2, !tbaa !3
+ %37 = load i16* %incdec.ptr453, align 2
%conv468 = zext i16 %37 to i32
%sub469 = sub nsw i32 %cond.i5.i1468, %conv468
%idxprom470 = sext i32 %sub469 to i64
%arrayidx471 = getelementptr inbounds i32* %cond, i64 %idxprom470
- %38 = load i32* %arrayidx471, align 4, !tbaa !0
+ %38 = load i32* %arrayidx471, align 4
%add472 = add nsw i32 %38, %LineSadBlk3.01586
%incdec.ptr474 = getelementptr inbounds i16* %refptr.11590, i64 14
%add477 = add nsw i32 0, %3
@@ -279,15 +279,15 @@ for.body293: ; preds = %for.body293, %for.c
%cmp.i4.i1463 = icmp slt i32 %cond.i.i1462, %1
%cond.i5.i1464 = select i1 %cmp.i4.i1463, i32 %cond.i.i1462, i32 %1
%incdec.ptr481 = getelementptr inbounds i16* %srcptr.41591, i64 14
- %39 = load i16* %incdec.ptr467, align 2, !tbaa !3
+ %39 = load i16* %incdec.ptr467, align 2
%conv482 = zext i16 %39 to i32
%sub483 = sub nsw i32 %cond.i5.i1464, %conv482
%idxprom484 = sext i32 %sub483 to i64
%arrayidx485 = getelementptr inbounds i32* %cond, i64 %idxprom484
- %40 = load i32* %arrayidx485, align 4, !tbaa !0
+ %40 = load i32* %arrayidx485, align 4
%add486 = add nsw i32 %add472, %40
%incdec.ptr488 = getelementptr inbounds i16* %refptr.11590, i64 15
- %41 = load i16* %incdec.ptr474, align 2, !tbaa !3
+ %41 = load i16* %incdec.ptr474, align 2
%conv489 = zext i16 %41 to i32
%mul490 = mul nsw i32 %conv489, %2
%add491 = add nsw i32 %mul490, %3
@@ -298,14 +298,14 @@ for.body293: ; preds = %for.body293, %for.c
%cmp.i4.i1459 = icmp slt i32 %cond.i.i1458, %1
%cond.i5.i1460 = select i1 %cmp.i4.i1459, i32 %cond.i.i1458, i32 %1
%incdec.ptr495 = getelementptr inbounds i16* %srcptr.41591, i64 15
- %42 = load i16* %incdec.ptr481, align 2, !tbaa !3
+ %42 = load i16* %incdec.ptr481, align 2
%conv496 = zext i16 %42 to i32
%sub497 = sub nsw i32 %cond.i5.i1460, %conv496
%idxprom498 = sext i32 %sub497 to i64
%arrayidx499 = getelementptr inbounds i32* %cond, i64 %idxprom498
- %43 = load i32* %arrayidx499, align 4, !tbaa !0
+ %43 = load i32* %arrayidx499, align 4
%add500 = add nsw i32 %add486, %43
- %44 = load i16* %incdec.ptr488, align 2, !tbaa !3
+ %44 = load i16* %incdec.ptr488, align 2
%conv503 = zext i16 %44 to i32
%mul504 = mul nsw i32 %conv503, %2
%add505 = add nsw i32 %mul504, %3
@@ -315,22 +315,22 @@ for.body293: ; preds = %for.body293, %for.c
%cond.i.i1454 = select i1 %cmp.i.i1453, i32 %add507, i32 0
%cmp.i4.i1455 = icmp slt i32 %cond.i.i1454, %1
%cond.i5.i1456 = select i1 %cmp.i4.i1455, i32 %cond.i.i1454, i32 %1
- %45 = load i16* %incdec.ptr495, align 2, !tbaa !3
+ %45 = load i16* %incdec.ptr495, align 2
%conv510 = zext i16 %45 to i32
%sub511 = sub nsw i32 %cond.i5.i1456, %conv510
%idxprom512 = sext i32 %sub511 to i64
%arrayidx513 = getelementptr inbounds i32* %cond, i64 %idxprom512
- %46 = load i32* %arrayidx513, align 4, !tbaa !0
+ %46 = load i32* %arrayidx513, align 4
%add514 = add nsw i32 %add500, %46
%add.ptr517 = getelementptr inbounds i16* %refptr.11590, i64 %incdec.ptr502.sum
%exitcond1692 = icmp eq i32 undef, 4
br i1 %exitcond1692, label %for.end520, label %for.body293
for.end520: ; preds = %for.body293
- store i32 %add346, i32* undef, align 4, !tbaa !0
- store i32 %add402, i32* undef, align 4, !tbaa !0
- store i32 %add458, i32* undef, align 4, !tbaa !0
- store i32 %add514, i32* null, align 4, !tbaa !0
+ store i32 %add346, i32* undef, align 4
+ store i32 %add402, i32* undef, align 4
+ store i32 %add458, i32* undef, align 4
+ store i32 %add514, i32* null, align 4
br i1 undef, label %for.end543, label %for.cond290.preheader
for.end543: ; preds = %for.end520
@@ -400,10 +400,5 @@ for.end999: ; preds = %for.inc997
ret void
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind }
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"short", metadata !1}
diff --git a/test/CodeGen/PowerPC/ctr-cleanup.ll b/test/CodeGen/PowerPC/ctr-cleanup.ll
index 04e4ffb..1a669eb 100644
--- a/test/CodeGen/PowerPC/ctr-cleanup.ll
+++ b/test/CodeGen/PowerPC/ctr-cleanup.ll
@@ -22,4 +22,4 @@ for.end: ; preds = %for.body, %entry
ret void
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/PowerPC/ctrloop-cpsgn.ll b/test/CodeGen/PowerPC/ctrloop-cpsgn.ll
new file mode 100644
index 0000000..2f04409
--- /dev/null
+++ b/test/CodeGen/PowerPC/ctrloop-cpsgn.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -mcpu=ppc | FileCheck %s
+
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32"
+target triple = "powerpc-unknown-linux-gnu"
+
+define ppc_fp128 @foo(ppc_fp128* nocapture %n, ppc_fp128 %d) nounwind readonly {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.06 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %x.05 = phi ppc_fp128 [ %d, %entry ], [ %conv, %for.body ]
+ %arrayidx = getelementptr inbounds ppc_fp128* %n, i32 %i.06
+ %0 = load ppc_fp128* %arrayidx, align 8
+ %conv = tail call ppc_fp128 @copysignl(ppc_fp128 %x.05, ppc_fp128 %d) nounwind readonly
+ %inc = add nsw i32 %i.06, 1
+ %exitcond = icmp eq i32 %inc, 2048
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret ppc_fp128 %conv
+}
+
+declare ppc_fp128 @copysignl(ppc_fp128, ppc_fp128) #0
+
+; CHECK: @foo
+; CHECK-NOT: mtctr
+
diff --git a/test/CodeGen/PowerPC/ctrloop-le.ll b/test/CodeGen/PowerPC/ctrloop-le.ll
index 21a6fab..7b8185e 100644
--- a/test/CodeGen/PowerPC/ctrloop-le.ll
+++ b/test/CodeGen/PowerPC/ctrloop-le.ll
@@ -32,8 +32,7 @@ for.end: ; preds = %for.body, %entry
; CHECK: test_pos2_ir_sle
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
; a < b
define void @test_pos2_ir_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
entry:
@@ -62,8 +61,7 @@ for.end: ; preds = %for.body, %entry
; CHECK: test_pos4_ir_sle
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
; a < b
define void @test_pos4_ir_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
entry:
@@ -92,8 +90,7 @@ for.end: ; preds = %for.body, %entry
; CHECK: test_pos8_ir_sle
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
; a < b
define void @test_pos8_ir_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
entry:
@@ -122,8 +119,7 @@ for.end: ; preds = %for.body, %entry
; CHECK: test_pos16_ir_sle
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
; a < b
define void @test_pos16_ir_sle(i8* nocapture %p, i32 %a, i32 %b) nounwind {
entry:
@@ -443,4 +439,3 @@ for.body: ; preds = %for.body.lr.ph, %fo
for.end: ; preds = %for.body, %entry
ret void
}
-
diff --git a/test/CodeGen/PowerPC/ctrloop-lt.ll b/test/CodeGen/PowerPC/ctrloop-lt.ll
index 448716d..eaab61a 100644
--- a/test/CodeGen/PowerPC/ctrloop-lt.ll
+++ b/test/CodeGen/PowerPC/ctrloop-lt.ll
@@ -33,7 +33,7 @@ for.end: ; preds = %for.body, %entry
; CHECK: test_pos2_ir_slt
; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
; a < b
define void @test_pos2_ir_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind {
entry:
@@ -63,7 +63,7 @@ for.end: ; preds = %for.body, %entry
; CHECK: test_pos4_ir_slt
; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
; a < b
define void @test_pos4_ir_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind {
entry:
@@ -92,8 +92,7 @@ for.end: ; preds = %for.body, %entry
; CHECK: test_pos8_ir_slt
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
; a < b
define void @test_pos8_ir_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind {
entry:
@@ -122,8 +121,7 @@ for.end: ; preds = %for.body, %entry
; CHECK: test_pos16_ir_slt
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
; a < b
define void @test_pos16_ir_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind {
entry:
@@ -326,8 +324,7 @@ for.end: ; preds = %for.body, %entry
; CHECK: test_pos2_rr_slt
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
; a < b
define void @test_pos2_rr_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind {
entry:
@@ -356,8 +353,7 @@ for.end: ; preds = %for.body, %entry
; CHECK: test_pos4_rr_slt
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
; a < b
define void @test_pos4_rr_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind {
entry:
@@ -386,8 +382,7 @@ for.end: ; preds = %for.body, %entry
; CHECK: test_pos8_rr_slt
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
; a < b
define void @test_pos8_rr_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind {
entry:
@@ -416,8 +411,7 @@ for.end: ; preds = %for.body, %entry
; CHECK: test_pos16_rr_slt
-; FIXME: Support this loop!
-; CHECK-NOT: bdnz
+; CHECK: bdnz
; a < b
define void @test_pos16_rr_slt(i8* nocapture %p, i32 %a, i32 %b) nounwind {
entry:
@@ -442,4 +436,3 @@ for.body: ; preds = %for.body.lr.ph, %fo
for.end: ; preds = %for.body, %entry
ret void
}
-
diff --git a/test/CodeGen/PowerPC/dbg.ll b/test/CodeGen/PowerPC/dbg.ll
index 30fe19e..cb93dec 100644
--- a/test/CodeGen/PowerPC/dbg.ll
+++ b/test/CodeGen/PowerPC/dbg.ll
@@ -15,13 +15,14 @@ entry:
declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!22}
!0 = metadata !{i32 720913, metadata !21, i32 12, metadata !"clang version 3.1", i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1, metadata !"", metadata !""} ; [ DW_TAG_compile_unit ]
!1 = metadata !{i32 0}
!3 = metadata !{metadata !5}
!5 = metadata !{i32 720942, metadata !21, null, metadata !"main", metadata !"main", metadata !"", i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i8**)* @main, null, null, metadata !13, i32 0} ; [ DW_TAG_subprogram ]
!6 = metadata !{i32 720937, metadata !21} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!7 = metadata !{i32 720917, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!8 = metadata !{metadata !9, metadata !9, metadata !10}
!9 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
!10 = metadata !{i32 720911, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
@@ -36,3 +37,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!19 = metadata !{i32 2, i32 3, metadata !20, null}
!20 = metadata !{i32 720907, metadata !21, metadata !5, i32 1, i32 34, i32 0} ; [ DW_TAG_lexical_block ]
!21 = metadata !{metadata !"dbg.c", metadata !"/src"}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/PowerPC/dyn-alloca-aligned.ll b/test/CodeGen/PowerPC/dyn-alloca-aligned.ll
index a18ada7..a5d45b8 100644
--- a/test/CodeGen/PowerPC/dyn-alloca-aligned.ll
+++ b/test/CodeGen/PowerPC/dyn-alloca-aligned.ll
@@ -12,12 +12,12 @@ entry:
%vla = alloca i32, i64 %0, align 128
%vla1 = alloca i32, i64 %0, align 128
%a2 = getelementptr inbounds %struct.s* %a, i64 0, i32 0
- %1 = load i32* %a2, align 4, !tbaa !0
- store i32 %1, i32* %vla1, align 128, !tbaa !0
+ %1 = load i32* %a2, align 4
+ store i32 %1, i32* %vla1, align 128
%b = getelementptr inbounds %struct.s* %a, i64 0, i32 1
- %2 = load i32* %b, align 4, !tbaa !0
+ %2 = load i32* %b, align 4
%arrayidx3 = getelementptr inbounds i32* %vla1, i64 1
- store i32 %2, i32* %arrayidx3, align 4, !tbaa !0
+ store i32 %2, i32* %arrayidx3, align 4
call void @bar(i32* %vla1, i32* %vla) #0
ret void
@@ -33,7 +33,3 @@ entry:
}
attributes #0 = { nounwind }
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/PowerPC/fast-isel-GEP-coalesce.ll b/test/CodeGen/PowerPC/fast-isel-GEP-coalesce.ll
new file mode 100644
index 0000000..7bdda04
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-GEP-coalesce.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+%struct.A = type { i32, [2 x [2 x i32]], i8, [3 x [3 x [3 x i32]]] }
+%struct.B = type { i32, [2 x [2 x [2 x %struct.A]]] }
+
+@arr = common global [2 x [2 x [2 x [2 x [2 x i32]]]]] zeroinitializer, align 4
+@A = common global [3 x [3 x %struct.A]] zeroinitializer, align 4
+@B = common global [2 x [2 x [2 x %struct.B]]] zeroinitializer, align 4
+
+define i32* @t1() nounwind {
+entry:
+; ELF64: t1
+ %addr = alloca i32*, align 4
+ store i32* getelementptr inbounds ([2 x [2 x [2 x [2 x [2 x i32]]]]]* @arr, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1), i32** %addr, align 4
+; ELF64: addi {{[0-9]+}}, {{[0-9]+}}, 124
+ %0 = load i32** %addr, align 4
+ ret i32* %0
+}
+
+define i32* @t2() nounwind {
+entry:
+; ELF64: t2
+ %addr = alloca i32*, align 4
+ store i32* getelementptr inbounds ([3 x [3 x %struct.A]]* @A, i32 0, i32 2, i32 2, i32 3, i32 1, i32 2, i32 2), i32** %addr, align 4
+; ELF64: addi {{[0-9]+}}, {{[0-9]+}}, 1148
+ %0 = load i32** %addr, align 4
+ ret i32* %0
+}
+
+define i32* @t3() nounwind {
+entry:
+; ELF64: t3
+ %addr = alloca i32*, align 4
+ store i32* getelementptr inbounds ([3 x [3 x %struct.A]]* @A, i32 0, i32 0, i32 1, i32 1, i32 0, i32 1), i32** %addr, align 4
+; ELF64: addi {{[0-9]+}}, {{[0-9]+}}, 140
+ %0 = load i32** %addr, align 4
+ ret i32* %0
+}
+
+define i32* @t4() nounwind {
+entry:
+; ELF64: t4
+ %addr = alloca i32*, align 4
+ store i32* getelementptr inbounds ([2 x [2 x [2 x %struct.B]]]* @B, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 3, i32 1, i32 2, i32 1), i32** %addr, align 4
+; ELF64: addi {{[0-9]+}}, {{[0-9]+}}, 1284
+ %0 = load i32** %addr, align 4
+ ret i32* %0
+}
diff --git a/test/CodeGen/PowerPC/fast-isel-binary.ll b/test/CodeGen/PowerPC/fast-isel-binary.ll
new file mode 100644
index 0000000..43a6cd0
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-binary.ll
@@ -0,0 +1,137 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+; Test add with non-legal types
+
+define void @add_i8(i8 %a, i8 %b) nounwind ssp {
+entry:
+; ELF64: add_i8
+ %a.addr = alloca i8, align 4
+ %0 = add i8 %a, %b
+; ELF64: add
+ store i8 %0, i8* %a.addr, align 4
+ ret void
+}
+
+define void @add_i8_imm(i8 %a) nounwind ssp {
+entry:
+; ELF64: add_i8_imm
+ %a.addr = alloca i8, align 4
+ %0 = add i8 %a, 22;
+; ELF64: addi
+ store i8 %0, i8* %a.addr, align 4
+ ret void
+}
+
+define void @add_i16(i16 %a, i16 %b) nounwind ssp {
+entry:
+; ELF64: add_i16
+ %a.addr = alloca i16, align 4
+ %0 = add i16 %a, %b
+; ELF64: add
+ store i16 %0, i16* %a.addr, align 4
+ ret void
+}
+
+define void @add_i16_imm(i16 %a, i16 %b) nounwind ssp {
+entry:
+; ELF64: add_i16_imm
+ %a.addr = alloca i16, align 4
+ %0 = add i16 %a, 243;
+; ELF64: addi
+ store i16 %0, i16* %a.addr, align 4
+ ret void
+}
+
+; Test or with non-legal types
+
+define void @or_i8(i8 %a, i8 %b) nounwind ssp {
+entry:
+; ELF64: or_i8
+ %a.addr = alloca i8, align 4
+ %0 = or i8 %a, %b
+; ELF64: or
+ store i8 %0, i8* %a.addr, align 4
+ ret void
+}
+
+define void @or_i8_imm(i8 %a) nounwind ssp {
+entry:
+; ELF64: or_i8_imm
+ %a.addr = alloca i8, align 4
+ %0 = or i8 %a, -13;
+; ELF64: ori
+ store i8 %0, i8* %a.addr, align 4
+ ret void
+}
+
+define void @or_i16(i16 %a, i16 %b) nounwind ssp {
+entry:
+; ELF64: or_i16
+ %a.addr = alloca i16, align 4
+ %0 = or i16 %a, %b
+; ELF64: or
+ store i16 %0, i16* %a.addr, align 4
+ ret void
+}
+
+define void @or_i16_imm(i16 %a) nounwind ssp {
+entry:
+; ELF64: or_i16_imm
+ %a.addr = alloca i16, align 4
+ %0 = or i16 %a, 273;
+; ELF64: ori
+ store i16 %0, i16* %a.addr, align 4
+ ret void
+}
+
+; Test sub with non-legal types
+
+define void @sub_i8(i8 %a, i8 %b) nounwind ssp {
+entry:
+; ELF64: sub_i8
+ %a.addr = alloca i8, align 4
+ %0 = sub i8 %a, %b
+; ELF64: subf
+ store i8 %0, i8* %a.addr, align 4
+ ret void
+}
+
+define void @sub_i8_imm(i8 %a) nounwind ssp {
+entry:
+; ELF64: sub_i8_imm
+ %a.addr = alloca i8, align 4
+ %0 = sub i8 %a, 22;
+; ELF64: addi
+ store i8 %0, i8* %a.addr, align 4
+ ret void
+}
+
+define void @sub_i16(i16 %a, i16 %b) nounwind ssp {
+entry:
+; ELF64: sub_i16
+ %a.addr = alloca i16, align 4
+ %0 = sub i16 %a, %b
+; ELF64: subf
+ store i16 %0, i16* %a.addr, align 4
+ ret void
+}
+
+define void @sub_i16_imm(i16 %a) nounwind ssp {
+entry:
+; ELF64: sub_i16_imm
+ %a.addr = alloca i16, align 4
+ %0 = sub i16 %a, 247;
+; ELF64: addi
+ store i16 %0, i16* %a.addr, align 4
+ ret void
+}
+
+define void @sub_i16_badimm(i16 %a) nounwind ssp {
+entry:
+; ELF64: sub_i16_badimm
+ %a.addr = alloca i16, align 4
+ %0 = sub i16 %a, -32768;
+; ELF64: subf
+ store i16 %0, i16* %a.addr, align 4
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/fast-isel-br-const.ll b/test/CodeGen/PowerPC/fast-isel-br-const.ll
new file mode 100644
index 0000000..2cfb8a2
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-br-const.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+define i32 @t1(i32 %a, i32 %b) nounwind uwtable ssp {
+entry:
+; ELF64: t1
+ %x = add i32 %a, %b
+ br i1 1, label %if.then, label %if.else
+; ELF64-NOT: b {{\.?}}LBB0_1
+
+if.then: ; preds = %entry
+ call void @foo1()
+ br label %if.end7
+
+if.else: ; preds = %entry
+ br i1 0, label %if.then2, label %if.else3
+; ELF64: b {{\.?}}LBB0_4
+
+if.then2: ; preds = %if.else
+ call void @foo2()
+ br label %if.end6
+
+if.else3: ; preds = %if.else
+ %y = sub i32 %a, %b
+ br i1 1, label %if.then5, label %if.end
+; ELF64-NOT: b {{\.?}}LBB0_5
+
+if.then5: ; preds = %if.else3
+ call void @foo1()
+ br label %if.end
+
+if.end: ; preds = %if.then5, %if.else3
+ br label %if.end6
+
+if.end6: ; preds = %if.end, %if.then2
+ br label %if.end7
+
+if.end7: ; preds = %if.end6, %if.then
+ ret i32 0
+}
+
+declare void @foo1()
+
+declare void @foo2()
diff --git a/test/CodeGen/PowerPC/fast-isel-call.ll b/test/CodeGen/PowerPC/fast-isel-call.ll
new file mode 100644
index 0000000..33a8ba9
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-call.ll
@@ -0,0 +1,132 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+define i32 @t1(i8 signext %a) nounwind {
+ %1 = sext i8 %a to i32
+ ret i32 %1
+}
+
+define i32 @t2(i8 zeroext %a) nounwind {
+ %1 = zext i8 %a to i32
+ ret i32 %1
+}
+
+define i32 @t3(i16 signext %a) nounwind {
+ %1 = sext i16 %a to i32
+ ret i32 %1
+}
+
+define i32 @t4(i16 zeroext %a) nounwind {
+ %1 = zext i16 %a to i32
+ ret i32 %1
+}
+
+define void @foo(i8 %a, i16 %b) nounwind {
+; ELF64: foo
+ %1 = call i32 @t1(i8 signext %a)
+; ELF64: extsb
+ %2 = call i32 @t2(i8 zeroext %a)
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
+ %3 = call i32 @t3(i16 signext %b)
+; ELF64: extsh
+ %4 = call i32 @t4(i16 zeroext %b)
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
+
+;; A few tests to check materialization
+ %5 = call i32 @t2(i8 zeroext 255)
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
+ %6 = call i32 @t4(i16 zeroext 65535)
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
+ ret void
+}
+
+define void @foo2() nounwind {
+ %1 = call signext i16 @t5()
+ %2 = call zeroext i16 @t6()
+ %3 = call signext i8 @t7()
+ %4 = call zeroext i8 @t8()
+ ret void
+}
+
+declare signext i16 @t5();
+declare zeroext i16 @t6();
+declare signext i8 @t7();
+declare zeroext i8 @t8();
+
+define i32 @t10(i32 %argc, i8** nocapture %argv) {
+entry:
+; ELF64: t10
+ %call = call i32 @bar(i8 zeroext 0, i8 zeroext -8, i8 zeroext -69, i8 zeroext 28, i8 zeroext 40, i8 zeroext -70)
+; ELF64: li 3, 0
+; ELF64: li 4, 248
+; ELF64: li 5, 187
+; ELF64: li 6, 28
+; ELF64: li 7, 40
+; ELF64: li 8, 186
+; ELF64: rldicl 3, 3, 0, 56
+; ELF64: rldicl 4, 4, 0, 56
+; ELF64: rldicl 5, 5, 0, 56
+; ELF64: rldicl 6, 6, 0, 56
+; ELF64: rldicl 7, 7, 0, 56
+; ELF64: rldicl 8, 8, 0, 56
+ ret i32 0
+}
+
+declare i32 @bar(i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext, i8 zeroext)
+
+define i32 @bar0(i32 %i) nounwind {
+ ret i32 0
+}
+
+; Function pointers are not yet implemented.
+;define void @foo3() uwtable {
+; %fptr = alloca i32 (i32)*, align 8
+; store i32 (i32)* @bar0, i32 (i32)** %fptr, align 8
+; %1 = load i32 (i32)** %fptr, align 8
+; %call = call i32 %1(i32 0)
+; ret void
+;}
+
+; Intrinsic calls not yet implemented, and udiv isn't one for PPC anyway.
+;define i32 @LibCall(i32 %a, i32 %b) {
+;entry:
+; %tmp1 = udiv i32 %a, %b ; <i32> [#uses=1]
+; ret i32 %tmp1
+;}
+
+declare void @float_foo(float %f) ssp
+
+define void @float_const() ssp {
+entry:
+; ELF64: float_const
+ call void @float_foo(float 0x401C666660000000)
+; ELF64: addis [[REG:[0-9]+]], 2, .LCPI[[SUF:[0-9_]+]]@toc@ha
+; ELF64: lfs 1, .LCPI[[SUF]]@toc@l([[REG]])
+ ret void
+}
+
+define void @float_reg(float %dummy, float %f) ssp {
+entry:
+; ELF64: float_reg
+ call void @float_foo(float %f)
+; ELF64: fmr 1, 2
+ ret void
+}
+
+declare void @double_foo(double %d) ssp
+
+define void @double_const() ssp {
+entry:
+; ELF64: double_const
+ call void @double_foo(double 0x1397723CCABD0000401C666660000000)
+; ELF64: addis [[REG2:[0-9]+]], 2, .LCPI[[SUF2:[0-9_]+]]@toc@ha
+; ELF64: lfd 1, .LCPI[[SUF2]]@toc@l([[REG2]])
+ ret void
+}
+
+define void @double_reg(double %dummy, double %d) ssp {
+entry:
+; ELF64: double_reg
+ call void @double_foo(double %d)
+; ELF64: fmr 1, 2
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/fast-isel-cmp-imm.ll b/test/CodeGen/PowerPC/fast-isel-cmp-imm.ll
new file mode 100644
index 0000000..33f7a79
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-cmp-imm.ll
@@ -0,0 +1,289 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+define void @t1a(float %a) uwtable ssp {
+entry:
+; ELF64: t1a
+ %cmp = fcmp oeq float %a, 0.000000e+00
+; ELF64: addis
+; ELF64: lfs
+; ELF64: fcmpu
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ call void @foo()
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+declare void @foo()
+
+define void @t1b(float %a) uwtable ssp {
+entry:
+; ELF64: t1b
+ %cmp = fcmp oeq float %a, -0.000000e+00
+; ELF64: addis
+; ELF64: lfs
+; ELF64: fcmpu
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ call void @foo()
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+define void @t2a(double %a) uwtable ssp {
+entry:
+; ELF64: t2a
+ %cmp = fcmp oeq double %a, 0.000000e+00
+; ELF64: addis
+; ELF64: lfd
+; ELF64: fcmpu
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ call void @foo()
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+define void @t2b(double %a) uwtable ssp {
+entry:
+; ELF64: t2b
+ %cmp = fcmp oeq double %a, -0.000000e+00
+; ELF64: addis
+; ELF64: lfd
+; ELF64: fcmpu
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ call void @foo()
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+define void @t4(i8 signext %a) uwtable ssp {
+entry:
+; ELF64: t4
+ %cmp = icmp eq i8 %a, -1
+; ELF64: extsb
+; ELF64: cmpwi
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ call void @foo()
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+define void @t5(i8 zeroext %a) uwtable ssp {
+entry:
+; ELF64: t5
+ %cmp = icmp eq i8 %a, 1
+; ELF64: extsb
+; ELF64: cmpwi
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ call void @foo()
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+define void @t6(i16 signext %a) uwtable ssp {
+entry:
+; ELF64: t6
+ %cmp = icmp eq i16 %a, -1
+; ELF64: extsh
+; ELF64: cmpwi
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ call void @foo()
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+define void @t7(i16 zeroext %a) uwtable ssp {
+entry:
+; ELF64: t7
+ %cmp = icmp eq i16 %a, 1
+; ELF64: extsh
+; ELF64: cmpwi
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ call void @foo()
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+define void @t8(i32 %a) uwtable ssp {
+entry:
+; ELF64: t8
+ %cmp = icmp eq i32 %a, -1
+; ELF64: cmpwi
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ call void @foo()
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+define void @t9(i32 %a) uwtable ssp {
+entry:
+; ELF64: t9
+ %cmp = icmp eq i32 %a, 1
+; ELF64: cmpwi
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ call void @foo()
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+define void @t10(i32 %a) uwtable ssp {
+entry:
+; ELF64: t10
+ %cmp = icmp eq i32 %a, 384
+; ELF64: cmpwi
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ call void @foo()
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+define void @t11(i32 %a) uwtable ssp {
+entry:
+; ELF64: t11
+ %cmp = icmp eq i32 %a, 4096
+; ELF64: cmpwi
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ call void @foo()
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+define void @t12(i8 %a) uwtable ssp {
+entry:
+; ELF64: t12
+ %cmp = icmp ugt i8 %a, -113
+; ELF64: rlwinm
+; ELF64: cmplwi
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ call void @foo()
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+define void @t13() nounwind ssp {
+entry:
+; ELF64: t13
+ %cmp = icmp slt i32 -123, -2147483648
+; ELF64: li
+; ELF64: lis
+; ELF64: cmpw
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ ret void
+
+if.end: ; preds = %entry
+ ret void
+}
+
+define void @t14(i64 %a) uwtable ssp {
+entry:
+; ELF64: t14
+ %cmp = icmp eq i64 %a, -1
+; ELF64: cmpdi
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ call void @foo()
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+define void @t15(i64 %a) uwtable ssp {
+entry:
+; ELF64: t15
+ %cmp = icmp eq i64 %a, 1
+; ELF64: cmpdi
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ call void @foo()
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+define void @t16(i64 %a) uwtable ssp {
+entry:
+; ELF64: t16
+ %cmp = icmp eq i64 %a, 384
+; ELF64: cmpdi
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ call void @foo()
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+define void @t17(i64 %a) uwtable ssp {
+entry:
+; ELF64: t17
+ %cmp = icmp eq i64 %a, 32768
+; Extra operand so we don't match on cmpdi.
+; ELF64: cmpd {{[0-9]+}}
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ call void @foo()
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
diff --git a/test/CodeGen/PowerPC/fast-isel-conversion.ll b/test/CodeGen/PowerPC/fast-isel-conversion.ll
new file mode 100644
index 0000000..a31c312
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-conversion.ll
@@ -0,0 +1,305 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+; Test sitofp
+
+define void @sitofp_single_i64(i64 %a, float %b) nounwind ssp {
+entry:
+; ELF64: sitofp_single_i64
+ %b.addr = alloca float, align 4
+ %conv = sitofp i64 %a to float
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfids
+ store float %conv, float* %b.addr, align 4
+ ret void
+}
+
+define void @sitofp_single_i32(i32 %a, float %b) nounwind ssp {
+entry:
+; ELF64: sitofp_single_i32
+ %b.addr = alloca float, align 4
+ %conv = sitofp i32 %a to float
+; ELF64: std
+; ELF64: lfiwax
+; ELF64: fcfids
+ store float %conv, float* %b.addr, align 4
+ ret void
+}
+
+define void @sitofp_single_i16(i16 %a, float %b) nounwind ssp {
+entry:
+; ELF64: sitofp_single_i16
+ %b.addr = alloca float, align 4
+ %conv = sitofp i16 %a to float
+; ELF64: extsh
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfids
+ store float %conv, float* %b.addr, align 4
+ ret void
+}
+
+define void @sitofp_single_i8(i8 %a) nounwind ssp {
+entry:
+; ELF64: sitofp_single_i8
+ %b.addr = alloca float, align 4
+ %conv = sitofp i8 %a to float
+; ELF64: extsb
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfids
+ store float %conv, float* %b.addr, align 4
+ ret void
+}
+
+define void @sitofp_double_i32(i32 %a, double %b) nounwind ssp {
+entry:
+; ELF64: sitofp_double_i32
+ %b.addr = alloca double, align 8
+ %conv = sitofp i32 %a to double
+; ELF64: std
+; ELF64: lfiwax
+; ELF64: fcfid
+ store double %conv, double* %b.addr, align 8
+ ret void
+}
+
+define void @sitofp_double_i64(i64 %a, double %b) nounwind ssp {
+entry:
+; ELF64: sitofp_double_i64
+ %b.addr = alloca double, align 8
+ %conv = sitofp i64 %a to double
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfid
+ store double %conv, double* %b.addr, align 8
+ ret void
+}
+
+define void @sitofp_double_i16(i16 %a, double %b) nounwind ssp {
+entry:
+; ELF64: sitofp_double_i16
+ %b.addr = alloca double, align 8
+ %conv = sitofp i16 %a to double
+; ELF64: extsh
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfid
+ store double %conv, double* %b.addr, align 8
+ ret void
+}
+
+define void @sitofp_double_i8(i8 %a, double %b) nounwind ssp {
+entry:
+; ELF64: sitofp_double_i8
+ %b.addr = alloca double, align 8
+ %conv = sitofp i8 %a to double
+; ELF64: extsb
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfid
+ store double %conv, double* %b.addr, align 8
+ ret void
+}
+
+; Test uitofp
+
+define void @uitofp_single_i64(i64 %a, float %b) nounwind ssp {
+entry:
+; ELF64: uitofp_single_i64
+ %b.addr = alloca float, align 4
+ %conv = uitofp i64 %a to float
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfidus
+ store float %conv, float* %b.addr, align 4
+ ret void
+}
+
+define void @uitofp_single_i32(i32 %a, float %b) nounwind ssp {
+entry:
+; ELF64: uitofp_single_i32
+ %b.addr = alloca float, align 4
+ %conv = uitofp i32 %a to float
+; ELF64: std
+; ELF64: lfiwzx
+; ELF64: fcfidus
+ store float %conv, float* %b.addr, align 4
+ ret void
+}
+
+define void @uitofp_single_i16(i16 %a, float %b) nounwind ssp {
+entry:
+; ELF64: uitofp_single_i16
+ %b.addr = alloca float, align 4
+ %conv = uitofp i16 %a to float
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfidus
+ store float %conv, float* %b.addr, align 4
+ ret void
+}
+
+define void @uitofp_single_i8(i8 %a) nounwind ssp {
+entry:
+; ELF64: uitofp_single_i8
+ %b.addr = alloca float, align 4
+ %conv = uitofp i8 %a to float
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfidus
+ store float %conv, float* %b.addr, align 4
+ ret void
+}
+
+define void @uitofp_double_i64(i64 %a, double %b) nounwind ssp {
+entry:
+; ELF64: uitofp_double_i64
+ %b.addr = alloca double, align 8
+ %conv = uitofp i64 %a to double
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfidu
+ store double %conv, double* %b.addr, align 8
+ ret void
+}
+
+define void @uitofp_double_i32(i32 %a, double %b) nounwind ssp {
+entry:
+; ELF64: uitofp_double_i32
+ %b.addr = alloca double, align 8
+ %conv = uitofp i32 %a to double
+; ELF64: std
+; ELF64: lfiwzx
+; ELF64: fcfidu
+ store double %conv, double* %b.addr, align 8
+ ret void
+}
+
+define void @uitofp_double_i16(i16 %a, double %b) nounwind ssp {
+entry:
+; ELF64: uitofp_double_i16
+ %b.addr = alloca double, align 8
+ %conv = uitofp i16 %a to double
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfidu
+ store double %conv, double* %b.addr, align 8
+ ret void
+}
+
+define void @uitofp_double_i8(i8 %a, double %b) nounwind ssp {
+entry:
+; ELF64: uitofp_double_i8
+ %b.addr = alloca double, align 8
+ %conv = uitofp i8 %a to double
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
+; ELF64: std
+; ELF64: lfd
+; ELF64: fcfidu
+ store double %conv, double* %b.addr, align 8
+ ret void
+}
+
+; Test fptosi
+
+define void @fptosi_float_i32(float %a) nounwind ssp {
+entry:
+; ELF64: fptosi_float_i32
+ %b.addr = alloca i32, align 4
+ %conv = fptosi float %a to i32
+; ELF64: fctiwz
+; ELF64: stfd
+; ELF64: lwa
+ store i32 %conv, i32* %b.addr, align 4
+ ret void
+}
+
+define void @fptosi_float_i64(float %a) nounwind ssp {
+entry:
+; ELF64: fptosi_float_i64
+ %b.addr = alloca i64, align 4
+ %conv = fptosi float %a to i64
+; ELF64: fctidz
+; ELF64: stfd
+; ELF64: ld
+ store i64 %conv, i64* %b.addr, align 4
+ ret void
+}
+
+define void @fptosi_double_i32(double %a) nounwind ssp {
+entry:
+; ELF64: fptosi_double_i32
+ %b.addr = alloca i32, align 8
+ %conv = fptosi double %a to i32
+; ELF64: fctiwz
+; ELF64: stfd
+; ELF64: lwa
+ store i32 %conv, i32* %b.addr, align 8
+ ret void
+}
+
+define void @fptosi_double_i64(double %a) nounwind ssp {
+entry:
+; ELF64: fptosi_double_i64
+ %b.addr = alloca i64, align 8
+ %conv = fptosi double %a to i64
+; ELF64: fctidz
+; ELF64: stfd
+; ELF64: ld
+ store i64 %conv, i64* %b.addr, align 8
+ ret void
+}
+
+; Test fptoui
+
+define void @fptoui_float_i32(float %a) nounwind ssp {
+entry:
+; ELF64: fptoui_float_i32
+ %b.addr = alloca i32, align 4
+ %conv = fptoui float %a to i32
+; ELF64: fctiwuz
+; ELF64: stfd
+; ELF64: lwz
+ store i32 %conv, i32* %b.addr, align 4
+ ret void
+}
+
+define void @fptoui_float_i64(float %a) nounwind ssp {
+entry:
+; ELF64: fptoui_float_i64
+ %b.addr = alloca i64, align 4
+ %conv = fptoui float %a to i64
+; ELF64: fctiduz
+; ELF64: stfd
+; ELF64: ld
+ store i64 %conv, i64* %b.addr, align 4
+ ret void
+}
+
+define void @fptoui_double_i32(double %a) nounwind ssp {
+entry:
+; ELF64: fptoui_double_i32
+ %b.addr = alloca i32, align 8
+ %conv = fptoui double %a to i32
+; ELF64: fctiwuz
+; ELF64: stfd
+; ELF64: lwz
+ store i32 %conv, i32* %b.addr, align 8
+ ret void
+}
+
+define void @fptoui_double_i64(double %a) nounwind ssp {
+entry:
+; ELF64: fptoui_double_i64
+ %b.addr = alloca i64, align 8
+ %conv = fptoui double %a to i64
+; ELF64: fctiduz
+; ELF64: stfd
+; ELF64: ld
+ store i64 %conv, i64* %b.addr, align 8
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/fast-isel-crash.ll b/test/CodeGen/PowerPC/fast-isel-crash.ll
new file mode 100644
index 0000000..1813fc9
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-crash.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7
+
+; Ensure this doesn't crash.
+
+%union.anon = type { <16 x i32> }
+
+@__md0 = external global [137 x i8]
+
+define internal void @stretch(<4 x i8> addrspace(1)* %src, <4 x i8> addrspace(1)* %dst, i32 %width, i32 %height, i32 %iLS, i32 %oLS, <2 x float> %c, <4 x float> %param) nounwind {
+entry:
+ ret void
+}
+
+define internal i32 @_Z13get_global_idj(i32 %dim) nounwind ssp {
+entry:
+ ret i32 undef
+}
+
+define void @wrap(i8 addrspace(1)* addrspace(1)* %arglist, i32 addrspace(1)* %gtid) nounwind ssp {
+entry:
+ call void @stretch(<4 x i8> addrspace(1)* undef, <4 x i8> addrspace(1)* undef, i32 undef, i32 undef, i32 undef, i32 undef, <2 x float> undef, <4 x float> undef)
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/fast-isel-ext.ll b/test/CodeGen/PowerPC/fast-isel-ext.ll
new file mode 100644
index 0000000..753305a
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-ext.ll
@@ -0,0 +1,75 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+; zext
+
+define i32 @zext_8_32(i8 %a) nounwind ssp {
+; ELF64: zext_8_32
+ %r = zext i8 %a to i32
+; ELF64: rlwinm {{[0-9]+}}, {{[0-9]+}}, 0, 24, 31
+ ret i32 %r
+}
+
+define i32 @zext_16_32(i16 %a) nounwind ssp {
+; ELF64: zext_16_32
+ %r = zext i16 %a to i32
+; ELF64: rlwinm {{[0-9]+}}, {{[0-9]+}}, 0, 16, 31
+ ret i32 %r
+}
+
+define i64 @zext_8_64(i8 %a) nounwind ssp {
+; ELF64: zext_8_64
+ %r = zext i8 %a to i64
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
+ ret i64 %r
+}
+
+define i64 @zext_16_64(i16 %a) nounwind ssp {
+; ELF64: zext_16_64
+ %r = zext i16 %a to i64
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
+ ret i64 %r
+}
+
+define i64 @zext_32_64(i32 %a) nounwind ssp {
+; ELF64: zext_32_64
+ %r = zext i32 %a to i64
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 32
+ ret i64 %r
+}
+
+; sext
+
+define i32 @sext_8_32(i8 %a) nounwind ssp {
+; ELF64: sext_8_32
+ %r = sext i8 %a to i32
+; ELF64: extsb
+ ret i32 %r
+}
+
+define i32 @sext_16_32(i16 %a) nounwind ssp {
+; ELF64: sext_16_32
+ %r = sext i16 %a to i32
+; ELF64: extsh
+ ret i32 %r
+}
+
+define i64 @sext_8_64(i8 %a) nounwind ssp {
+; ELF64: sext_8_64
+ %r = sext i8 %a to i64
+; ELF64: extsb
+ ret i64 %r
+}
+
+define i64 @sext_16_64(i16 %a) nounwind ssp {
+; ELF64: sext_16_64
+ %r = sext i16 %a to i64
+; ELF64: extsh
+ ret i64 %r
+}
+
+define i64 @sext_32_64(i32 %a) nounwind ssp {
+; ELF64: sext_32_64
+ %r = sext i32 %a to i64
+; ELF64: extsw
+ ret i64 %r
+}
diff --git a/test/CodeGen/PowerPC/fast-isel-fold.ll b/test/CodeGen/PowerPC/fast-isel-fold.ll
new file mode 100644
index 0000000..4de345f
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-fold.ll
@@ -0,0 +1,129 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+@a = global i8 1, align 1
+@b = global i16 2, align 2
+@c = global i32 4, align 4
+
+define void @t1() nounwind uwtable ssp {
+; ELF64: t1
+ %1 = load i8* @a, align 1
+ call void @foo1(i8 zeroext %1)
+; ELF64: lbz
+; ELF64-NOT: rldicl
+; ELF64-NOT: rlwinm
+ ret void
+}
+
+define void @t2() nounwind uwtable ssp {
+; ELF64: t2
+ %1 = load i16* @b, align 2
+ call void @foo2(i16 zeroext %1)
+; ELF64: lhz
+; ELF64-NOT: rldicl
+; ELF64-NOT: rlwinm
+ ret void
+}
+
+define void @t2a() nounwind uwtable ssp {
+; ELF64: t2a
+ %1 = load i32* @c, align 4
+ call void @foo3(i32 zeroext %1)
+; ELF64: lwz
+; ELF64-NOT: rldicl
+; ELF64-NOT: rlwinm
+ ret void
+}
+
+declare void @foo1(i8 zeroext)
+declare void @foo2(i16 zeroext)
+declare void @foo3(i32 zeroext)
+
+define i32 @t3() nounwind uwtable ssp {
+; ELF64: t3
+ %1 = load i8* @a, align 1
+ %2 = zext i8 %1 to i32
+; ELF64: lbz
+; ELF64-NOT: rlwinm
+ ret i32 %2
+}
+
+define i32 @t4() nounwind uwtable ssp {
+; ELF64: t4
+ %1 = load i16* @b, align 2
+ %2 = zext i16 %1 to i32
+; ELF64: lhz
+; ELF64-NOT: rlwinm
+ ret i32 %2
+}
+
+define i32 @t5() nounwind uwtable ssp {
+; ELF64: t5
+ %1 = load i16* @b, align 2
+ %2 = sext i16 %1 to i32
+; ELF64: lha
+; ELF64-NOT: rlwinm
+ ret i32 %2
+}
+
+define i32 @t6() nounwind uwtable ssp {
+; ELF64: t6
+ %1 = load i8* @a, align 2
+ %2 = sext i8 %1 to i32
+; ELF64: lbz
+; ELF64-NOT: rlwinm
+ ret i32 %2
+}
+
+define i64 @t7() nounwind uwtable ssp {
+; ELF64: t7
+ %1 = load i8* @a, align 1
+ %2 = zext i8 %1 to i64
+; ELF64: lbz
+; ELF64-NOT: rldicl
+ ret i64 %2
+}
+
+define i64 @t8() nounwind uwtable ssp {
+; ELF64: t8
+ %1 = load i16* @b, align 2
+ %2 = zext i16 %1 to i64
+; ELF64: lhz
+; ELF64-NOT: rldicl
+ ret i64 %2
+}
+
+define i64 @t9() nounwind uwtable ssp {
+; ELF64: t9
+ %1 = load i16* @b, align 2
+ %2 = sext i16 %1 to i64
+; ELF64: lha
+; ELF64-NOT: extsh
+ ret i64 %2
+}
+
+define i64 @t10() nounwind uwtable ssp {
+; ELF64: t10
+ %1 = load i8* @a, align 2
+ %2 = sext i8 %1 to i64
+; ELF64: lbz
+; ELF64: extsb
+ ret i64 %2
+}
+
+define i64 @t11() nounwind uwtable ssp {
+; ELF64: t11
+ %1 = load i32* @c, align 4
+ %2 = zext i32 %1 to i64
+; ELF64: lwz
+; ELF64-NOT: rldicl
+ ret i64 %2
+}
+
+define i64 @t12() nounwind uwtable ssp {
+; ELF64: t12
+ %1 = load i32* @c, align 4
+ %2 = sext i32 %1 to i64
+; ELF64: lwa
+; ELF64-NOT: extsw
+ ret i64 %2
+}
diff --git a/test/CodeGen/PowerPC/fast-isel-indirectbr.ll b/test/CodeGen/PowerPC/fast-isel-indirectbr.ll
new file mode 100644
index 0000000..88ccf91
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-indirectbr.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+define void @t1(i8* %x) {
+entry:
+; ELF64: t1
+ br label %L0
+
+L0:
+ br label %L1
+
+L1:
+ indirectbr i8* %x, [ label %L0, label %L1 ]
+; ELF64: mtctr 3
+; ELF64: bctr
+}
diff --git a/test/CodeGen/PowerPC/fast-isel-load-store.ll b/test/CodeGen/PowerPC/fast-isel-load-store.ll
new file mode 100644
index 0000000..026b15f
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-load-store.ll
@@ -0,0 +1,202 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+; This test verifies that load/store instructions are properly generated,
+; and that they pass MI verification.
+
+@a = global i8 1, align 1
+@b = global i16 2, align 2
+@c = global i32 4, align 4
+@d = global i64 8, align 8
+@e = global float 1.25, align 4
+@f = global double 3.5, align 8
+
+%struct.s = type<{ i8, i32 }>
+%struct.t = type<{ i8, i64 }>
+
+@g = global %struct.s <{ i8 1, i32 2 }>, align 1
+@h = global %struct.t <{ i8 1, i64 2 }>, align 1
+
+@i = common global [8192 x i64] zeroinitializer, align 8
+
+; load
+
+define i8 @t1() nounwind uwtable ssp {
+; ELF64: t1
+ %1 = load i8* @a, align 1
+; ELF64: lbz
+ %2 = add nsw i8 %1, 1
+; ELF64: addi
+ ret i8 %2
+}
+
+define i16 @t2() nounwind uwtable ssp {
+; ELF64: t2
+ %1 = load i16* @b, align 2
+; ELF64: lhz
+ %2 = add nsw i16 %1, 1
+; ELF64: addi
+ ret i16 %2
+}
+
+define i32 @t3() nounwind uwtable ssp {
+; ELF64: t3
+ %1 = load i32* @c, align 4
+; ELF64: lwz
+ %2 = add nsw i32 %1, 1
+; ELF64: addi
+ ret i32 %2
+}
+
+define i64 @t4() nounwind uwtable ssp {
+; ELF64: t4
+ %1 = load i64* @d, align 4
+; ELF64: ld
+ %2 = add nsw i64 %1, 1
+; ELF64: addi
+ ret i64 %2
+}
+
+define float @t5() nounwind uwtable ssp {
+; ELF64: t5
+ %1 = load float* @e, align 4
+; ELF64: lfs
+ %2 = fadd float %1, 1.0
+; ELF64: fadds
+ ret float %2
+}
+
+define double @t6() nounwind uwtable ssp {
+; ELF64: t6
+ %1 = load double* @f, align 8
+; ELF64: lfd
+ %2 = fadd double %1, 1.0
+; ELF64: fadd
+ ret double %2
+}
+
+; store
+
+define void @t7(i8 %v) nounwind uwtable ssp {
+; ELF64: t7
+ %1 = add nsw i8 %v, 1
+ store i8 %1, i8* @a, align 1
+; ELF64: addis
+; ELF64: addi
+; ELF64: addi
+; ELF64: stb
+ ret void
+}
+
+define void @t8(i16 %v) nounwind uwtable ssp {
+; ELF64: t8
+ %1 = add nsw i16 %v, 1
+ store i16 %1, i16* @b, align 2
+; ELF64: addis
+; ELF64: addi
+; ELF64: addi
+; ELF64: sth
+ ret void
+}
+
+define void @t9(i32 %v) nounwind uwtable ssp {
+; ELF64: t9
+ %1 = add nsw i32 %v, 1
+ store i32 %1, i32* @c, align 4
+; ELF64: addis
+; ELF64: addi
+; ELF64: addi
+; ELF64: stw
+ ret void
+}
+
+define void @t10(i64 %v) nounwind uwtable ssp {
+; ELF64: t10
+ %1 = add nsw i64 %v, 1
+ store i64 %1, i64* @d, align 4
+; ELF64: addis
+; ELF64: addi
+; ELF64: addi
+; ELF64: std
+ ret void
+}
+
+define void @t11(float %v) nounwind uwtable ssp {
+; ELF64: t11
+ %1 = fadd float %v, 1.0
+ store float %1, float* @e, align 4
+; ELF64: fadds
+; ELF64: stfs
+ ret void
+}
+
+define void @t12(double %v) nounwind uwtable ssp {
+; ELF64: t12
+ %1 = fadd double %v, 1.0
+ store double %1, double* @f, align 8
+; ELF64: fadd
+; ELF64: stfd
+ ret void
+}
+
+;; lwa requires an offset divisible by 4, so we need lwax here.
+define i64 @t13() nounwind uwtable ssp {
+; ELF64: t13
+ %1 = load i32* getelementptr inbounds (%struct.s* @g, i32 0, i32 1), align 1
+ %2 = sext i32 %1 to i64
+; ELF64: li
+; ELF64: lwax
+ %3 = add nsw i64 %2, 1
+; ELF64: addi
+ ret i64 %3
+}
+
+;; ld requires an offset divisible by 4, so we need ldx here.
+define i64 @t14() nounwind uwtable ssp {
+; ELF64: t14
+ %1 = load i64* getelementptr inbounds (%struct.t* @h, i32 0, i32 1), align 1
+; ELF64: li
+; ELF64: ldx
+ %2 = add nsw i64 %1, 1
+; ELF64: addi
+ ret i64 %2
+}
+
+;; std requires an offset divisible by 4, so we need stdx here.
+define void @t15(i64 %v) nounwind uwtable ssp {
+; ELF64: t15
+ %1 = add nsw i64 %v, 1
+ store i64 %1, i64* getelementptr inbounds (%struct.t* @h, i32 0, i32 1), align 1
+; ELF64: addis
+; ELF64: addi
+; ELF64: addi
+; ELF64: li
+; ELF64: stdx
+ ret void
+}
+
+;; ld requires an offset that fits in 16 bits, so we need ldx here.
+define i64 @t16() nounwind uwtable ssp {
+; ELF64: t16
+ %1 = load i64* getelementptr inbounds ([8192 x i64]* @i, i32 0, i64 5000), align 8
+; ELF64: lis
+; ELF64: ori
+; ELF64: ldx
+ %2 = add nsw i64 %1, 1
+; ELF64: addi
+ ret i64 %2
+}
+
+;; std requires an offset that fits in 16 bits, so we need stdx here.
+define void @t17(i64 %v) nounwind uwtable ssp {
+; ELF64: t17
+ %1 = add nsw i64 %v, 1
+ store i64 %1, i64* getelementptr inbounds ([8192 x i64]* @i, i32 0, i64 5000), align 8
+; ELF64: addis
+; ELF64: ld
+; ELF64: addi
+; ELF64: lis
+; ELF64: ori
+; ELF64: stdx
+ ret void
+}
+
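A brief note on the indexed-form cases above: the DS-form instructions (ld, lwa, std) encode a 16-bit displacement whose low two bits must be zero. In the packed %struct.t = <{ i8, i64 }>, field 1 sits at byte offset 1, which is not a multiple of 4, so the X-form ldx/stdx (register + register) must be selected. Likewise, element 5000 of @i sits at byte offset 5000 * 8 = 40000, outside the signed 16-bit range -32768..32767, so the offset is materialized with lis/ori and ldx/stdx is used. As a contrasting sketch (hypothetical function t18, reusing the globals above; the codegen noted in the comment is an assumption, not a CHECK line from this patch), an offset that is both 4-aligned and in range can stay in displacement form:

define i64 @t18() nounwind uwtable ssp {
; offset = 100 * 8 = 800: a multiple of 4 and within -32768..32767, so a plain
; displacement-form ld would be expected here rather than ldx.
  %1 = load i64* getelementptr inbounds ([8192 x i64]* @i, i32 0, i64 100), align 8
  ret i64 %1
}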
diff --git a/test/CodeGen/PowerPC/fast-isel-redefinition.ll b/test/CodeGen/PowerPC/fast-isel-redefinition.ll
new file mode 100644
index 0000000..72422bd
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-redefinition.ll
@@ -0,0 +1,10 @@
+; RUN: llc -O0 -verify-machineinstrs -fast-isel-abort -optimize-regalloc -regalloc=basic -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s
+; This isn't exactly a useful set of command-line options, but check that it
+; doesn't crash. (It formerly crashed on ARM, and proved useful in
+; discovering a bug on PowerPC as well.)
+
+define i32 @f(i32* %x) nounwind ssp {
+ %y = getelementptr inbounds i32* %x, i32 5000
+ %tmp103 = load i32* %y, align 4
+ ret i32 %tmp103
+}
diff --git a/test/CodeGen/PowerPC/fast-isel-ret.ll b/test/CodeGen/PowerPC/fast-isel-ret.ll
new file mode 100644
index 0000000..fa19f8b
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-ret.ll
@@ -0,0 +1,142 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+define signext i8 @ret2(i8 signext %a) nounwind uwtable ssp {
+entry:
+; ELF64: ret2
+; ELF64: extsb
+; ELF64: blr
+ ret i8 %a
+}
+
+define zeroext i8 @ret3(i8 signext %a) nounwind uwtable ssp {
+entry:
+; ELF64: ret3
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 56
+; ELF64: blr
+ ret i8 %a
+}
+
+define signext i16 @ret4(i16 signext %a) nounwind uwtable ssp {
+entry:
+; ELF64: ret4
+; ELF64: extsh
+; ELF64: blr
+ ret i16 %a
+}
+
+define zeroext i16 @ret5(i16 signext %a) nounwind uwtable ssp {
+entry:
+; ELF64: ret5
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
+; ELF64: blr
+ ret i16 %a
+}
+
+define i16 @ret6(i16 %a) nounwind uwtable ssp {
+entry:
+; ELF64: ret6
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 48
+; ELF64: blr
+ ret i16 %a
+}
+
+define signext i32 @ret7(i32 signext %a) nounwind uwtable ssp {
+entry:
+; ELF64: ret7
+; ELF64: extsw
+; ELF64: blr
+ ret i32 %a
+}
+
+define zeroext i32 @ret8(i32 signext %a) nounwind uwtable ssp {
+entry:
+; ELF64: ret8
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 32
+; ELF64: blr
+ ret i32 %a
+}
+
+define i32 @ret9(i32 %a) nounwind uwtable ssp {
+entry:
+; ELF64: ret9
+; ELF64: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 32
+; ELF64: blr
+ ret i32 %a
+}
+
+define i64 @ret10(i64 %a) nounwind uwtable ssp {
+entry:
+; ELF64: ret10
+; ELF64-NOT: exts
+; ELF64-NOT: rldicl
+; ELF64: blr
+ ret i64 %a
+}
+
+define float @ret11(float %a) nounwind uwtable ssp {
+entry:
+; ELF64: ret11
+; ELF64: blr
+ ret float %a
+}
+
+define double @ret12(double %a) nounwind uwtable ssp {
+entry:
+; ELF64: ret12
+; ELF64: blr
+ ret double %a
+}
+
+define i8 @ret13() nounwind uwtable ssp {
+entry:
+; ELF64: ret13
+; ELF64: li
+; ELF64: blr
+ ret i8 15;
+}
+
+define i16 @ret14() nounwind uwtable ssp {
+entry:
+; ELF64: ret14
+; ELF64: li
+; ELF64: blr
+ ret i16 -225;
+}
+
+define i32 @ret15() nounwind uwtable ssp {
+entry:
+; ELF64: ret15
+; ELF64: lis
+; ELF64: ori
+; ELF64: blr
+ ret i32 278135;
+}
+
+define i64 @ret16() nounwind uwtable ssp {
+entry:
+; ELF64: ret16
+; ELF64: li
+; ELF64: sldi
+; ELF64: oris
+; ELF64: ori
+; ELF64: blr
+ ret i64 27813515225;
+}
+
+define float @ret17() nounwind uwtable ssp {
+entry:
+; ELF64: ret17
+; ELF64: addis
+; ELF64: lfs
+; ELF64: blr
+ ret float 2.5;
+}
+
+define double @ret18() nounwind uwtable ssp {
+entry:
+; ELF64: ret18
+; ELF64: addis
+; ELF64: lfd
+; ELF64: blr
+ ret double 2.5e-33;
+}
diff --git a/test/CodeGen/PowerPC/fast-isel-shifter.ll b/test/CodeGen/PowerPC/fast-isel-shifter.ll
new file mode 100644
index 0000000..198bfbe
--- /dev/null
+++ b/test/CodeGen/PowerPC/fast-isel-shifter.ll
@@ -0,0 +1,50 @@
+; RUN: llc < %s -O0 -verify-machineinstrs -fast-isel-abort -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s --check-prefix=ELF64
+
+define i32 @shl() nounwind ssp {
+entry:
+; ELF64: shl
+; ELF64: slw
+ %shl = shl i32 -1, 2
+ ret i32 %shl
+}
+
+define i32 @shl_reg(i32 %src1, i32 %src2) nounwind ssp {
+entry:
+; ELF64: shl_reg
+; ELF64: slw
+ %shl = shl i32 %src1, %src2
+ ret i32 %shl
+}
+
+define i32 @lshr() nounwind ssp {
+entry:
+; ELF64: lshr
+; ELF64: srw
+ %lshr = lshr i32 -1, 2
+ ret i32 %lshr
+}
+
+define i32 @lshr_reg(i32 %src1, i32 %src2) nounwind ssp {
+entry:
+; ELF64: lshr_reg
+; ELF64: srw
+ %lshr = lshr i32 %src1, %src2
+ ret i32 %lshr
+}
+
+define i32 @ashr() nounwind ssp {
+entry:
+; ELF64: ashr
+; ELF64: srawi
+ %ashr = ashr i32 -1, 2
+ ret i32 %ashr
+}
+
+define i32 @ashr_reg(i32 %src1, i32 %src2) nounwind ssp {
+entry:
+; ELF64: ashr_reg
+; ELF64: sraw
+ %ashr = ashr i32 %src1, %src2
+ ret i32 %ashr
+}
+
diff --git a/test/CodeGen/PowerPC/fastisel-gep-promote-before-add.ll b/test/CodeGen/PowerPC/fastisel-gep-promote-before-add.ll
new file mode 100644
index 0000000..4bcacf0
--- /dev/null
+++ b/test/CodeGen/PowerPC/fastisel-gep-promote-before-add.ll
@@ -0,0 +1,17 @@
+; fastisel should not fold add with non-pointer bitwidth
+; sext(a) + sext(b) != sext(a + b)
+; RUN: llc -mtriple=powerpc64-unknown-freebsd10.0 %s -O0 -o - | FileCheck %s
+
+define zeroext i8 @gep_promotion(i8* %ptr) nounwind uwtable ssp {
+entry:
+ %ptr.addr = alloca i8*, align 8
+ %add = add i8 64, 64 ; 0x40 + 0x40
+ %0 = load i8** %ptr.addr, align 8
+
+ ; CHECK-LABEL: gep_promotion:
+ ; CHECK: lbz {{[0-9]+}}, 0({{.*}})
+ %arrayidx = getelementptr inbounds i8* %0, i8 %add
+
+ %1 = load i8* %arrayidx, align 1
+ ret i8 %1
+}
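To make the comment's inequality concrete with the constants used above (a worked sketch of the arithmetic, not the exact fast-isel output):

; add i8 64, 64 wraps to -128 in 8 bits (0x40 + 0x40 = 0x80)
; sext i8 -128 to i64  = -128   <- the index value the GEP must use
; sext(64) + sext(64)  =  128   <- what folding the add at 64 bits would give
; so the lbz must address %ptr - 128, not %ptr + 128, and the add may not be
; folded into the address computation before the sign extension.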
diff --git a/test/CodeGen/PowerPC/fcpsgn.ll b/test/CodeGen/PowerPC/fcpsgn.ll
new file mode 100644
index 0000000..f469981
--- /dev/null
+++ b/test/CodeGen/PowerPC/fcpsgn.ll
@@ -0,0 +1,52 @@
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define double @foo_dd(double %a, double %b) #0 {
+entry:
+ %call = tail call double @copysign(double %a, double %b) #0
+ ret double %call
+
+; CHECK-LABEL: @foo_dd
+; CHECK: fcpsgn 1, 2, 1
+; CHECK: blr
+}
+
+declare double @copysign(double, double) #0
+
+define float @foo_ss(float %a, float %b) #0 {
+entry:
+ %call = tail call float @copysignf(float %a, float %b) #0
+ ret float %call
+
+; CHECK-LABEL: @foo_ss
+; CHECK: fcpsgn 1, 2, 1
+; CHECK: blr
+}
+
+declare float @copysignf(float, float) #0
+
+define float @foo_sd(float %a, double %b) #0 {
+entry:
+ %conv = fptrunc double %b to float
+ %call = tail call float @copysignf(float %a, float %conv) #0
+ ret float %call
+
+; CHECK-LABEL: @foo_sd
+; CHECK: fcpsgn 1, 2, 1
+; CHECK: blr
+}
+
+define double @foo_ds(double %a, float %b) #0 {
+entry:
+ %conv = fpext float %b to double
+ %call = tail call double @copysign(double %a, double %conv) #0
+ ret double %call
+
+; CHECK-LABEL: @foo_ds
+; CHECK: fcpsgn 1, 2, 1
+; CHECK: blr
+}
+
+attributes #0 = { nounwind readnone }
+
diff --git a/test/CodeGen/PowerPC/frameaddr.ll b/test/CodeGen/PowerPC/frameaddr.ll
index eabd4a6..4480273 100644
--- a/test/CodeGen/PowerPC/frameaddr.ll
+++ b/test/CodeGen/PowerPC/frameaddr.ll
@@ -40,8 +40,8 @@ declare void @use(i8*)
declare i8* @llvm.frameaddress(i32) #2
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { noreturn nounwind }
attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind naked "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind naked "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/PowerPC/glob-comp-aa-crash.ll b/test/CodeGen/PowerPC/glob-comp-aa-crash.ll
new file mode 100644
index 0000000..f97d0ff
--- /dev/null
+++ b/test/CodeGen/PowerPC/glob-comp-aa-crash.ll
@@ -0,0 +1,139 @@
+; RUN: llc -mtriple=powerpc64-bgq-linux -mcpu=a2 < %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+%"class.std::__1::__assoc_sub_state" = type { %"class.std::__1::__shared_count", %"class.std::__exception_ptr::exception_ptr", %"class.std::__1::mutex", %"class.std::__1::condition_variable", i32 }
+%"class.std::__1::__shared_count" = type { i32 (...)**, i64 }
+%"class.std::__exception_ptr::exception_ptr" = type { i8* }
+%"class.std::__1::mutex" = type { %union.pthread_mutex_t }
+%union.pthread_mutex_t = type { %"struct.<anonymous union>::__pthread_mutex_s" }
+%"struct.<anonymous union>::__pthread_mutex_s" = type { i32, i32, i32, i32, i32, i32, %struct.__pthread_internal_list }
+%struct.__pthread_internal_list = type { %struct.__pthread_internal_list*, %struct.__pthread_internal_list* }
+%"class.std::__1::condition_variable" = type { %union.pthread_cond_t }
+%union.pthread_cond_t = type { %struct.anon }
+%struct.anon = type { i32, i32, i64, i64, i64, i8*, i32, i32 }
+%"class.std::__1::unique_lock" = type { %"class.std::__1::mutex"*, i8 }
+
+declare i32 @__gxx_personality_v0(...)
+
+; Function Attrs: optsize
+define void @_ZNSt3__117__assoc_sub_state4copyEv(%"class.std::__1::__assoc_sub_state"* %this) #0 align 2 {
+entry:
+ %__lk = alloca %"class.std::__1::unique_lock", align 8
+ %ref.tmp = alloca %"class.std::__exception_ptr::exception_ptr", align 8
+ %tmp = alloca { i64, i64 }, align 8
+ %agg.tmp = alloca %"class.std::__exception_ptr::exception_ptr", align 8
+ %__mut_ = getelementptr inbounds %"class.std::__1::__assoc_sub_state"* %this, i64 0, i32 2
+ %__m_.i.i = getelementptr inbounds %"class.std::__1::unique_lock"* %__lk, i64 0, i32 0
+ store %"class.std::__1::mutex"* %__mut_, %"class.std::__1::mutex"** %__m_.i.i, align 8, !tbaa !5
+ %__owns_.i.i = getelementptr inbounds %"class.std::__1::unique_lock"* %__lk, i64 0, i32 1
+ store i8 1, i8* %__owns_.i.i, align 8, !tbaa !6
+ call void @_ZNSt3__15mutex4lockEv(%"class.std::__1::mutex"* %__mut_) #4
+ invoke void @_ZNSt3__117__assoc_sub_state10__sub_waitERNS_11unique_lockINS_5mutexEEE(%"class.std::__1::__assoc_sub_state"* %this, %"class.std::__1::unique_lock"* %__lk) #4
+ to label %invoke.cont unwind label %lpad
+
+invoke.cont: ; preds = %entry
+ %__exception_ = getelementptr inbounds %"class.std::__1::__assoc_sub_state"* %this, i64 0, i32 1
+ %0 = bitcast { i64, i64 }* %tmp to i8*
+ call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 16, i32 8, i1 false)
+ call void @_ZNSt15__exception_ptr13exception_ptrC1EMS0_FvvE(%"class.std::__exception_ptr::exception_ptr"* %ref.tmp, { i64, i64 }* byval %tmp) #5
+ %call = call zeroext i1 @_ZNSt15__exception_ptrneERKNS_13exception_ptrES2_(%"class.std::__exception_ptr::exception_ptr"* %__exception_, %"class.std::__exception_ptr::exception_ptr"* %ref.tmp) #5
+ call void @_ZNSt15__exception_ptr13exception_ptrD1Ev(%"class.std::__exception_ptr::exception_ptr"* %ref.tmp) #5
+ br i1 %call, label %if.then, label %if.end
+
+if.then: ; preds = %invoke.cont
+ call void @_ZNSt15__exception_ptr13exception_ptrC1ERKS0_(%"class.std::__exception_ptr::exception_ptr"* %agg.tmp, %"class.std::__exception_ptr::exception_ptr"* %__exception_) #5
+ invoke void @_ZSt17rethrow_exceptionNSt15__exception_ptr13exception_ptrE(%"class.std::__exception_ptr::exception_ptr"* %agg.tmp) #6
+ to label %invoke.cont4 unwind label %lpad3
+
+invoke.cont4: ; preds = %if.then
+ unreachable
+
+lpad: ; preds = %entry
+ %1 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ %2 = extractvalue { i8*, i32 } %1, 0
+ %3 = extractvalue { i8*, i32 } %1, 1
+ br label %ehcleanup
+
+lpad3: ; preds = %if.then
+ %4 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ %5 = extractvalue { i8*, i32 } %4, 0
+ %6 = extractvalue { i8*, i32 } %4, 1
+ call void @_ZNSt15__exception_ptr13exception_ptrD1Ev(%"class.std::__exception_ptr::exception_ptr"* %agg.tmp) #5
+ br label %ehcleanup
+
+if.end: ; preds = %invoke.cont
+ %7 = load i8* %__owns_.i.i, align 8, !tbaa !6, !range !4
+ %tobool.i.i = icmp eq i8 %7, 0
+ br i1 %tobool.i.i, label %_ZNSt3__111unique_lockINS_5mutexEED1Ev.exit, label %if.then.i.i
+
+if.then.i.i: ; preds = %if.end
+ %8 = load %"class.std::__1::mutex"** %__m_.i.i, align 8, !tbaa !5
+ call void @_ZNSt3__15mutex6unlockEv(%"class.std::__1::mutex"* %8) #5
+ br label %_ZNSt3__111unique_lockINS_5mutexEED1Ev.exit
+
+_ZNSt3__111unique_lockINS_5mutexEED1Ev.exit: ; preds = %if.then.i.i, %if.end
+ ret void
+
+ehcleanup: ; preds = %lpad3, %lpad
+ %exn.slot.0 = phi i8* [ %5, %lpad3 ], [ %2, %lpad ]
+ %ehselector.slot.0 = phi i32 [ %6, %lpad3 ], [ %3, %lpad ]
+ %9 = load i8* %__owns_.i.i, align 8, !tbaa !6, !range !4
+ %tobool.i.i9 = icmp eq i8 %9, 0
+ br i1 %tobool.i.i9, label %_ZNSt3__111unique_lockINS_5mutexEED1Ev.exit12, label %if.then.i.i11
+
+if.then.i.i11: ; preds = %ehcleanup
+ %10 = load %"class.std::__1::mutex"** %__m_.i.i, align 8, !tbaa !5
+ call void @_ZNSt3__15mutex6unlockEv(%"class.std::__1::mutex"* %10) #5
+ br label %_ZNSt3__111unique_lockINS_5mutexEED1Ev.exit12
+
+_ZNSt3__111unique_lockINS_5mutexEED1Ev.exit12: ; preds = %if.then.i.i11, %ehcleanup
+ %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn.slot.0, 0
+ %lpad.val5 = insertvalue { i8*, i32 } %lpad.val, i32 %ehselector.slot.0, 1
+ resume { i8*, i32 } %lpad.val5
+}
+
+; Function Attrs: optsize
+declare void @_ZNSt3__117__assoc_sub_state10__sub_waitERNS_11unique_lockINS_5mutexEEE(%"class.std::__1::__assoc_sub_state"*, %"class.std::__1::unique_lock"*) #0 align 2
+
+; Function Attrs: nounwind optsize
+declare zeroext i1 @_ZNSt15__exception_ptrneERKNS_13exception_ptrES2_(%"class.std::__exception_ptr::exception_ptr"*, %"class.std::__exception_ptr::exception_ptr"*) #1
+
+; Function Attrs: nounwind optsize
+declare void @_ZNSt15__exception_ptr13exception_ptrC1EMS0_FvvE(%"class.std::__exception_ptr::exception_ptr"*, { i64, i64 }* byval) #1
+
+; Function Attrs: nounwind optsize
+declare void @_ZNSt15__exception_ptr13exception_ptrD1Ev(%"class.std::__exception_ptr::exception_ptr"*) #1
+
+; Function Attrs: noreturn optsize
+declare void @_ZSt17rethrow_exceptionNSt15__exception_ptr13exception_ptrE(%"class.std::__exception_ptr::exception_ptr"*) #2
+
+; Function Attrs: nounwind optsize
+declare void @_ZNSt15__exception_ptr13exception_ptrC1ERKS0_(%"class.std::__exception_ptr::exception_ptr"*, %"class.std::__exception_ptr::exception_ptr"*) #1
+
+; Function Attrs: nounwind optsize
+declare void @_ZNSt3__15mutex6unlockEv(%"class.std::__1::mutex"*) #1
+
+; Function Attrs: optsize
+declare void @_ZNSt3__15mutex4lockEv(%"class.std::__1::mutex"*) #0
+
+; Function Attrs: nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #3
+
+attributes #0 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { noreturn optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+attributes #4 = { optsize }
+attributes #5 = { nounwind optsize }
+attributes #6 = { noreturn optsize }
+
+!0 = metadata !{metadata !"any pointer", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"bool", metadata !1}
+!4 = metadata !{i8 0, i8 2}
+!5 = metadata !{metadata !0, metadata !0, i64 0}
+!6 = metadata !{metadata !3, metadata !3, i64 0}
diff --git a/test/CodeGen/PowerPC/hello-reloc.s b/test/CodeGen/PowerPC/hello-reloc.s
new file mode 100644
index 0000000..9bbfb38
--- /dev/null
+++ b/test/CodeGen/PowerPC/hello-reloc.s
@@ -0,0 +1,84 @@
+; This tests the basic implementation of PPCMachObjectWriter.cpp,
+; which is responsible for writing Mach-O relocation entries for (PIC)
+; PowerPC objects.
+; NOTE: Darwin PPC asm syntax is not yet supported by PPCAsmParser,
+; so this test case uses ELF PPC asm syntax to produce a Mach-O object.
+; Once PPCAsmParser supports Darwin asm syntax, this test case should
+; be updated accordingly.
+
+; RUN: llvm-mc -filetype=obj -relocation-model=pic -mcpu=g4 -triple=powerpc-apple-darwin8 %s -o - | llvm-readobj -relocations | FileCheck -check-prefix=DARWIN-G4-DUMP %s
+
+; .machine ppc7400
+ .section __TEXT,__textcoal_nt,coalesced,pure_instructions
+ .section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32
+ .section __TEXT,__text,regular,pure_instructions
+ .globl _main
+ .align 4
+_main: ; @main
+; BB#0: ; %entry
+ mflr 0
+ stw 31, -4(1)
+ stw 0, 8(1)
+ stwu 1, -80(1)
+ bl L0$pb
+L0$pb:
+ mr 31, 1
+ li 5, 0
+ mflr 2
+ stw 3, 68(31)
+ stw 5, 72(31)
+ stw 4, 64(31)
+ addis 2, 2, (L_.str-L0$pb)@ha
+ la 3, (L_.str-L0$pb)@l(2)
+ bl L_puts$stub
+ li 3, 0
+ addi 1, 1, 80
+ lwz 0, 8(1)
+ lwz 31, -4(1)
+ mtlr 0
+ blr
+
+ .section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32
+ .align 4
+L_puts$stub:
+ .indirect_symbol _puts
+ mflr 0
+ bcl 20, 31, L_puts$stub$tmp
+L_puts$stub$tmp:
+ mflr 11
+ addis 11, 11, (L_puts$lazy_ptr-L_puts$stub$tmp)@ha
+ mtlr 0
+ lwzu 12, (L_puts$lazy_ptr-L_puts$stub$tmp)@l(11)
+ mtctr 12
+ bctr
+ .section __DATA,__la_symbol_ptr,lazy_symbol_pointers
+L_puts$lazy_ptr:
+ .indirect_symbol _puts
+ .long dyld_stub_binding_helper
+
+.subsections_via_symbols
+ .section __TEXT,__cstring,cstring_literals
+L_.str: ; @.str
+ .asciz "Hello, world!"
+
+; DARWIN-G4-DUMP:Format: Mach-O 32-bit ppc
+; DARWIN-G4-DUMP:Arch: powerpc
+; DARWIN-G4-DUMP:AddressSize: 32bit
+; DARWIN-G4-DUMP:Relocations [
+; DARWIN-G4-DUMP: Section __text {
+; DARWIN-G4-DUMP: 0x34 1 2 0 PPC_RELOC_BR24 0 -
+; DARWIN-G4-DUMP: 0x30 0 2 n/a PPC_RELOC_LO16_SECTDIFF 1 _main
+; DARWIN-G4-DUMP: 0x0 0 2 n/a PPC_RELOC_PAIR 1 _main
+; DARWIN-G4-DUMP: 0x2C 0 2 n/a PPC_RELOC_HA16_SECTDIFF 1 _main
+; DARWIN-G4-DUMP: 0x60 0 2 n/a PPC_RELOC_PAIR 1 _main
+; DARWIN-G4-DUMP: }
+; DARWIN-G4-DUMP: Section __picsymbolstub1 {
+; DARWIN-G4-DUMP: 0x14 0 2 n/a PPC_RELOC_LO16_SECTDIFF 1 _main
+; DARWIN-G4-DUMP: 0x0 0 2 n/a PPC_RELOC_PAIR 1 _main
+; DARWIN-G4-DUMP: 0xC 0 2 n/a PPC_RELOC_HA16_SECTDIFF 1 _main
+; DARWIN-G4-DUMP: 0x18 0 2 n/a PPC_RELOC_PAIR 1 _main
+; DARWIN-G4-DUMP: }
+; DARWIN-G4-DUMP: Section __la_symbol_ptr {
+; DARWIN-G4-DUMP: 0x0 0 2 1 PPC_RELOC_VANILLA 0 dyld_stub_binding_helper
+; DARWIN-G4-DUMP: }
+; DARWIN-G4-DUMP:]
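For readers of the relocation dump above: @ha and @l split the 32-bit section difference d = L_.str - L0$pb so that the addis/la pair reassembles it exactly. A sketch of the general identity (not specific to this patch):

  @ha(d) = (d + 0x8000) >> 16    ; high half, adjusted for the sign of the low half
  @l(d)  = d & 0xffff            ; low half, sign-extended by the D-form instruction
  (@ha(d) << 16) + sext16(@l(d)) = d

Each half is emitted as its own *_SECTDIFF relocation entry, and by Mach-O convention the PPC_RELOC_PAIR entry that follows carries the subtracted address, which is why SECTDIFF/PAIR pairs appear in the DARWIN-G4-DUMP output.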
diff --git a/test/CodeGen/PowerPC/i64_fp_round.ll b/test/CodeGen/PowerPC/i64_fp_round.ll
index d2a3239..5770d78 100644
--- a/test/CodeGen/PowerPC/i64_fp_round.ll
+++ b/test/CodeGen/PowerPC/i64_fp_round.ll
@@ -22,6 +22,6 @@ entry:
; Also check that with -enable-unsafe-fp-math we do not get that extra
; code sequence. Simply verify that there is no "isel" present.
-; RUN: llc -mcpu=pwr7 -mattr=-fpcvt -enable-unsafe-fp-math < %s | FileCheck %s -check-prefix=UNSAFE
+; RUN: llc -mcpu=pwr7 -mattr=-fpcvt -enable-unsafe-fp-math < %s | FileCheck %s -check-prefix=CHECK-UNSAFE
; CHECK-UNSAFE-NOT: isel
diff --git a/test/CodeGen/PowerPC/inlineasm-i64-reg.ll b/test/CodeGen/PowerPC/inlineasm-i64-reg.ll
index fa9aa45..5e31cd5 100644
--- a/test/CodeGen/PowerPC/inlineasm-i64-reg.ll
+++ b/test/CodeGen/PowerPC/inlineasm-i64-reg.ll
@@ -59,6 +59,49 @@ entry:
ret i32 %conv
}
+declare void @mtrace()
+
+define signext i32 @main(i32 signext %argc, i8** %argv) {
+entry:
+ %argc.addr = alloca i32, align 4
+ store i32 %argc, i32* %argc.addr, align 4
+ %0 = call { i64, i64 } asm sideeffect "sc", "={r0},={r3},{r0},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{cr0},~{memory}"(i64 1076)
+ %asmresult1.i = extractvalue { i64, i64 } %0, 1
+ %conv.i = trunc i64 %asmresult1.i to i32
+ %cmp = icmp eq i32 %conv.i, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+; CHECK-LABEL: @main
+
+; CHECK-DAG: mr [[REG:[0-9]+]], 3
+; CHECK-DAG: li 0, 1076
+; CHECK: stw [[REG]],
+
+; CHECK: #APP
+; CHECK: sc
+; CHECK: #NO_APP
+
+; CHECK: cmpwi {{[0-9]+}}, [[REG]], 1
+
+; CHECK: blr
+
+if.then: ; preds = %entry
+ call void @mtrace()
+ %.pre = load i32* %argc.addr, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %1 = phi i32 [ %.pre, %if.then ], [ %argc, %entry ]
+ %cmp1 = icmp slt i32 %1, 2
+ br i1 %cmp1, label %usage, label %if.end40
+
+usage:
+ ret i32 8
+
+if.end40:
+ ret i32 0
+}
+
attributes #0 = { alwaysinline inlinehint nounwind }
attributes #1 = { nounwind }
diff --git a/test/CodeGen/PowerPC/isel-rc-nox0.ll b/test/CodeGen/PowerPC/isel-rc-nox0.ll
index 7d425cc..ac99aa4 100644
--- a/test/CodeGen/PowerPC/isel-rc-nox0.ll
+++ b/test/CodeGen/PowerPC/isel-rc-nox0.ll
@@ -22,7 +22,7 @@ crc32_gentab.exit: ; preds = %for.cond1.preheader
for.cond1.preheader.i2961.i: ; preds = %for.inc44.i2977.i, %crc32_gentab.exit
call void @llvm.memset.p0i8.i64(i8* bitcast ([1 x [9 x i32]]* @g_62 to i8*), i8 -1, i64 36, i32 4, i1 false) #1
- %0 = load i32* %retval.0.i.i.i, align 4, !tbaa !0
+ %0 = load i32* %retval.0.i.i.i, align 4
%tobool.i2967.i = icmp eq i32 %0, 0
br label %for.body21.i2968.i
@@ -42,9 +42,5 @@ func_80.exit2978.i: ; preds = %for.inc44.i2977.i
; Function Attrs: nounwind
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "ssp-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "ssp-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind }
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/PowerPC/lit.local.cfg b/test/CodeGen/PowerPC/lit.local.cfg
index aaa31d9..2e46300 100644
--- a/test/CodeGen/PowerPC/lit.local.cfg
+++ b/test/CodeGen/PowerPC/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.test']
-
targets = set(config.root.targets_to_build.split())
if not 'PowerPC' in targets:
config.unsupported = True
diff --git a/test/CodeGen/PowerPC/mcm-2.ll b/test/CodeGen/PowerPC/mcm-2.ll
index d4f40f7..fee98d8 100644
--- a/test/CodeGen/PowerPC/mcm-2.ll
+++ b/test/CodeGen/PowerPC/mcm-2.ll
@@ -31,7 +31,9 @@ entry:
; LARGE: ld [[REG2:[0-9]+]], [[VAR]]@toc@l([[REG1]])
; LARGE: lwz {{[0-9]+}}, 0([[REG2]])
; LARGE: stw {{[0-9]+}}, 0([[REG2]])
-; LARGE: .type [[VAR]],@object
-; LARGE: .local [[VAR]]
-; LARGE: .comm [[VAR]],4,4
+; LARGE: [[VAR]]:
+; LARGE: .tc [[VAR2:[a-z0-9A-Z_.]+]][TC],[[VAR2]]
+; LARGE: .type [[VAR2]],@object
+; LARGE: .local [[VAR2]]
+; LARGE: .comm [[VAR2]],4,4
diff --git a/test/CodeGen/PowerPC/mcm-3.ll b/test/CodeGen/PowerPC/mcm-3.ll
index ce151fb..b6d681d 100644
--- a/test/CodeGen/PowerPC/mcm-3.ll
+++ b/test/CodeGen/PowerPC/mcm-3.ll
@@ -33,9 +33,11 @@ entry:
; LARGE: ld [[REG2:[0-9]+]], [[VAR]]@toc@l([[REG1]])
; LARGE: lwz {{[0-9]+}}, 0([[REG2]])
; LARGE: stw {{[0-9]+}}, 0([[REG2]])
-; LARGE: .type [[VAR]],@object
-; LARGE: .data
-; LARGE: .globl [[VAR]]
; LARGE: [[VAR]]:
+; LARGE: .tc [[VAR2:[a-z0-9A-Z_.]+]][TC],[[VAR2]]
+; LARGE: .type [[VAR2]],@object
+; LARGE: .data
+; LARGE: .globl [[VAR2]]
+; LARGE: [[VAR2]]:
; LARGE: .long 5
diff --git a/test/CodeGen/PowerPC/mcm-4.ll b/test/CodeGen/PowerPC/mcm-4.ll
index 7d7b132..73dd902 100644
--- a/test/CodeGen/PowerPC/mcm-4.ll
+++ b/test/CodeGen/PowerPC/mcm-4.ll
@@ -22,6 +22,6 @@ entry:
; LARGE: [[VAR:[a-z0-9A-Z_.]+]]:
; LARGE: .quad 4562098671269285104
; LARGE-LABEL: test_double_const:
-; LARGE: addis [[REG1:[0-9]+]], 2, [[VAR]]@toc@ha
-; LARGE: ld [[REG2:[0-9]+]], [[VAR]]@toc@l([[REG1]])
+; LARGE: addis [[REG1:[0-9]+]], 2, [[VAR2:[a-z0-9A-Z_.]+]]@toc@ha
+; LARGE: ld [[REG2:[0-9]+]], [[VAR2]]@toc@l([[REG1]])
; LARGE: lfd {{[0-9]+}}, 0([[REG2]])
diff --git a/test/CodeGen/PowerPC/mcm-9.ll b/test/CodeGen/PowerPC/mcm-9.ll
index e587f61..7906b6a 100644
--- a/test/CodeGen/PowerPC/mcm-9.ll
+++ b/test/CodeGen/PowerPC/mcm-9.ll
@@ -7,8 +7,7 @@
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
-@ei = external global i32
-@a = alias i32* @ei
+@a = external global i32
define signext i32 @test_external() nounwind {
entry:
diff --git a/test/CodeGen/PowerPC/negctr.ll b/test/CodeGen/PowerPC/negctr.ll
index ef33bb7..2e64993 100644
--- a/test/CodeGen/PowerPC/negctr.ll
+++ b/test/CodeGen/PowerPC/negctr.ll
@@ -83,4 +83,4 @@ for.end: ; preds = %for.body, %entry
ret void
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/PowerPC/optcmp.ll b/test/CodeGen/PowerPC/optcmp.ll
index 523f329..35aabfa 100644
--- a/test/CodeGen/PowerPC/optcmp.ll
+++ b/test/CodeGen/PowerPC/optcmp.ll
@@ -5,7 +5,7 @@ target triple = "powerpc64-unknown-linux-gnu"
define signext i32 @foo(i32 signext %a, i32 signext %b, i32* nocapture %c) #0 {
entry:
%sub = sub nsw i32 %a, %b
- store i32 %sub, i32* %c, align 4, !tbaa !0
+ store i32 %sub, i32* %c, align 4
%cmp = icmp sgt i32 %a, %b
%cond = select i1 %cmp, i32 %a, i32 %b
ret i32 %cond
@@ -17,7 +17,7 @@ entry:
define signext i32 @foo2(i32 signext %a, i32 signext %b, i32* nocapture %c) #0 {
entry:
%shl = shl i32 %a, %b
- store i32 %shl, i32* %c, align 4, !tbaa !0
+ store i32 %shl, i32* %c, align 4
%cmp = icmp sgt i32 %shl, 0
%conv = zext i1 %cmp to i32
ret i32 %conv
@@ -29,7 +29,7 @@ entry:
define i64 @fool(i64 %a, i64 %b, i64* nocapture %c) #0 {
entry:
%sub = sub nsw i64 %a, %b
- store i64 %sub, i64* %c, align 8, !tbaa !3
+ store i64 %sub, i64* %c, align 8
%cmp = icmp sgt i64 %a, %b
%cond = select i1 %cmp, i64 %a, i64 %b
ret i64 %cond
@@ -43,7 +43,7 @@ entry:
define i64 @foolb(i64 %a, i64 %b, i64* nocapture %c) #0 {
entry:
%sub = sub nsw i64 %a, %b
- store i64 %sub, i64* %c, align 8, !tbaa !3
+ store i64 %sub, i64* %c, align 8
%cmp = icmp sle i64 %a, %b
%cond = select i1 %cmp, i64 %a, i64 %b
ret i64 %cond
@@ -57,7 +57,7 @@ entry:
define i64 @foolc(i64 %a, i64 %b, i64* nocapture %c) #0 {
entry:
%sub = sub nsw i64 %b, %a
- store i64 %sub, i64* %c, align 8, !tbaa !3
+ store i64 %sub, i64* %c, align 8
%cmp = icmp sgt i64 %a, %b
%cond = select i1 %cmp, i64 %a, i64 %b
ret i64 %cond
@@ -71,7 +71,7 @@ entry:
define i64 @foold(i64 %a, i64 %b, i64* nocapture %c) #0 {
entry:
%sub = sub nsw i64 %b, %a
- store i64 %sub, i64* %c, align 8, !tbaa !3
+ store i64 %sub, i64* %c, align 8
%cmp = icmp eq i64 %a, %b
%cond = select i1 %cmp, i64 %a, i64 %b
ret i64 %cond
@@ -85,7 +85,7 @@ entry:
define i64 @foold2(i64 %a, i64 %b, i64* nocapture %c) #0 {
entry:
%sub = sub nsw i64 %a, %b
- store i64 %sub, i64* %c, align 8, !tbaa !3
+ store i64 %sub, i64* %c, align 8
%cmp = icmp eq i64 %a, %b
%cond = select i1 %cmp, i64 %a, i64 %b
ret i64 %cond
@@ -99,7 +99,7 @@ entry:
define i64 @foo2l(i64 %a, i64 %b, i64* nocapture %c) #0 {
entry:
%shl = shl i64 %a, %b
- store i64 %shl, i64* %c, align 8, !tbaa !3
+ store i64 %shl, i64* %c, align 8
%cmp = icmp sgt i64 %shl, 0
%conv1 = zext i1 %cmp to i64
ret i64 %conv1
@@ -112,7 +112,7 @@ entry:
define double @food(double %a, double %b, double* nocapture %c) #0 {
entry:
%sub = fsub double %a, %b
- store double %sub, double* %c, align 8, !tbaa !3
+ store double %sub, double* %c, align 8
%cmp = fcmp ogt double %a, %b
%cond = select i1 %cmp, double %a, double %b
ret double %cond
@@ -125,7 +125,7 @@ entry:
define float @foof(float %a, float %b, float* nocapture %c) #0 {
entry:
%sub = fsub float %a, %b
- store float %sub, float* %c, align 4, !tbaa !3
+ store float %sub, float* %c, align 4
%cmp = fcmp ogt float %a, %b
%cond = select i1 %cmp, float %a, float %b
ret float %cond
@@ -135,9 +135,18 @@ entry:
; CHECK: stfs 0, 0(5)
}
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"long", metadata !1}
-!4 = metadata !{metadata !"any pointer", metadata !1}
+declare i64 @llvm.ctpop.i64(i64);
+
+define signext i64 @fooct(i64 signext %a, i64 signext %b, i64* nocapture %c) #0 {
+entry:
+ %sub = sub nsw i64 %a, %b
+ %subc = call i64 @llvm.ctpop.i64(i64 %sub)
+ store i64 %subc, i64* %c, align 4
+ %cmp = icmp sgt i64 %subc, 0
+ %cond = select i1 %cmp, i64 %a, i64 %b
+ ret i64 %cond
+
+; CHECK: @fooct
+; CHECK-NOT: popcntd.
+}
diff --git a/test/CodeGen/PowerPC/pr15031.ll b/test/CodeGen/PowerPC/pr15031.ll
index 5ccf941..e58ad80 100644
--- a/test/CodeGen/PowerPC/pr15031.ll
+++ b/test/CodeGen/PowerPC/pr15031.ll
@@ -317,54 +317,42 @@ if.then: ; preds = %entry
if.end: ; preds = %entry, %if.then
%Reg.addr.0 = phi i32 [ %call3, %if.then ], [ %Reg, %entry ]
%RegNo.i.i = getelementptr inbounds %"class.llvm::MachineOperand"* %this, i64 0, i32 2, i32 0
- %1 = load i32* %RegNo.i.i, align 4, !tbaa !0
+ %1 = load i32* %RegNo.i.i, align 4
%cmp.i = icmp eq i32 %1, %Reg.addr.0
br i1 %cmp.i, label %_ZN4llvm14MachineOperand6setRegEj.exit, label %if.end.i
if.end.i: ; preds = %if.end
%ParentMI.i.i = getelementptr inbounds %"class.llvm::MachineOperand"* %this, i64 0, i32 3
- %2 = load %"class.llvm::MachineInstr"** %ParentMI.i.i, align 8, !tbaa !3
+ %2 = load %"class.llvm::MachineInstr"** %ParentMI.i.i, align 8
%tobool.i = icmp eq %"class.llvm::MachineInstr"* %2, null
br i1 %tobool.i, label %if.end13.i, label %if.then3.i
if.then3.i: ; preds = %if.end.i
%Parent.i.i = getelementptr inbounds %"class.llvm::MachineInstr"* %2, i64 0, i32 2
- %3 = load %"class.llvm::MachineBasicBlock"** %Parent.i.i, align 8, !tbaa !3
+ %3 = load %"class.llvm::MachineBasicBlock"** %Parent.i.i, align 8
%tobool5.i = icmp eq %"class.llvm::MachineBasicBlock"* %3, null
br i1 %tobool5.i, label %if.end13.i, label %if.then6.i
if.then6.i: ; preds = %if.then3.i
%xParent.i.i = getelementptr inbounds %"class.llvm::MachineBasicBlock"* %3, i64 0, i32 4
- %4 = load %"class.llvm::MachineFunction"** %xParent.i.i, align 8, !tbaa !3
+ %4 = load %"class.llvm::MachineFunction"** %xParent.i.i, align 8
%tobool8.i = icmp eq %"class.llvm::MachineFunction"* %4, null
br i1 %tobool8.i, label %if.end13.i, label %if.then9.i
if.then9.i: ; preds = %if.then6.i
%RegInfo.i.i = getelementptr inbounds %"class.llvm::MachineFunction"* %4, i64 0, i32 5
- %5 = load %"class.llvm::MachineRegisterInfo"** %RegInfo.i.i, align 8, !tbaa !3
+ %5 = load %"class.llvm::MachineRegisterInfo"** %RegInfo.i.i, align 8
tail call void @_ZN4llvm19MachineRegisterInfo27removeRegOperandFromUseListEPNS_14MachineOperandE(%"class.llvm::MachineRegisterInfo"* %5, %"class.llvm::MachineOperand"* %this)
- store i32 %Reg.addr.0, i32* %RegNo.i.i, align 4, !tbaa !0
+ store i32 %Reg.addr.0, i32* %RegNo.i.i, align 4
tail call void @_ZN4llvm19MachineRegisterInfo22addRegOperandToUseListEPNS_14MachineOperandE(%"class.llvm::MachineRegisterInfo"* %5, %"class.llvm::MachineOperand"* %this)
br label %_ZN4llvm14MachineOperand6setRegEj.exit
if.end13.i: ; preds = %if.then6.i, %if.then3.i, %if.end.i
- store i32 %Reg.addr.0, i32* %RegNo.i.i, align 4, !tbaa !0
+ store i32 %Reg.addr.0, i32* %RegNo.i.i, align 4
br label %_ZN4llvm14MachineOperand6setRegEj.exit
_ZN4llvm14MachineOperand6setRegEj.exit: ; preds = %if.end, %if.then9.i, %if.end13.i
ret void
}
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"any pointer", metadata !1}
-!4 = metadata !{metadata !"vtable pointer", metadata !2}
-!5 = metadata !{metadata !"long", metadata !1}
-!6 = metadata !{i64 0, i64 8, metadata !3, i64 8, i64 8, metadata !5}
-!7 = metadata !{metadata !"short", metadata !1}
-!8 = metadata !{i64 0, i64 1, metadata !1, i64 1, i64 4, metadata !0, i64 2, i64 1, metadata !1, i64 3, i64 1, metadata !9, i64 3, i64 1, metadata !9, i64 3, i64 1, metadata !9, i64 3, i64 1, metadata !9, i64 3, i64 1, metadata !9, i64 3, i64 1, metadata !9, i64 3, i64 1, metadata !9, i64 3, i64 1, metadata !9, i64 4, i64 4, metadata !0, i64 4, i64 4, metadata !0, i64 8, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !5, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 24, i64 8, metadata !3, i64 16, i64 4, metadata !0, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 16, i64 8, metadata !3, i64 24, i64 4, metadata !0}
-!9 = metadata !{metadata !"bool", metadata !1}
-!10 = metadata !{i8 0, i8 2}
-
; CHECK-NOT: lbzu 3, 1(3)
diff --git a/test/CodeGen/PowerPC/pr17168.ll b/test/CodeGen/PowerPC/pr17168.ll
new file mode 100644
index 0000000..2848221
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr17168.ll
@@ -0,0 +1,521 @@
+; RUN: llc -mcpu=pwr7 -O0 < %s
+
+; This test formerly failed due to a DBG_VALUE being placed prior to a PHI
+; when fast-isel is partially successful before punting to DAG-isel.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@grid_points = external global [3 x i32], align 4
+
+; Function Attrs: nounwind
+define fastcc void @compute_rhs() #0 {
+entry:
+ br i1 undef, label %for.cond871.preheader.for.inc960_crit_edge, label %for.end1042, !dbg !439
+
+for.cond871.preheader.for.inc960_crit_edge: ; preds = %for.cond871.preheader.for.inc960_crit_edge, %entry
+ br i1 false, label %for.cond871.preheader.for.inc960_crit_edge, label %for.cond964.preheader, !dbg !439
+
+for.cond964.preheader: ; preds = %for.cond871.preheader.for.inc960_crit_edge
+ br i1 undef, label %for.cond968.preheader, label %for.end1042, !dbg !441
+
+for.cond968.preheader: ; preds = %for.cond968.preheader, %for.cond964.preheader
+ br i1 false, label %for.cond968.preheader, label %for.end1042, !dbg !441
+
+for.end1042: ; preds = %for.cond968.preheader, %for.cond964.preheader, %entry
+ %0 = phi i32 [ undef, %for.cond964.preheader ], [ undef, %for.cond968.preheader ], [ undef, %entry ]
+ %1 = load i32* getelementptr inbounds ([3 x i32]* @grid_points, i64 0, i64 0), align 4, !dbg !443, !tbaa !444
+ tail call void @llvm.dbg.value(metadata !447, i64 0, metadata !119), !dbg !448
+ %sub10454270 = add nsw i32 %0, -1, !dbg !448
+ %cmp10464271 = icmp sgt i32 %sub10454270, 1, !dbg !448
+ %sub11134263 = add nsw i32 %1, -1, !dbg !450
+ %cmp11144264 = icmp sgt i32 %sub11134263, 1, !dbg !450
+ br i1 %cmp11144264, label %for.cond1116.preheader, label %for.cond1816.preheader.for.inc1898_crit_edge, !dbg !450
+
+for.cond1116.preheader: ; preds = %for.inc1658, %for.end1042
+ br i1 %cmp10464271, label %for.body1123, label %for.inc1658, !dbg !452
+
+for.body1123: ; preds = %for.body1123, %for.cond1116.preheader
+ br label %for.body1123, !dbg !455
+
+for.inc1658: ; preds = %for.cond1116.preheader
+ br i1 undef, label %for.cond1116.preheader, label %for.cond1816.preheader.for.inc1898_crit_edge, !dbg !450
+
+for.cond1816.preheader.for.inc1898_crit_edge: ; preds = %for.cond1816.preheader.for.inc1898_crit_edge, %for.inc1658, %for.end1042
+ br label %for.cond1816.preheader.for.inc1898_crit_edge, !dbg !458
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata) #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!438, !464}
+
+!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4 (trunk 190311)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !298, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c] [DW_LANG_C99]
+!1 = metadata !{metadata !"bt.c", metadata !"/home/hfinkel/src/NPB2.3-omp-C/BT"}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4, metadata !82, metadata !102, metadata !114, metadata !132, metadata !145, metadata !154, metadata !155, metadata !162, metadata !183, metadata !200, metadata !201, metadata !207, metadata !208, metadata !215, metadata !221, metadata !230, metadata !238, metadata !246, metadata !255, metadata !260, metadata !261, metadata !268, metadata !274, metadata !279, metadata !280, metadata !287, metadata !293}
+!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"main", metadata !"main", metadata !"", i32 74, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !12, i32 74} ; [ DW_TAG_subprogram ] [line 74] [def] [main]
+!5 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!7 = metadata !{metadata !8, metadata !8, metadata !9}
+!8 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!9 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!10 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from char]
+!11 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_unsigned_char]
+!12 = metadata !{metadata !13, metadata !14, metadata !15, metadata !16, metadata !17, metadata !18, metadata !19, metadata !21, metadata !22, metadata !23, metadata !25, metadata !26}
+!13 = metadata !{i32 786689, metadata !4, metadata !"argc", metadata !5, i32 16777290, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argc] [line 74]
+!14 = metadata !{i32 786689, metadata !4, metadata !"argv", metadata !5, i32 33554506, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [argv] [line 74]
+!15 = metadata !{i32 786688, metadata !4, metadata !"niter", metadata !5, i32 76, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [niter] [line 76]
+!16 = metadata !{i32 786688, metadata !4, metadata !"step", metadata !5, i32 76, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [step] [line 76]
+!17 = metadata !{i32 786688, metadata !4, metadata !"n3", metadata !5, i32 76, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [n3] [line 76]
+!18 = metadata !{i32 786688, metadata !4, metadata !"nthreads", metadata !5, i32 77, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [nthreads] [line 77]
+!19 = metadata !{i32 786688, metadata !4, metadata !"navg", metadata !5, i32 78, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [navg] [line 78]
+!20 = metadata !{i32 786468, null, null, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ] [double] [line 0, size 64, align 64, offset 0, enc DW_ATE_float]
+!21 = metadata !{i32 786688, metadata !4, metadata !"mflops", metadata !5, i32 78, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [mflops] [line 78]
+!22 = metadata !{i32 786688, metadata !4, metadata !"tmax", metadata !5, i32 80, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [tmax] [line 80]
+!23 = metadata !{i32 786688, metadata !4, metadata !"verified", metadata !5, i32 81, metadata !24, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [verified] [line 81]
+!24 = metadata !{i32 786454, metadata !1, null, metadata !"boolean", i32 12, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_typedef ] [boolean] [line 12, size 0, align 0, offset 0] [from int]
+!25 = metadata !{i32 786688, metadata !4, metadata !"class", metadata !5, i32 82, metadata !11, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [class] [line 82]
+!26 = metadata !{i32 786688, metadata !4, metadata !"fp", metadata !5, i32 83, metadata !27, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [fp] [line 83]
+!27 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !28} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from FILE]
+!28 = metadata !{i32 786454, metadata !1, null, metadata !"FILE", i32 49, i64 0, i64 0, i64 0, i32 0, metadata !29} ; [ DW_TAG_typedef ] [FILE] [line 49, size 0, align 0, offset 0] [from _IO_FILE]
+!29 = metadata !{i32 786451, metadata !30, null, metadata !"_IO_FILE", i32 271, i64 1728, i64 64, i32 0, i32 0, null, metadata !31, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [_IO_FILE] [line 271, size 1728, align 64, offset 0] [def] [from ]
+!30 = metadata !{metadata !"/usr/include/libio.h", metadata !"/home/hfinkel/src/NPB2.3-omp-C/BT"}
+!31 = metadata !{metadata !32, metadata !33, metadata !34, metadata !35, metadata !36, metadata !37, metadata !38, metadata !39, metadata !40, metadata !41, metadata !42, metadata !43, metadata !44, metadata !52, metadata !53, metadata !54, metadata !55, metadata !58, metadata !60, metadata !62, metadata !66, metadata !68, metadata !70, metadata !71, metadata !72, metadata !73, metadata !74, metadata !77, metadata !78}
+!32 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_flags", i32 272, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_member ] [_flags] [line 272, size 32, align 32, offset 0] [from int]
+!33 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_read_ptr", i32 277, i64 64, i64 64, i64 64, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_read_ptr] [line 277, size 64, align 64, offset 64] [from ]
+!34 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_read_end", i32 278, i64 64, i64 64, i64 128, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_read_end] [line 278, size 64, align 64, offset 128] [from ]
+!35 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_read_base", i32 279, i64 64, i64 64, i64 192, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_read_base] [line 279, size 64, align 64, offset 192] [from ]
+!36 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_write_base", i32 280, i64 64, i64 64, i64 256, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_write_base] [line 280, size 64, align 64, offset 256] [from ]
+!37 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_write_ptr", i32 281, i64 64, i64 64, i64 320, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_write_ptr] [line 281, size 64, align 64, offset 320] [from ]
+!38 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_write_end", i32 282, i64 64, i64 64, i64 384, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_write_end] [line 282, size 64, align 64, offset 384] [from ]
+!39 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_buf_base", i32 283, i64 64, i64 64, i64 448, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_buf_base] [line 283, size 64, align 64, offset 448] [from ]
+!40 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_buf_end", i32 284, i64 64, i64 64, i64 512, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_buf_end] [line 284, size 64, align 64, offset 512] [from ]
+!41 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_save_base", i32 286, i64 64, i64 64, i64 576, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_save_base] [line 286, size 64, align 64, offset 576] [from ]
+!42 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_backup_base", i32 287, i64 64, i64 64, i64 640, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_backup_base] [line 287, size 64, align 64, offset 640] [from ]
+!43 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_IO_save_end", i32 288, i64 64, i64 64, i64 704, i32 0, metadata !10} ; [ DW_TAG_member ] [_IO_save_end] [line 288, size 64, align 64, offset 704] [from ]
+!44 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_markers", i32 290, i64 64, i64 64, i64 768, i32 0, metadata !45} ; [ DW_TAG_member ] [_markers] [line 290, size 64, align 64, offset 768] [from ]
+!45 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !46} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _IO_marker]
+!46 = metadata !{i32 786451, metadata !30, null, metadata !"_IO_marker", i32 186, i64 192, i64 64, i32 0, i32 0, null, metadata !47, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [_IO_marker] [line 186, size 192, align 64, offset 0] [def] [from ]
+!47 = metadata !{metadata !48, metadata !49, metadata !51}
+!48 = metadata !{i32 786445, metadata !30, metadata !46, metadata !"_next", i32 187, i64 64, i64 64, i64 0, i32 0, metadata !45} ; [ DW_TAG_member ] [_next] [line 187, size 64, align 64, offset 0] [from ]
+!49 = metadata !{i32 786445, metadata !30, metadata !46, metadata !"_sbuf", i32 188, i64 64, i64 64, i64 64, i32 0, metadata !50} ; [ DW_TAG_member ] [_sbuf] [line 188, size 64, align 64, offset 64] [from ]
+!50 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !29} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from _IO_FILE]
+!51 = metadata !{i32 786445, metadata !30, metadata !46, metadata !"_pos", i32 192, i64 32, i64 32, i64 128, i32 0, metadata !8} ; [ DW_TAG_member ] [_pos] [line 192, size 32, align 32, offset 128] [from int]
+!52 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_chain", i32 292, i64 64, i64 64, i64 832, i32 0, metadata !50} ; [ DW_TAG_member ] [_chain] [line 292, size 64, align 64, offset 832] [from ]
+!53 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_fileno", i32 294, i64 32, i64 32, i64 896, i32 0, metadata !8} ; [ DW_TAG_member ] [_fileno] [line 294, size 32, align 32, offset 896] [from int]
+!54 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_flags2", i32 298, i64 32, i64 32, i64 928, i32 0, metadata !8} ; [ DW_TAG_member ] [_flags2] [line 298, size 32, align 32, offset 928] [from int]
+!55 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_old_offset", i32 300, i64 64, i64 64, i64 960, i32 0, metadata !56} ; [ DW_TAG_member ] [_old_offset] [line 300, size 64, align 64, offset 960] [from __off_t]
+!56 = metadata !{i32 786454, metadata !30, null, metadata !"__off_t", i32 141, i64 0, i64 0, i64 0, i32 0, metadata !57} ; [ DW_TAG_typedef ] [__off_t] [line 141, size 0, align 0, offset 0] [from long int]
+!57 = metadata !{i32 786468, null, null, metadata !"long int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [long int] [line 0, size 64, align 64, offset 0, enc DW_ATE_signed]
+!58 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_cur_column", i32 304, i64 16, i64 16, i64 1024, i32 0, metadata !59} ; [ DW_TAG_member ] [_cur_column] [line 304, size 16, align 16, offset 1024] [from unsigned short]
+!59 = metadata !{i32 786468, null, null, metadata !"unsigned short", i32 0, i64 16, i64 16, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [unsigned short] [line 0, size 16, align 16, offset 0, enc DW_ATE_unsigned]
+!60 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_vtable_offset", i32 305, i64 8, i64 8, i64 1040, i32 0, metadata !61} ; [ DW_TAG_member ] [_vtable_offset] [line 305, size 8, align 8, offset 1040] [from signed char]
+!61 = metadata !{i32 786468, null, null, metadata !"signed char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [signed char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
+!62 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_shortbuf", i32 306, i64 8, i64 8, i64 1048, i32 0, metadata !63} ; [ DW_TAG_member ] [_shortbuf] [line 306, size 8, align 8, offset 1048] [from ]
+!63 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 8, i64 8, i32 0, i32 0, metadata !11, metadata !64, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
+!64 = metadata !{metadata !65}
+!65 = metadata !{i32 786465, i64 0, i64 1} ; [ DW_TAG_subrange_type ] [0, 0]
+!66 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_lock", i32 310, i64 64, i64 64, i64 1088, i32 0, metadata !67} ; [ DW_TAG_member ] [_lock] [line 310, size 64, align 64, offset 1088] [from ]
+!67 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!68 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_offset", i32 319, i64 64, i64 64, i64 1152, i32 0, metadata !69} ; [ DW_TAG_member ] [_offset] [line 319, size 64, align 64, offset 1152] [from __off64_t]
+!69 = metadata !{i32 786454, metadata !30, null, metadata !"__off64_t", i32 142, i64 0, i64 0, i64 0, i32 0, metadata !57} ; [ DW_TAG_typedef ] [__off64_t] [line 142, size 0, align 0, offset 0] [from long int]
+!70 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"__pad1", i32 328, i64 64, i64 64, i64 1216, i32 0, metadata !67} ; [ DW_TAG_member ] [__pad1] [line 328, size 64, align 64, offset 1216] [from ]
+!71 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"__pad2", i32 329, i64 64, i64 64, i64 1280, i32 0, metadata !67} ; [ DW_TAG_member ] [__pad2] [line 329, size 64, align 64, offset 1280] [from ]
+!72 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"__pad3", i32 330, i64 64, i64 64, i64 1344, i32 0, metadata !67} ; [ DW_TAG_member ] [__pad3] [line 330, size 64, align 64, offset 1344] [from ]
+!73 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"__pad4", i32 331, i64 64, i64 64, i64 1408, i32 0, metadata !67} ; [ DW_TAG_member ] [__pad4] [line 331, size 64, align 64, offset 1408] [from ]
+!74 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"__pad5", i32 332, i64 64, i64 64, i64 1472, i32 0, metadata !75} ; [ DW_TAG_member ] [__pad5] [line 332, size 64, align 64, offset 1472] [from size_t]
+!75 = metadata !{i32 786454, metadata !30, null, metadata !"size_t", i32 42, i64 0, i64 0, i64 0, i32 0, metadata !76} ; [ DW_TAG_typedef ] [size_t] [line 42, size 0, align 0, offset 0] [from long unsigned int]
+!76 = metadata !{i32 786468, null, null, metadata !"long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!77 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_mode", i32 334, i64 32, i64 32, i64 1536, i32 0, metadata !8} ; [ DW_TAG_member ] [_mode] [line 334, size 32, align 32, offset 1536] [from int]
+!78 = metadata !{i32 786445, metadata !30, metadata !29, metadata !"_unused2", i32 336, i64 160, i64 8, i64 1568, i32 0, metadata !79} ; [ DW_TAG_member ] [_unused2] [line 336, size 160, align 8, offset 1568] [from ]
+!79 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 160, i64 8, i32 0, i32 0, metadata !11, metadata !80, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
+!80 = metadata !{metadata !81}
+!81 = metadata !{i32 786465, i64 0, i64 20} ; [ DW_TAG_subrange_type ] [0, 19]
+!82 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"verify", metadata !"verify", metadata !"", i32 2388, metadata !83, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !86, i32 2388} ; [ DW_TAG_subprogram ] [line 2388] [local] [def] [verify]
+!83 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !84, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!84 = metadata !{null, metadata !8, metadata !10, metadata !85}
+!85 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !24} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from boolean]
+!86 = metadata !{metadata !87, metadata !88, metadata !89, metadata !90, metadata !94, metadata !95, metadata !96, metadata !97, metadata !98, metadata !99, metadata !100, metadata !101}
+!87 = metadata !{i32 786689, metadata !82, metadata !"no_time_steps", metadata !5, i32 16779604, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [no_time_steps] [line 2388]
+!88 = metadata !{i32 786689, metadata !82, metadata !"class", metadata !5, i32 33556820, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [class] [line 2388]
+!89 = metadata !{i32 786689, metadata !82, metadata !"verified", metadata !5, i32 50334036, metadata !85, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [verified] [line 2388]
+!90 = metadata !{i32 786688, metadata !82, metadata !"xcrref", metadata !5, i32 2397, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xcrref] [line 2397]
+!91 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 320, i64 64, i32 0, i32 0, metadata !20, metadata !92, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 320, align 64, offset 0] [from double]
+!92 = metadata !{metadata !93}
+!93 = metadata !{i32 786465, i64 0, i64 5} ; [ DW_TAG_subrange_type ] [0, 4]
+!94 = metadata !{i32 786688, metadata !82, metadata !"xceref", metadata !5, i32 2397, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xceref] [line 2397]
+!95 = metadata !{i32 786688, metadata !82, metadata !"xcrdif", metadata !5, i32 2397, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xcrdif] [line 2397]
+!96 = metadata !{i32 786688, metadata !82, metadata !"xcedif", metadata !5, i32 2397, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xcedif] [line 2397]
+!97 = metadata !{i32 786688, metadata !82, metadata !"epsilon", metadata !5, i32 2398, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [epsilon] [line 2398]
+!98 = metadata !{i32 786688, metadata !82, metadata !"xce", metadata !5, i32 2398, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xce] [line 2398]
+!99 = metadata !{i32 786688, metadata !82, metadata !"xcr", metadata !5, i32 2398, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xcr] [line 2398]
+!100 = metadata !{i32 786688, metadata !82, metadata !"dtref", metadata !5, i32 2398, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [dtref] [line 2398]
+!101 = metadata !{i32 786688, metadata !82, metadata !"m", metadata !5, i32 2399, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 2399]
+!102 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"rhs_norm", metadata !"rhs_norm", metadata !"", i32 266, metadata !103, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !106, i32 266} ; [ DW_TAG_subprogram ] [line 266] [local] [def] [rhs_norm]
+!103 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !104, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!104 = metadata !{null, metadata !105}
+!105 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !20} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from double]
+!106 = metadata !{metadata !107, metadata !108, metadata !109, metadata !110, metadata !111, metadata !112, metadata !113}
+!107 = metadata !{i32 786689, metadata !102, metadata !"rms", metadata !5, i32 16777482, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [rms] [line 266]
+!108 = metadata !{i32 786688, metadata !102, metadata !"i", metadata !5, i32 271, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 271]
+!109 = metadata !{i32 786688, metadata !102, metadata !"j", metadata !5, i32 271, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 271]
+!110 = metadata !{i32 786688, metadata !102, metadata !"k", metadata !5, i32 271, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 271]
+!111 = metadata !{i32 786688, metadata !102, metadata !"d", metadata !5, i32 271, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [d] [line 271]
+!112 = metadata !{i32 786688, metadata !102, metadata !"m", metadata !5, i32 271, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 271]
+!113 = metadata !{i32 786688, metadata !102, metadata !"add", metadata !5, i32 272, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [add] [line 272]
+!114 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"compute_rhs", metadata !"compute_rhs", metadata !"", i32 1767, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, void ()* @compute_rhs, null, null, metadata !117, i32 1767} ; [ DW_TAG_subprogram ] [line 1767] [local] [def] [compute_rhs]
+!115 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !116, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!116 = metadata !{null}
+!117 = metadata !{metadata !118, metadata !119, metadata !120, metadata !121, metadata !122, metadata !123, metadata !124, metadata !125, metadata !126, metadata !127, metadata !128, metadata !129, metadata !130, metadata !131}
+!118 = metadata !{i32 786688, metadata !114, metadata !"i", metadata !5, i32 1769, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 1769]
+!119 = metadata !{i32 786688, metadata !114, metadata !"j", metadata !5, i32 1769, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 1769]
+!120 = metadata !{i32 786688, metadata !114, metadata !"k", metadata !5, i32 1769, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 1769]
+!121 = metadata !{i32 786688, metadata !114, metadata !"m", metadata !5, i32 1769, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 1769]
+!122 = metadata !{i32 786688, metadata !114, metadata !"rho_inv", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [rho_inv] [line 1770]
+!123 = metadata !{i32 786688, metadata !114, metadata !"uijk", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [uijk] [line 1770]
+!124 = metadata !{i32 786688, metadata !114, metadata !"up1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [up1] [line 1770]
+!125 = metadata !{i32 786688, metadata !114, metadata !"um1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [um1] [line 1770]
+!126 = metadata !{i32 786688, metadata !114, metadata !"vijk", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [vijk] [line 1770]
+!127 = metadata !{i32 786688, metadata !114, metadata !"vp1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [vp1] [line 1770]
+!128 = metadata !{i32 786688, metadata !114, metadata !"vm1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [vm1] [line 1770]
+!129 = metadata !{i32 786688, metadata !114, metadata !"wijk", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [wijk] [line 1770]
+!130 = metadata !{i32 786688, metadata !114, metadata !"wp1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [wp1] [line 1770]
+!131 = metadata !{i32 786688, metadata !114, metadata !"wm1", metadata !5, i32 1770, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [wm1] [line 1770]
+!132 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"error_norm", metadata !"error_norm", metadata !"", i32 225, metadata !103, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !133, i32 225} ; [ DW_TAG_subprogram ] [line 225] [local] [def] [error_norm]
+!133 = metadata !{metadata !134, metadata !135, metadata !136, metadata !137, metadata !138, metadata !139, metadata !140, metadata !141, metadata !142, metadata !143, metadata !144}
+!134 = metadata !{i32 786689, metadata !132, metadata !"rms", metadata !5, i32 16777441, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [rms] [line 225]
+!135 = metadata !{i32 786688, metadata !132, metadata !"i", metadata !5, i32 232, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 232]
+!136 = metadata !{i32 786688, metadata !132, metadata !"j", metadata !5, i32 232, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 232]
+!137 = metadata !{i32 786688, metadata !132, metadata !"k", metadata !5, i32 232, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 232]
+!138 = metadata !{i32 786688, metadata !132, metadata !"m", metadata !5, i32 232, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 232]
+!139 = metadata !{i32 786688, metadata !132, metadata !"d", metadata !5, i32 232, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [d] [line 232]
+!140 = metadata !{i32 786688, metadata !132, metadata !"xi", metadata !5, i32 233, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xi] [line 233]
+!141 = metadata !{i32 786688, metadata !132, metadata !"eta", metadata !5, i32 233, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [eta] [line 233]
+!142 = metadata !{i32 786688, metadata !132, metadata !"zeta", metadata !5, i32 233, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [zeta] [line 233]
+!143 = metadata !{i32 786688, metadata !132, metadata !"u_exact", metadata !5, i32 233, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [u_exact] [line 233]
+!144 = metadata !{i32 786688, metadata !132, metadata !"add", metadata !5, i32 233, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [add] [line 233]
+!145 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"exact_solution", metadata !"exact_solution", metadata !"", i32 643, metadata !146, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !148, i32 644} ; [ DW_TAG_subprogram ] [line 643] [local] [def] [scope 644] [exact_solution]
+!146 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !147, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!147 = metadata !{null, metadata !20, metadata !20, metadata !20, metadata !105}
+!148 = metadata !{metadata !149, metadata !150, metadata !151, metadata !152, metadata !153}
+!149 = metadata !{i32 786689, metadata !145, metadata !"xi", metadata !5, i32 16777859, metadata !20, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [xi] [line 643]
+!150 = metadata !{i32 786689, metadata !145, metadata !"eta", metadata !5, i32 33555075, metadata !20, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [eta] [line 643]
+!151 = metadata !{i32 786689, metadata !145, metadata !"zeta", metadata !5, i32 50332291, metadata !20, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [zeta] [line 643]
+!152 = metadata !{i32 786689, metadata !145, metadata !"dtemp", metadata !5, i32 67109508, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [dtemp] [line 644]
+!153 = metadata !{i32 786688, metadata !145, metadata !"m", metadata !5, i32 653, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 653]
+!154 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"set_constants", metadata !"set_constants", metadata !"", i32 2191, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !2, i32 2191} ; [ DW_TAG_subprogram ] [line 2191] [local] [def] [set_constants]
+!155 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"lhsinit", metadata !"lhsinit", metadata !"", i32 855, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !156, i32 855} ; [ DW_TAG_subprogram ] [line 855] [local] [def] [lhsinit]
+!156 = metadata !{metadata !157, metadata !158, metadata !159, metadata !160, metadata !161}
+!157 = metadata !{i32 786688, metadata !155, metadata !"i", metadata !5, i32 857, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 857]
+!158 = metadata !{i32 786688, metadata !155, metadata !"j", metadata !5, i32 857, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 857]
+!159 = metadata !{i32 786688, metadata !155, metadata !"k", metadata !5, i32 857, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 857]
+!160 = metadata !{i32 786688, metadata !155, metadata !"m", metadata !5, i32 857, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 857]
+!161 = metadata !{i32 786688, metadata !155, metadata !"n", metadata !5, i32 857, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [n] [line 857]
+!162 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"initialize", metadata !"initialize", metadata !"", i32 669, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !163, i32 669} ; [ DW_TAG_subprogram ] [line 669] [local] [def] [initialize]
+!163 = metadata !{metadata !164, metadata !165, metadata !166, metadata !167, metadata !168, metadata !169, metadata !170, metadata !171, metadata !172, metadata !173, metadata !174, metadata !179, metadata !180, metadata !181, metadata !182}
+!164 = metadata !{i32 786688, metadata !162, metadata !"i", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 679]
+!165 = metadata !{i32 786688, metadata !162, metadata !"j", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 679]
+!166 = metadata !{i32 786688, metadata !162, metadata !"k", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 679]
+!167 = metadata !{i32 786688, metadata !162, metadata !"m", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 679]
+!168 = metadata !{i32 786688, metadata !162, metadata !"ix", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [ix] [line 679]
+!169 = metadata !{i32 786688, metadata !162, metadata !"iy", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [iy] [line 679]
+!170 = metadata !{i32 786688, metadata !162, metadata !"iz", metadata !5, i32 679, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [iz] [line 679]
+!171 = metadata !{i32 786688, metadata !162, metadata !"xi", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xi] [line 680]
+!172 = metadata !{i32 786688, metadata !162, metadata !"eta", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [eta] [line 680]
+!173 = metadata !{i32 786688, metadata !162, metadata !"zeta", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [zeta] [line 680]
+!174 = metadata !{i32 786688, metadata !162, metadata !"Pface", metadata !5, i32 680, metadata !175, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [Pface] [line 680]
+!175 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 1920, i64 64, i32 0, i32 0, metadata !20, metadata !176, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1920, align 64, offset 0] [from double]
+!176 = metadata !{metadata !177, metadata !178, metadata !93}
+!177 = metadata !{i32 786465, i64 0, i64 2} ; [ DW_TAG_subrange_type ] [0, 1]
+!178 = metadata !{i32 786465, i64 0, i64 3} ; [ DW_TAG_subrange_type ] [0, 2]
+!179 = metadata !{i32 786688, metadata !162, metadata !"Pxi", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [Pxi] [line 680]
+!180 = metadata !{i32 786688, metadata !162, metadata !"Peta", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [Peta] [line 680]
+!181 = metadata !{i32 786688, metadata !162, metadata !"Pzeta", metadata !5, i32 680, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [Pzeta] [line 680]
+!182 = metadata !{i32 786688, metadata !162, metadata !"temp", metadata !5, i32 680, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [temp] [line 680]
+!183 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"exact_rhs", metadata !"exact_rhs", metadata !"", i32 301, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !184, i32 301} ; [ DW_TAG_subprogram ] [line 301] [local] [def] [exact_rhs]
+!184 = metadata !{metadata !185, metadata !186, metadata !187, metadata !188, metadata !189, metadata !190, metadata !191, metadata !192, metadata !193, metadata !194, metadata !195, metadata !196, metadata !197, metadata !198, metadata !199}
+!185 = metadata !{i32 786688, metadata !183, metadata !"dtemp", metadata !5, i32 310, metadata !91, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [dtemp] [line 310]
+!186 = metadata !{i32 786688, metadata !183, metadata !"xi", metadata !5, i32 310, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [xi] [line 310]
+!187 = metadata !{i32 786688, metadata !183, metadata !"eta", metadata !5, i32 310, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [eta] [line 310]
+!188 = metadata !{i32 786688, metadata !183, metadata !"zeta", metadata !5, i32 310, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [zeta] [line 310]
+!189 = metadata !{i32 786688, metadata !183, metadata !"dtpp", metadata !5, i32 310, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [dtpp] [line 310]
+!190 = metadata !{i32 786688, metadata !183, metadata !"m", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 311]
+!191 = metadata !{i32 786688, metadata !183, metadata !"i", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 311]
+!192 = metadata !{i32 786688, metadata !183, metadata !"j", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 311]
+!193 = metadata !{i32 786688, metadata !183, metadata !"k", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 311]
+!194 = metadata !{i32 786688, metadata !183, metadata !"ip1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [ip1] [line 311]
+!195 = metadata !{i32 786688, metadata !183, metadata !"im1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [im1] [line 311]
+!196 = metadata !{i32 786688, metadata !183, metadata !"jp1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [jp1] [line 311]
+!197 = metadata !{i32 786688, metadata !183, metadata !"jm1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [jm1] [line 311]
+!198 = metadata !{i32 786688, metadata !183, metadata !"km1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [km1] [line 311]
+!199 = metadata !{i32 786688, metadata !183, metadata !"kp1", metadata !5, i32 311, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [kp1] [line 311]
+!200 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"adi", metadata !"adi", metadata !"", i32 210, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !2, i32 210} ; [ DW_TAG_subprogram ] [line 210] [local] [def] [adi]
+!201 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"add", metadata !"add", metadata !"", i32 187, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !202, i32 187} ; [ DW_TAG_subprogram ] [line 187] [local] [def] [add]
+!202 = metadata !{metadata !203, metadata !204, metadata !205, metadata !206}
+!203 = metadata !{i32 786688, metadata !201, metadata !"i", metadata !5, i32 193, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 193]
+!204 = metadata !{i32 786688, metadata !201, metadata !"j", metadata !5, i32 193, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 193]
+!205 = metadata !{i32 786688, metadata !201, metadata !"k", metadata !5, i32 193, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 193]
+!206 = metadata !{i32 786688, metadata !201, metadata !"m", metadata !5, i32 193, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 193]
+!207 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"z_solve", metadata !"z_solve", metadata !"", i32 3457, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !2, i32 3457} ; [ DW_TAG_subprogram ] [line 3457] [local] [def] [z_solve]
+!208 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"z_backsubstitute", metadata !"z_backsubstitute", metadata !"", i32 3480, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !209, i32 3480} ; [ DW_TAG_subprogram ] [line 3480] [local] [def] [z_backsubstitute]
+!209 = metadata !{metadata !210, metadata !211, metadata !212, metadata !213, metadata !214}
+!210 = metadata !{i32 786688, metadata !208, metadata !"i", metadata !5, i32 3492, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 3492]
+!211 = metadata !{i32 786688, metadata !208, metadata !"j", metadata !5, i32 3492, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 3492]
+!212 = metadata !{i32 786688, metadata !208, metadata !"k", metadata !5, i32 3492, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 3492]
+!213 = metadata !{i32 786688, metadata !208, metadata !"m", metadata !5, i32 3492, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 3492]
+!214 = metadata !{i32 786688, metadata !208, metadata !"n", metadata !5, i32 3492, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [n] [line 3492]
+!215 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"z_solve_cell", metadata !"z_solve_cell", metadata !"", i32 3512, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !216, i32 3512} ; [ DW_TAG_subprogram ] [line 3512] [local] [def] [z_solve_cell]
+!216 = metadata !{metadata !217, metadata !218, metadata !219, metadata !220}
+!217 = metadata !{i32 786688, metadata !215, metadata !"i", metadata !5, i32 3527, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 3527]
+!218 = metadata !{i32 786688, metadata !215, metadata !"j", metadata !5, i32 3527, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 3527]
+!219 = metadata !{i32 786688, metadata !215, metadata !"k", metadata !5, i32 3527, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 3527]
+!220 = metadata !{i32 786688, metadata !215, metadata !"ksize", metadata !5, i32 3527, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [ksize] [line 3527]
+!221 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"binvrhs", metadata !"binvrhs", metadata !"", i32 3154, metadata !222, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !225, i32 3154} ; [ DW_TAG_subprogram ] [line 3154] [local] [def] [binvrhs]
+!222 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !223, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!223 = metadata !{null, metadata !224, metadata !105}
+!224 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !91} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from ]
+!225 = metadata !{metadata !226, metadata !227, metadata !228, metadata !229}
+!226 = metadata !{i32 786689, metadata !221, metadata !"lhs", metadata !5, i32 16780370, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [lhs] [line 3154]
+!227 = metadata !{i32 786689, metadata !221, metadata !"r", metadata !5, i32 33557586, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [r] [line 3154]
+!228 = metadata !{i32 786688, metadata !221, metadata !"pivot", metadata !5, i32 3159, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [pivot] [line 3159]
+!229 = metadata !{i32 786688, metadata !221, metadata !"coeff", metadata !5, i32 3159, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [coeff] [line 3159]
+!230 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"matmul_sub", metadata !"matmul_sub", metadata !"", i32 2841, metadata !231, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !233, i32 2842} ; [ DW_TAG_subprogram ] [line 2841] [local] [def] [scope 2842] [matmul_sub]
+!231 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !232, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!232 = metadata !{null, metadata !224, metadata !224, metadata !224}
+!233 = metadata !{metadata !234, metadata !235, metadata !236, metadata !237}
+!234 = metadata !{i32 786689, metadata !230, metadata !"ablock", metadata !5, i32 16780057, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [ablock] [line 2841]
+!235 = metadata !{i32 786689, metadata !230, metadata !"bblock", metadata !5, i32 33557273, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [bblock] [line 2841]
+!236 = metadata !{i32 786689, metadata !230, metadata !"cblock", metadata !5, i32 50334490, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [cblock] [line 2842]
+!237 = metadata !{i32 786688, metadata !230, metadata !"j", metadata !5, i32 2851, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 2851]
+!238 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"matvec_sub", metadata !"matvec_sub", metadata !"", i32 2814, metadata !239, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !241, i32 2814} ; [ DW_TAG_subprogram ] [line 2814] [local] [def] [matvec_sub]
+!239 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !240, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!240 = metadata !{null, metadata !224, metadata !105, metadata !105}
+!241 = metadata !{metadata !242, metadata !243, metadata !244, metadata !245}
+!242 = metadata !{i32 786689, metadata !238, metadata !"ablock", metadata !5, i32 16780030, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [ablock] [line 2814]
+!243 = metadata !{i32 786689, metadata !238, metadata !"avec", metadata !5, i32 33557246, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [avec] [line 2814]
+!244 = metadata !{i32 786689, metadata !238, metadata !"bvec", metadata !5, i32 50334462, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [bvec] [line 2814]
+!245 = metadata !{i32 786688, metadata !238, metadata !"i", metadata !5, i32 2823, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 2823]
+!246 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"binvcrhs", metadata !"binvcrhs", metadata !"", i32 2885, metadata !247, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !249, i32 2885} ; [ DW_TAG_subprogram ] [line 2885] [local] [def] [binvcrhs]
+!247 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !248, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!248 = metadata !{null, metadata !224, metadata !224, metadata !105}
+!249 = metadata !{metadata !250, metadata !251, metadata !252, metadata !253, metadata !254}
+!250 = metadata !{i32 786689, metadata !246, metadata !"lhs", metadata !5, i32 16780101, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [lhs] [line 2885]
+!251 = metadata !{i32 786689, metadata !246, metadata !"c", metadata !5, i32 33557317, metadata !224, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [c] [line 2885]
+!252 = metadata !{i32 786689, metadata !246, metadata !"r", metadata !5, i32 50334533, metadata !105, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [r] [line 2885]
+!253 = metadata !{i32 786688, metadata !246, metadata !"pivot", metadata !5, i32 2890, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [pivot] [line 2890]
+!254 = metadata !{i32 786688, metadata !246, metadata !"coeff", metadata !5, i32 2890, metadata !20, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [coeff] [line 2890]
+!255 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"lhsz", metadata !"lhsz", metadata !"", i32 1475, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !256, i32 1475} ; [ DW_TAG_subprogram ] [line 1475] [local] [def] [lhsz]
+!256 = metadata !{metadata !257, metadata !258, metadata !259}
+!257 = metadata !{i32 786688, metadata !255, metadata !"i", metadata !5, i32 1484, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 1484]
+!258 = metadata !{i32 786688, metadata !255, metadata !"j", metadata !5, i32 1484, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 1484]
+!259 = metadata !{i32 786688, metadata !255, metadata !"k", metadata !5, i32 1484, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 1484]
+!260 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"y_solve", metadata !"y_solve", metadata !"", i32 3299, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !2, i32 3299} ; [ DW_TAG_subprogram ] [line 3299] [local] [def] [y_solve]
+!261 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"y_backsubstitute", metadata !"y_backsubstitute", metadata !"", i32 3323, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !262, i32 3323} ; [ DW_TAG_subprogram ] [line 3323] [local] [def] [y_backsubstitute]
+!262 = metadata !{metadata !263, metadata !264, metadata !265, metadata !266, metadata !267}
+!263 = metadata !{i32 786688, metadata !261, metadata !"i", metadata !5, i32 3335, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 3335]
+!264 = metadata !{i32 786688, metadata !261, metadata !"j", metadata !5, i32 3335, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 3335]
+!265 = metadata !{i32 786688, metadata !261, metadata !"k", metadata !5, i32 3335, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 3335]
+!266 = metadata !{i32 786688, metadata !261, metadata !"m", metadata !5, i32 3335, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 3335]
+!267 = metadata !{i32 786688, metadata !261, metadata !"n", metadata !5, i32 3335, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [n] [line 3335]
+!268 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"y_solve_cell", metadata !"y_solve_cell", metadata !"", i32 3355, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !269, i32 3355} ; [ DW_TAG_subprogram ] [line 3355] [local] [def] [y_solve_cell]
+!269 = metadata !{metadata !270, metadata !271, metadata !272, metadata !273}
+!270 = metadata !{i32 786688, metadata !268, metadata !"i", metadata !5, i32 3370, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 3370]
+!271 = metadata !{i32 786688, metadata !268, metadata !"j", metadata !5, i32 3370, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 3370]
+!272 = metadata !{i32 786688, metadata !268, metadata !"k", metadata !5, i32 3370, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 3370]
+!273 = metadata !{i32 786688, metadata !268, metadata !"jsize", metadata !5, i32 3370, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [jsize] [line 3370]
+!274 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"lhsy", metadata !"lhsy", metadata !"", i32 1181, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !275, i32 1181} ; [ DW_TAG_subprogram ] [line 1181] [local] [def] [lhsy]
+!275 = metadata !{metadata !276, metadata !277, metadata !278}
+!276 = metadata !{i32 786688, metadata !274, metadata !"i", metadata !5, i32 1190, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 1190]
+!277 = metadata !{i32 786688, metadata !274, metadata !"j", metadata !5, i32 1190, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 1190]
+!278 = metadata !{i32 786688, metadata !274, metadata !"k", metadata !5, i32 1190, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 1190]
+!279 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"x_solve", metadata !"x_solve", metadata !"", i32 2658, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !2, i32 2658} ; [ DW_TAG_subprogram ] [line 2658] [local] [def] [x_solve]
+!280 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"x_backsubstitute", metadata !"x_backsubstitute", metadata !"", i32 2684, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !281, i32 2684} ; [ DW_TAG_subprogram ] [line 2684] [local] [def] [x_backsubstitute]
+!281 = metadata !{metadata !282, metadata !283, metadata !284, metadata !285, metadata !286}
+!282 = metadata !{i32 786688, metadata !280, metadata !"i", metadata !5, i32 2696, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 2696]
+!283 = metadata !{i32 786688, metadata !280, metadata !"j", metadata !5, i32 2696, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 2696]
+!284 = metadata !{i32 786688, metadata !280, metadata !"k", metadata !5, i32 2696, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 2696]
+!285 = metadata !{i32 786688, metadata !280, metadata !"m", metadata !5, i32 2696, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [m] [line 2696]
+!286 = metadata !{i32 786688, metadata !280, metadata !"n", metadata !5, i32 2696, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [n] [line 2696]
+!287 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"x_solve_cell", metadata !"x_solve_cell", metadata !"", i32 2716, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !288, i32 2716} ; [ DW_TAG_subprogram ] [line 2716] [local] [def] [x_solve_cell]
+!288 = metadata !{metadata !289, metadata !290, metadata !291, metadata !292}
+!289 = metadata !{i32 786688, metadata !287, metadata !"i", metadata !5, i32 2728, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 2728]
+!290 = metadata !{i32 786688, metadata !287, metadata !"j", metadata !5, i32 2728, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 2728]
+!291 = metadata !{i32 786688, metadata !287, metadata !"k", metadata !5, i32 2728, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 2728]
+!292 = metadata !{i32 786688, metadata !287, metadata !"isize", metadata !5, i32 2728, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [isize] [line 2728]
+!293 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"lhsx", metadata !"lhsx", metadata !"", i32 898, metadata !115, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, metadata !294, i32 898} ; [ DW_TAG_subprogram ] [line 898] [local] [def] [lhsx]
+!294 = metadata !{metadata !295, metadata !296, metadata !297}
+!295 = metadata !{i32 786688, metadata !293, metadata !"i", metadata !5, i32 907, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [i] [line 907]
+!296 = metadata !{i32 786688, metadata !293, metadata !"j", metadata !5, i32 907, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [j] [line 907]
+!297 = metadata !{i32 786688, metadata !293, metadata !"k", metadata !5, i32 907, metadata !8, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [k] [line 907]
+!298 = metadata !{metadata !299, metadata !304, metadata !305, metadata !309, metadata !310, metadata !311, metadata !312, metadata !313, metadata !314, metadata !315, metadata !316, metadata !317, metadata !318, metadata !319, metadata !320, metadata !321, metadata !322, metadata !323, metadata !324, metadata !325, metadata !326, metadata !327, metadata !328, metadata !329, metadata !330, metadata !331, metadata !332, metadata !333, metadata !334, metadata !335, metadata !336, metadata !337, metadata !338, metadata !339, metadata !340, metadata !341, metadata !342, metadata !343, metadata !347, metadata !350, metadata !351, metadata !352, metadata !353, metadata !354, metadata !355, metadata !356, metadata !360, metadata !361, metadata !362, metadata !363, metadata !364, metadata !365, metadata !366, metadata !367, metadata !368, metadata !369, metadata !370, metadata !371, metadata !372, metadata !373, metadata !374, metadata !375, metadata !376, metadata !377, metadata !378, metadata !379, metadata !380, metadata !381, metadata !382, metadata !383, metadata !384, metadata !385, metadata !386, metadata !387, metadata !388, metadata !389, metadata !390, metadata !391, metadata !392, metadata !393, metadata !394, metadata !395, metadata !396, metadata !397, metadata !398, metadata !399, metadata !400, metadata !401, metadata !402, metadata !403, metadata !404, metadata !405, metadata !406, metadata !407, metadata !408, metadata !409, metadata !410, metadata !411, metadata !412, metadata !413, metadata !414, metadata !415, metadata !416, metadata !417, metadata !418, metadata !419, metadata !422, metadata !426, metadata !427, metadata !430, metadata !431, metadata !434, metadata !435, metadata !436, metadata !437}
+!299 = metadata !{i32 786484, i32 0, null, metadata !"grid_points", metadata !"grid_points", metadata !"", metadata !300, i32 28, metadata !302, i32 1, i32 1, [3 x i32]* @grid_points, null} ; [ DW_TAG_variable ] [grid_points] [line 28] [local] [def]
+!300 = metadata !{i32 786473, metadata !301} ; [ DW_TAG_file_type ] [/home/hfinkel/src/NPB2.3-omp-C/BT/./header.h]
+!301 = metadata !{metadata !"./header.h", metadata !"/home/hfinkel/src/NPB2.3-omp-C/BT"}
+!302 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 96, i64 32, i32 0, i32 0, metadata !8, metadata !303, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 96, align 32, offset 0] [from int]
+!303 = metadata !{metadata !178}
+!304 = metadata !{i32 786484, i32 0, null, metadata !"dt", metadata !"dt", metadata !"", metadata !300, i32 35, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dt] [line 35] [local] [def]
+!305 = metadata !{i32 786484, i32 0, null, metadata !"rhs", metadata !"rhs", metadata !"", metadata !300, i32 68, metadata !306, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [rhs] [line 68] [local] [def]
+!306 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 1385839040, i64 64, i32 0, i32 0, metadata !20, metadata !307, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1385839040, align 64, offset 0] [from double]
+!307 = metadata !{metadata !308, metadata !308, metadata !308, metadata !93}
+!308 = metadata !{i32 786465, i64 0, i64 163} ; [ DW_TAG_subrange_type ] [0, 162]
+!309 = metadata !{i32 786484, i32 0, null, metadata !"zzcon5", metadata !"zzcon5", metadata !"", metadata !300, i32 42, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [zzcon5] [line 42] [local] [def]
+!310 = metadata !{i32 786484, i32 0, null, metadata !"zzcon4", metadata !"zzcon4", metadata !"", metadata !300, i32 42, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [zzcon4] [line 42] [local] [def]
+!311 = metadata !{i32 786484, i32 0, null, metadata !"zzcon3", metadata !"zzcon3", metadata !"", metadata !300, i32 42, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [zzcon3] [line 42] [local] [def]
+!312 = metadata !{i32 786484, i32 0, null, metadata !"dz5tz1", metadata !"dz5tz1", metadata !"", metadata !300, i32 43, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz5tz1] [line 43] [local] [def]
+!313 = metadata !{i32 786484, i32 0, null, metadata !"dz4tz1", metadata !"dz4tz1", metadata !"", metadata !300, i32 43, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz4tz1] [line 43] [local] [def]
+!314 = metadata !{i32 786484, i32 0, null, metadata !"dz3tz1", metadata !"dz3tz1", metadata !"", metadata !300, i32 43, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz3tz1] [line 43] [local] [def]
+!315 = metadata !{i32 786484, i32 0, null, metadata !"zzcon2", metadata !"zzcon2", metadata !"", metadata !300, i32 42, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [zzcon2] [line 42] [local] [def]
+!316 = metadata !{i32 786484, i32 0, null, metadata !"dz2tz1", metadata !"dz2tz1", metadata !"", metadata !300, i32 43, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz2tz1] [line 43] [local] [def]
+!317 = metadata !{i32 786484, i32 0, null, metadata !"tz2", metadata !"tz2", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tz2] [line 31] [local] [def]
+!318 = metadata !{i32 786484, i32 0, null, metadata !"dz1tz1", metadata !"dz1tz1", metadata !"", metadata !300, i32 43, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz1tz1] [line 43] [local] [def]
+!319 = metadata !{i32 786484, i32 0, null, metadata !"yycon5", metadata !"yycon5", metadata !"", metadata !300, i32 40, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [yycon5] [line 40] [local] [def]
+!320 = metadata !{i32 786484, i32 0, null, metadata !"yycon4", metadata !"yycon4", metadata !"", metadata !300, i32 40, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [yycon4] [line 40] [local] [def]
+!321 = metadata !{i32 786484, i32 0, null, metadata !"yycon3", metadata !"yycon3", metadata !"", metadata !300, i32 40, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [yycon3] [line 40] [local] [def]
+!322 = metadata !{i32 786484, i32 0, null, metadata !"dy5ty1", metadata !"dy5ty1", metadata !"", metadata !300, i32 41, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy5ty1] [line 41] [local] [def]
+!323 = metadata !{i32 786484, i32 0, null, metadata !"dy4ty1", metadata !"dy4ty1", metadata !"", metadata !300, i32 41, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy4ty1] [line 41] [local] [def]
+!324 = metadata !{i32 786484, i32 0, null, metadata !"dy3ty1", metadata !"dy3ty1", metadata !"", metadata !300, i32 41, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy3ty1] [line 41] [local] [def]
+!325 = metadata !{i32 786484, i32 0, null, metadata !"yycon2", metadata !"yycon2", metadata !"", metadata !300, i32 40, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [yycon2] [line 40] [local] [def]
+!326 = metadata !{i32 786484, i32 0, null, metadata !"dy2ty1", metadata !"dy2ty1", metadata !"", metadata !300, i32 41, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy2ty1] [line 41] [local] [def]
+!327 = metadata !{i32 786484, i32 0, null, metadata !"ty2", metadata !"ty2", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ty2] [line 31] [local] [def]
+!328 = metadata !{i32 786484, i32 0, null, metadata !"dy1ty1", metadata !"dy1ty1", metadata !"", metadata !300, i32 41, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy1ty1] [line 41] [local] [def]
+!329 = metadata !{i32 786484, i32 0, null, metadata !"dssp", metadata !"dssp", metadata !"", metadata !300, i32 35, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dssp] [line 35] [local] [def]
+!330 = metadata !{i32 786484, i32 0, null, metadata !"c1", metadata !"c1", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c1] [line 45] [local] [def]
+!331 = metadata !{i32 786484, i32 0, null, metadata !"xxcon5", metadata !"xxcon5", metadata !"", metadata !300, i32 38, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [xxcon5] [line 38] [local] [def]
+!332 = metadata !{i32 786484, i32 0, null, metadata !"xxcon4", metadata !"xxcon4", metadata !"", metadata !300, i32 38, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [xxcon4] [line 38] [local] [def]
+!333 = metadata !{i32 786484, i32 0, null, metadata !"xxcon3", metadata !"xxcon3", metadata !"", metadata !300, i32 38, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [xxcon3] [line 38] [local] [def]
+!334 = metadata !{i32 786484, i32 0, null, metadata !"dx5tx1", metadata !"dx5tx1", metadata !"", metadata !300, i32 39, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx5tx1] [line 39] [local] [def]
+!335 = metadata !{i32 786484, i32 0, null, metadata !"dx4tx1", metadata !"dx4tx1", metadata !"", metadata !300, i32 39, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx4tx1] [line 39] [local] [def]
+!336 = metadata !{i32 786484, i32 0, null, metadata !"dx3tx1", metadata !"dx3tx1", metadata !"", metadata !300, i32 39, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx3tx1] [line 39] [local] [def]
+!337 = metadata !{i32 786484, i32 0, null, metadata !"c2", metadata !"c2", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c2] [line 45] [local] [def]
+!338 = metadata !{i32 786484, i32 0, null, metadata !"con43", metadata !"con43", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [con43] [line 48] [local] [def]
+!339 = metadata !{i32 786484, i32 0, null, metadata !"xxcon2", metadata !"xxcon2", metadata !"", metadata !300, i32 38, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [xxcon2] [line 38] [local] [def]
+!340 = metadata !{i32 786484, i32 0, null, metadata !"dx2tx1", metadata !"dx2tx1", metadata !"", metadata !300, i32 39, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx2tx1] [line 39] [local] [def]
+!341 = metadata !{i32 786484, i32 0, null, metadata !"tx2", metadata !"tx2", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tx2] [line 31] [local] [def]
+!342 = metadata !{i32 786484, i32 0, null, metadata !"dx1tx1", metadata !"dx1tx1", metadata !"", metadata !300, i32 39, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx1tx1] [line 39] [local] [def]
+!343 = metadata !{i32 786484, i32 0, null, metadata !"forcing", metadata !"forcing", metadata !"", metadata !300, i32 66, metadata !344, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [forcing] [line 66] [local] [def]
+!344 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 1663006848, i64 64, i32 0, i32 0, metadata !20, metadata !345, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 1663006848, align 64, offset 0] [from double]
+!345 = metadata !{metadata !308, metadata !308, metadata !308, metadata !346}
+!346 = metadata !{i32 786465, i64 0, i64 6} ; [ DW_TAG_subrange_type ] [0, 5]
+!347 = metadata !{i32 786484, i32 0, null, metadata !"qs", metadata !"qs", metadata !"", metadata !300, i32 63, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [qs] [line 63] [local] [def]
+!348 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 277167808, i64 64, i32 0, i32 0, metadata !20, metadata !349, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 277167808, align 64, offset 0] [from double]
+!349 = metadata !{metadata !308, metadata !308, metadata !308}
+!350 = metadata !{i32 786484, i32 0, null, metadata !"square", metadata !"square", metadata !"", metadata !300, i32 65, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [square] [line 65] [local] [def]
+!351 = metadata !{i32 786484, i32 0, null, metadata !"ws", metadata !"ws", metadata !"", metadata !300, i32 62, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ws] [line 62] [local] [def]
+!352 = metadata !{i32 786484, i32 0, null, metadata !"vs", metadata !"vs", metadata !"", metadata !300, i32 61, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [vs] [line 61] [local] [def]
+!353 = metadata !{i32 786484, i32 0, null, metadata !"us", metadata !"us", metadata !"", metadata !300, i32 60, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [us] [line 60] [local] [def]
+!354 = metadata !{i32 786484, i32 0, null, metadata !"rho_i", metadata !"rho_i", metadata !"", metadata !300, i32 64, metadata !348, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [rho_i] [line 64] [local] [def]
+!355 = metadata !{i32 786484, i32 0, null, metadata !"u", metadata !"u", metadata !"", metadata !300, i32 67, metadata !306, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [u] [line 67] [local] [def]
+!356 = metadata !{i32 786484, i32 0, null, metadata !"ce", metadata !"ce", metadata !"", metadata !300, i32 36, metadata !357, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ce] [line 36] [local] [def]
+!357 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 4160, i64 64, i32 0, i32 0, metadata !20, metadata !358, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 4160, align 64, offset 0] [from double]
+!358 = metadata !{metadata !93, metadata !359}
+!359 = metadata !{i32 786465, i64 0, i64 13} ; [ DW_TAG_subrange_type ] [0, 12]
+!360 = metadata !{i32 786484, i32 0, null, metadata !"dnzm1", metadata !"dnzm1", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dnzm1] [line 44] [local] [def]
+!361 = metadata !{i32 786484, i32 0, null, metadata !"dnym1", metadata !"dnym1", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dnym1] [line 44] [local] [def]
+!362 = metadata !{i32 786484, i32 0, null, metadata !"dnxm1", metadata !"dnxm1", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dnxm1] [line 44] [local] [def]
+!363 = metadata !{i32 786484, i32 0, null, metadata !"zzcon1", metadata !"zzcon1", metadata !"", metadata !300, i32 42, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [zzcon1] [line 42] [local] [def]
+!364 = metadata !{i32 786484, i32 0, null, metadata !"yycon1", metadata !"yycon1", metadata !"", metadata !300, i32 40, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [yycon1] [line 40] [local] [def]
+!365 = metadata !{i32 786484, i32 0, null, metadata !"xxcon1", metadata !"xxcon1", metadata !"", metadata !300, i32 38, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [xxcon1] [line 38] [local] [def]
+!366 = metadata !{i32 786484, i32 0, null, metadata !"con16", metadata !"con16", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [con16] [line 48] [local] [def]
+!367 = metadata !{i32 786484, i32 0, null, metadata !"c2iv", metadata !"c2iv", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c2iv] [line 48] [local] [def]
+!368 = metadata !{i32 786484, i32 0, null, metadata !"c3c4tz3", metadata !"c3c4tz3", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c3c4tz3] [line 48] [local] [def]
+!369 = metadata !{i32 786484, i32 0, null, metadata !"c3c4ty3", metadata !"c3c4ty3", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c3c4ty3] [line 48] [local] [def]
+!370 = metadata !{i32 786484, i32 0, null, metadata !"c3c4tx3", metadata !"c3c4tx3", metadata !"", metadata !300, i32 48, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c3c4tx3] [line 48] [local] [def]
+!371 = metadata !{i32 786484, i32 0, null, metadata !"comz6", metadata !"comz6", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [comz6] [line 47] [local] [def]
+!372 = metadata !{i32 786484, i32 0, null, metadata !"comz5", metadata !"comz5", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [comz5] [line 47] [local] [def]
+!373 = metadata !{i32 786484, i32 0, null, metadata !"comz4", metadata !"comz4", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [comz4] [line 47] [local] [def]
+!374 = metadata !{i32 786484, i32 0, null, metadata !"comz1", metadata !"comz1", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [comz1] [line 47] [local] [def]
+!375 = metadata !{i32 786484, i32 0, null, metadata !"dtdssp", metadata !"dtdssp", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dtdssp] [line 45] [local] [def]
+!376 = metadata !{i32 786484, i32 0, null, metadata !"c2dttz1", metadata !"c2dttz1", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c2dttz1] [line 47] [local] [def]
+!377 = metadata !{i32 786484, i32 0, null, metadata !"c2dtty1", metadata !"c2dtty1", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c2dtty1] [line 47] [local] [def]
+!378 = metadata !{i32 786484, i32 0, null, metadata !"c2dttx1", metadata !"c2dttx1", metadata !"", metadata !300, i32 47, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c2dttx1] [line 47] [local] [def]
+!379 = metadata !{i32 786484, i32 0, null, metadata !"dttz2", metadata !"dttz2", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dttz2] [line 46] [local] [def]
+!380 = metadata !{i32 786484, i32 0, null, metadata !"dttz1", metadata !"dttz1", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dttz1] [line 46] [local] [def]
+!381 = metadata !{i32 786484, i32 0, null, metadata !"dtty2", metadata !"dtty2", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dtty2] [line 46] [local] [def]
+!382 = metadata !{i32 786484, i32 0, null, metadata !"dtty1", metadata !"dtty1", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dtty1] [line 46] [local] [def]
+!383 = metadata !{i32 786484, i32 0, null, metadata !"dttx2", metadata !"dttx2", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dttx2] [line 46] [local] [def]
+!384 = metadata !{i32 786484, i32 0, null, metadata !"dttx1", metadata !"dttx1", metadata !"", metadata !300, i32 46, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dttx1] [line 46] [local] [def]
+!385 = metadata !{i32 786484, i32 0, null, metadata !"c5dssp", metadata !"c5dssp", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c5dssp] [line 45] [local] [def]
+!386 = metadata !{i32 786484, i32 0, null, metadata !"c4dssp", metadata !"c4dssp", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c4dssp] [line 45] [local] [def]
+!387 = metadata !{i32 786484, i32 0, null, metadata !"dzmax", metadata !"dzmax", metadata !"", metadata !300, i32 37, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dzmax] [line 37] [local] [def]
+!388 = metadata !{i32 786484, i32 0, null, metadata !"dymax", metadata !"dymax", metadata !"", metadata !300, i32 37, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dymax] [line 37] [local] [def]
+!389 = metadata !{i32 786484, i32 0, null, metadata !"dxmax", metadata !"dxmax", metadata !"", metadata !300, i32 37, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dxmax] [line 37] [local] [def]
+!390 = metadata !{i32 786484, i32 0, null, metadata !"dz5", metadata !"dz5", metadata !"", metadata !300, i32 34, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz5] [line 34] [local] [def]
+!391 = metadata !{i32 786484, i32 0, null, metadata !"dz4", metadata !"dz4", metadata !"", metadata !300, i32 34, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz4] [line 34] [local] [def]
+!392 = metadata !{i32 786484, i32 0, null, metadata !"dz3", metadata !"dz3", metadata !"", metadata !300, i32 34, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz3] [line 34] [local] [def]
+!393 = metadata !{i32 786484, i32 0, null, metadata !"dz2", metadata !"dz2", metadata !"", metadata !300, i32 34, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz2] [line 34] [local] [def]
+!394 = metadata !{i32 786484, i32 0, null, metadata !"dz1", metadata !"dz1", metadata !"", metadata !300, i32 34, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dz1] [line 34] [local] [def]
+!395 = metadata !{i32 786484, i32 0, null, metadata !"dy5", metadata !"dy5", metadata !"", metadata !300, i32 33, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy5] [line 33] [local] [def]
+!396 = metadata !{i32 786484, i32 0, null, metadata !"dy4", metadata !"dy4", metadata !"", metadata !300, i32 33, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy4] [line 33] [local] [def]
+!397 = metadata !{i32 786484, i32 0, null, metadata !"dy3", metadata !"dy3", metadata !"", metadata !300, i32 33, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy3] [line 33] [local] [def]
+!398 = metadata !{i32 786484, i32 0, null, metadata !"dy2", metadata !"dy2", metadata !"", metadata !300, i32 33, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy2] [line 33] [local] [def]
+!399 = metadata !{i32 786484, i32 0, null, metadata !"dy1", metadata !"dy1", metadata !"", metadata !300, i32 33, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dy1] [line 33] [local] [def]
+!400 = metadata !{i32 786484, i32 0, null, metadata !"dx5", metadata !"dx5", metadata !"", metadata !300, i32 32, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx5] [line 32] [local] [def]
+!401 = metadata !{i32 786484, i32 0, null, metadata !"dx4", metadata !"dx4", metadata !"", metadata !300, i32 32, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx4] [line 32] [local] [def]
+!402 = metadata !{i32 786484, i32 0, null, metadata !"dx3", metadata !"dx3", metadata !"", metadata !300, i32 32, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx3] [line 32] [local] [def]
+!403 = metadata !{i32 786484, i32 0, null, metadata !"dx2", metadata !"dx2", metadata !"", metadata !300, i32 32, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx2] [line 32] [local] [def]
+!404 = metadata !{i32 786484, i32 0, null, metadata !"dx1", metadata !"dx1", metadata !"", metadata !300, i32 32, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [dx1] [line 32] [local] [def]
+!405 = metadata !{i32 786484, i32 0, null, metadata !"tz3", metadata !"tz3", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tz3] [line 31] [local] [def]
+!406 = metadata !{i32 786484, i32 0, null, metadata !"tz1", metadata !"tz1", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tz1] [line 31] [local] [def]
+!407 = metadata !{i32 786484, i32 0, null, metadata !"ty3", metadata !"ty3", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ty3] [line 31] [local] [def]
+!408 = metadata !{i32 786484, i32 0, null, metadata !"ty1", metadata !"ty1", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ty1] [line 31] [local] [def]
+!409 = metadata !{i32 786484, i32 0, null, metadata !"tx3", metadata !"tx3", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tx3] [line 31] [local] [def]
+!410 = metadata !{i32 786484, i32 0, null, metadata !"tx1", metadata !"tx1", metadata !"", metadata !300, i32 31, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tx1] [line 31] [local] [def]
+!411 = metadata !{i32 786484, i32 0, null, metadata !"conz1", metadata !"conz1", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [conz1] [line 45] [local] [def]
+!412 = metadata !{i32 786484, i32 0, null, metadata !"c1345", metadata !"c1345", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c1345] [line 44] [local] [def]
+!413 = metadata !{i32 786484, i32 0, null, metadata !"c3c4", metadata !"c3c4", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c3c4] [line 44] [local] [def]
+!414 = metadata !{i32 786484, i32 0, null, metadata !"c1c5", metadata !"c1c5", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c1c5] [line 44] [local] [def]
+!415 = metadata !{i32 786484, i32 0, null, metadata !"c1c2", metadata !"c1c2", metadata !"", metadata !300, i32 44, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c1c2] [line 44] [local] [def]
+!416 = metadata !{i32 786484, i32 0, null, metadata !"c5", metadata !"c5", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c5] [line 45] [local] [def]
+!417 = metadata !{i32 786484, i32 0, null, metadata !"c4", metadata !"c4", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c4] [line 45] [local] [def]
+!418 = metadata !{i32 786484, i32 0, null, metadata !"c3", metadata !"c3", metadata !"", metadata !300, i32 45, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [c3] [line 45] [local] [def]
+!419 = metadata !{i32 786484, i32 0, null, metadata !"lhs", metadata !"lhs", metadata !"", metadata !300, i32 69, metadata !420, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [lhs] [line 69] [local] [def]
+!420 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 20787585600, i64 64, i32 0, i32 0, metadata !20, metadata !421, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 20787585600, align 64, offset 0] [from double]
+!421 = metadata !{metadata !308, metadata !308, metadata !308, metadata !178, metadata !93, metadata !93}
+!422 = metadata !{i32 786484, i32 0, null, metadata !"q", metadata !"q", metadata !"", metadata !300, i32 73, metadata !423, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [q] [line 73] [local] [def]
+!423 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 10368, i64 64, i32 0, i32 0, metadata !20, metadata !424, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 10368, align 64, offset 0] [from double]
+!424 = metadata !{metadata !425}
+!425 = metadata !{i32 786465, i64 0, i64 162} ; [ DW_TAG_subrange_type ] [0, 161]
+!426 = metadata !{i32 786484, i32 0, null, metadata !"cuf", metadata !"cuf", metadata !"", metadata !300, i32 72, metadata !423, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [cuf] [line 72] [local] [def]
+!427 = metadata !{i32 786484, i32 0, null, metadata !"buf", metadata !"buf", metadata !"", metadata !300, i32 75, metadata !428, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [buf] [line 75] [local] [def]
+!428 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 51840, i64 64, i32 0, i32 0, metadata !20, metadata !429, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 51840, align 64, offset 0] [from double]
+!429 = metadata !{metadata !425, metadata !93}
+!430 = metadata !{i32 786484, i32 0, null, metadata !"ue", metadata !"ue", metadata !"", metadata !300, i32 74, metadata !428, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [ue] [line 74] [local] [def]
+!431 = metadata !{i32 786484, i32 0, null, metadata !"njac", metadata !"njac", metadata !"", metadata !300, i32 86, metadata !432, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [njac] [line 86] [local] [def]
+!432 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 6886684800, i64 64, i32 0, i32 0, metadata !20, metadata !433, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 6886684800, align 64, offset 0] [from double]
+!433 = metadata !{metadata !308, metadata !308, metadata !425, metadata !93, metadata !93}
+!434 = metadata !{i32 786484, i32 0, null, metadata !"fjac", metadata !"fjac", metadata !"", metadata !300, i32 84, metadata !432, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [fjac] [line 84] [local] [def]
+!435 = metadata !{i32 786484, i32 0, null, metadata !"tmp3", metadata !"tmp3", metadata !"", metadata !300, i32 88, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tmp3] [line 88] [local] [def]
+!436 = metadata !{i32 786484, i32 0, null, metadata !"tmp2", metadata !"tmp2", metadata !"", metadata !300, i32 88, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tmp2] [line 88] [local] [def]
+!437 = metadata !{i32 786484, i32 0, null, metadata !"tmp1", metadata !"tmp1", metadata !"", metadata !300, i32 88, metadata !20, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] [tmp1] [line 88] [local] [def]
+!438 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!439 = metadata !{i32 1898, i32 0, metadata !440, null}
+!440 = metadata !{i32 786443, metadata !1, metadata !114, i32 1898, i32 0, i32 107} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!441 = metadata !{i32 1913, i32 0, metadata !442, null}
+!442 = metadata !{i32 786443, metadata !1, metadata !114, i32 1913, i32 0, i32 115} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!443 = metadata !{i32 1923, i32 0, metadata !114, null}
+!444 = metadata !{metadata !"int", metadata !445}
+!445 = metadata !{metadata !"omnipotent char", metadata !446}
+!446 = metadata !{metadata !"Simple C/C++ TBAA"}
+!447 = metadata !{i32 1}
+!448 = metadata !{i32 1925, i32 0, metadata !449, null}
+!449 = metadata !{i32 786443, metadata !1, metadata !114, i32 1925, i32 0, i32 121} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!450 = metadata !{i32 1939, i32 0, metadata !451, null}
+!451 = metadata !{i32 786443, metadata !1, metadata !114, i32 1939, i32 0, i32 127} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!452 = metadata !{i32 1940, i32 0, metadata !453, null}
+!453 = metadata !{i32 786443, metadata !1, metadata !454, i32 1940, i32 0, i32 129} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!454 = metadata !{i32 786443, metadata !1, metadata !451, i32 1939, i32 0, i32 128} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!455 = metadata !{i32 1941, i32 0, metadata !456, null}
+!456 = metadata !{i32 786443, metadata !1, metadata !457, i32 1941, i32 0, i32 131} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!457 = metadata !{i32 786443, metadata !1, metadata !453, i32 1940, i32 0, i32 130} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!458 = metadata !{i32 2020, i32 0, metadata !459, null}
+!459 = metadata !{i32 786443, metadata !1, metadata !460, i32 2020, i32 0, i32 149} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!460 = metadata !{i32 786443, metadata !1, metadata !461, i32 2019, i32 0, i32 148} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!461 = metadata !{i32 786443, metadata !1, metadata !462, i32 2019, i32 0, i32 147} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!462 = metadata !{i32 786443, metadata !1, metadata !463, i32 2018, i32 0, i32 146} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!463 = metadata !{i32 786443, metadata !1, metadata !114, i32 2018, i32 0, i32 145} ; [ DW_TAG_lexical_block ] [/home/hfinkel/src/NPB2.3-omp-C/BT/bt.c]
+!464 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/PowerPC/pr17354.ll b/test/CodeGen/PowerPC/pr17354.ll
new file mode 100644
index 0000000..dca81b1
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr17354.ll
@@ -0,0 +1,39 @@
+; RUN: llc -mcpu=pwr7 -relocation-model=pic <%s | FileCheck %s
+
+; Test that PR17354 is fixed. We must generate a nop following even
+; local calls when generating code for shared libraries, to permit
+; TOC fixup.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.CS = type { i32 }
+
+@_ZL3glb = internal global [1 x %struct.CS] zeroinitializer, align 4
+@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @_GLOBAL__I_a }]
+
+define internal void @__cxx_global_var_init() section ".text.startup" {
+entry:
+ call void @_Z4funcv(%struct.CS* sret getelementptr inbounds ([1 x %struct.CS]* @_ZL3glb, i64 0, i64 0))
+ ret void
+}
+
+; CHECK-LABEL: __cxx_global_var_init:
+; CHECK: bl _Z4funcv
+; CHECK-NEXT: nop
+
+; Function Attrs: nounwind
+define void @_Z4funcv(%struct.CS* noalias sret %agg.result) #0 {
+entry:
+ %a_ = getelementptr inbounds %struct.CS* %agg.result, i32 0, i32 0
+ store i32 0, i32* %a_, align 4
+ ret void
+}
+
+define internal void @_GLOBAL__I_a() section ".text.startup" {
+entry:
+ call void @__cxx_global_var_init()
+ ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/PowerPC/recipest.ll b/test/CodeGen/PowerPC/recipest.ll
index 38d7682..891e801 100644
--- a/test/CodeGen/PowerPC/recipest.ll
+++ b/test/CodeGen/PowerPC/recipest.ll
@@ -169,6 +169,7 @@ entry:
ret double %r
; CHECK: @foo3
+; CHECK: fcmpu
; CHECK-DAG: frsqrte
; CHECK-DAG: fnmsub
; CHECK: fmul
@@ -195,6 +196,7 @@ entry:
ret float %r
; CHECK: @goo3
+; CHECK: fcmpu
; CHECK-DAG: frsqrtes
; CHECK-DAG: fnmsubs
; CHECK: fmuls
@@ -217,7 +219,8 @@ entry:
; CHECK: @hoo3
; CHECK: vrsqrtefp
-; CHECK: vrefp
+; CHECK-DAG: vrefp
+; CHECK-DAG: vcmpeqfp
; CHECK-SAFE: @hoo3
; CHECK-SAFE-NOT: vrsqrtefp
diff --git a/test/CodeGen/PowerPC/reg-names.ll b/test/CodeGen/PowerPC/reg-names.ll
new file mode 100644
index 0000000..f8fa7e4
--- /dev/null
+++ b/test/CodeGen/PowerPC/reg-names.ll
@@ -0,0 +1,17 @@
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -ppc-asm-full-reg-names < %s | FileCheck -check-prefix=CHECK-FN %s
+
+define i64 @test1(i64 %a, i64 %b) {
+; CHECK-LABEL: @test1
+; CHECK-FN-LABEL: @test1
+
+entry:
+ ret i64 %b
+
+; CHECK: mr 3, 4
+; CHECK-FN: mr r3, r4
+
+; CHECK: blr
+; CHECK-FN: blr
+}
+
diff --git a/test/CodeGen/PowerPC/reloc-align.ll b/test/CodeGen/PowerPC/reloc-align.ll
index bd5c4d6..13d6ada 100644
--- a/test/CodeGen/PowerPC/reloc-align.ll
+++ b/test/CodeGen/PowerPC/reloc-align.ll
@@ -31,4 +31,4 @@ entry:
ret i32 %bf.cast
}
-attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/PowerPC/rlwimi-and.ll b/test/CodeGen/PowerPC/rlwimi-and.ll
index e20a13f..7963249 100644
--- a/test/CodeGen/PowerPC/rlwimi-and.ll
+++ b/test/CodeGen/PowerPC/rlwimi-and.ll
@@ -28,12 +28,11 @@ codeRepl17: ; preds = %codeRepl4
store i16 %rvml38.sroa.0.0.insert.insert, i16* undef, align 2
unreachable
+; FIXME: the SLWI could be folded into the RLWIMI to give a rotate of 8.
; CHECK: @test
-; CHECK-DAG: slwi [[R1:[0-9]+]],
-; CHECK-DAG: rlwinm [[R2:[0-9]+]],
-; CHECK-DAG: srawi [[R3:[0-9]+]], [[R1]]
-; CHECK-DAG: rlwinm [[R4:[0-9]+]], [[R3]], 0, 23, 23
-; CHECK: rlwimi [[R4]], [[R2]], 0,
+; CHECK-DAG: slwi [[R1:[0-9]+]], {{[0-9]+}}, 31
+; CHECK-DAG: rlwinm [[R2:[0-9]+]], {{[0-9]+}}, 0, 31, 31
+; CHECK: rlwimi [[R2]], [[R1]], 9, 23, 23
codeRepl29: ; preds = %codeRepl1
unreachable
diff --git a/test/CodeGen/PowerPC/rounding-ops.ll b/test/CodeGen/PowerPC/rounding-ops.ll
index 2c02900..bf0a641 100644
--- a/test/CodeGen/PowerPC/rounding-ops.ll
+++ b/test/CodeGen/PowerPC/rounding-ops.ll
@@ -1,5 +1,4 @@
; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 | FileCheck %s
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -enable-unsafe-fp-math | FileCheck -check-prefix=CHECK-FM %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
@@ -9,9 +8,6 @@ define float @test1(float %x) nounwind {
; CHECK-LABEL: test1:
; CHECK: frim 1, 1
-
-; CHECK-FM-LABEL: test1:
-; CHECK-FM: frim 1, 1
}
declare float @floorf(float) nounwind readnone
@@ -22,38 +18,29 @@ define double @test2(double %x) nounwind {
; CHECK-LABEL: test2:
; CHECK: frim 1, 1
-
-; CHECK-FM-LABEL: test2:
-; CHECK-FM: frim 1, 1
}
declare double @floor(double) nounwind readnone
define float @test3(float %x) nounwind {
- %call = tail call float @nearbyintf(float %x) nounwind readnone
+ %call = tail call float @roundf(float %x) nounwind readnone
ret float %call
; CHECK-LABEL: test3:
-; CHECK-NOT: frin
-
-; CHECK-FM-LABEL: test3:
-; CHECK-FM: frin 1, 1
+; CHECK: frin 1, 1
}
-declare float @nearbyintf(float) nounwind readnone
+declare float @roundf(float) nounwind readnone
define double @test4(double %x) nounwind {
- %call = tail call double @nearbyint(double %x) nounwind readnone
+ %call = tail call double @round(double %x) nounwind readnone
ret double %call
; CHECK-LABEL: test4:
-; CHECK-NOT: frin
-
-; CHECK-FM-LABEL: test4:
-; CHECK-FM: frin 1, 1
+; CHECK: frin 1, 1
}
-declare double @nearbyint(double) nounwind readnone
+declare double @round(double) nounwind readnone
define float @test5(float %x) nounwind {
%call = tail call float @ceilf(float %x) nounwind readnone
@@ -61,9 +48,6 @@ define float @test5(float %x) nounwind {
; CHECK-LABEL: test5:
; CHECK: frip 1, 1
-
-; CHECK-FM-LABEL: test5:
-; CHECK-FM: frip 1, 1
}
declare float @ceilf(float) nounwind readnone
@@ -74,9 +58,6 @@ define double @test6(double %x) nounwind {
; CHECK-LABEL: test6:
; CHECK: frip 1, 1
-
-; CHECK-FM-LABEL: test6:
-; CHECK-FM: frip 1, 1
}
declare double @ceil(double) nounwind readnone
@@ -87,9 +68,6 @@ define float @test9(float %x) nounwind {
; CHECK-LABEL: test9:
; CHECK: friz 1, 1
-
-; CHECK-FM-LABEL: test9:
-; CHECK-FM: friz 1, 1
}
declare float @truncf(float) nounwind readnone
@@ -100,48 +78,7 @@ define double @test10(double %x) nounwind {
; CHECK-LABEL: test10:
; CHECK: friz 1, 1
-
-; CHECK-FM-LABEL: test10:
-; CHECK-FM: friz 1, 1
}
declare double @trunc(double) nounwind readnone
-define void @test11(float %x, float* %y) nounwind {
- %call = tail call float @rintf(float %x) nounwind readnone
- store float %call, float* %y
- ret void
-
-; CHECK-LABEL: test11:
-; CHECK-NOT: frin
-
-; CHECK-FM-LABEL: test11:
-; CHECK-FM: frin [[R2:[0-9]+]], [[R1:[0-9]+]]
-; CHECK-FM: fcmpu [[CR:[0-9]+]], [[R2]], [[R1]]
-; CHECK-FM: beq [[CR]], .LBB[[BB:[0-9]+]]_2
-; CHECK-FM: mtfsb1 6
-; CHECK-FM: .LBB[[BB]]_2:
-; CHECK-FM: blr
-}
-
-declare float @rintf(float) nounwind readnone
-
-define void @test12(double %x, double* %y) nounwind {
- %call = tail call double @rint(double %x) nounwind readnone
- store double %call, double* %y
- ret void
-
-; CHECK-LABEL: test12:
-; CHECK-NOT: frin
-
-; CHECK-FM-LABEL: test12:
-; CHECK-FM: frin [[R2:[0-9]+]], [[R1:[0-9]+]]
-; CHECK-FM: fcmpu [[CR:[0-9]+]], [[R2]], [[R1]]
-; CHECK-FM: beq [[CR]], .LBB[[BB:[0-9]+]]_2
-; CHECK-FM: mtfsb1 6
-; CHECK-FM: .LBB[[BB]]_2:
-; CHECK-FM: blr
-}
-
-declare double @rint(double) nounwind readnone
-
diff --git a/test/CodeGen/PowerPC/sjlj.ll b/test/CodeGen/PowerPC/sjlj.ll
index 571f3b2..414640b 100644
--- a/test/CodeGen/PowerPC/sjlj.ll
+++ b/test/CodeGen/PowerPC/sjlj.ll
@@ -64,15 +64,16 @@ return: ; preds = %if.end, %if.then
; CHECK: std
; Make sure that we're not saving VRSAVE on non-Darwin:
; CHECK-NOT: mfspr
-; CHECK: stfd
-; CHECK: stvx
-; CHECK: addis [[REG:[0-9]+]], 2, env_sigill@toc@ha
-; CHECK: std 31, env_sigill@toc@l([[REG]])
-; CHECK: addi [[REG]], [[REG]], env_sigill@toc@l
-; CHECK: std [[REG]], [[OFF:[0-9]+]](31) # 8-byte Folded Spill
-; CHECK: std 1, 16([[REG]])
-; CHECK: std 2, 24([[REG]])
+; CHECK-DAG: stfd
+; CHECK-DAG: stvx
+
+; CHECK-DAG: addis [[REG:[0-9]+]], 2, env_sigill@toc@ha
+; CHECK-DAG: std 31, env_sigill@toc@l([[REG]])
+; CHECK-DAG: addi [[REGA:[0-9]+]], [[REG]], env_sigill@toc@l
+; CHECK-DAG: std [[REGA]], [[OFF:[0-9]+]](31) # 8-byte Folded Spill
+; CHECK-DAG: std 1, 16([[REGA]])
+; CHECK-DAG: std 2, 24([[REGA]])
; CHECK: bcl 20, 31, .LBB1_1
; CHECK: li 3, 1
; CHECK: #EH_SjLj_Setup .LBB1_1
@@ -134,11 +135,11 @@ return: ; preds = %if.end, %if.then
; CHECK: addis [[REG:[0-9]+]], 2, env_sigill@toc@ha
; CHECK: std 31, env_sigill@toc@l([[REG]])
-; CHECK: addi [[REG]], [[REG]], env_sigill@toc@l
-; CHECK: std [[REG]], [[OFF:[0-9]+]](31) # 8-byte Folded Spill
-; CHECK: std 1, 16([[REG]])
-; CHECK: std 2, 24([[REG]])
-; CHECK: std 30, 32([[REG]])
+; CHECK: addi [[REGB:[0-9]+]], [[REG]], env_sigill@toc@l
+; CHECK-DAG: std [[REGB]], [[OFF:[0-9]+]](31) # 8-byte Folded Spill
+; CHECK-DAG: std 1, 16([[REGB]])
+; CHECK-DAG: std 2, 24([[REGB]])
+; CHECK-DAG: std 30, 32([[REGB]])
; CHECK: bcl 20, 31,
; CHECK: blr
@@ -152,7 +153,7 @@ declare i8* @llvm.stacksave() #3
declare i32 @llvm.eh.sjlj.setjmp(i8*) #3
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { noreturn nounwind }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind }
diff --git a/test/CodeGen/PowerPC/stack-realign.ll b/test/CodeGen/PowerPC/stack-realign.ll
index f7b6d19..1c7a36a 100644
--- a/test/CodeGen/PowerPC/stack-realign.ll
+++ b/test/CodeGen/PowerPC/stack-realign.ll
@@ -11,13 +11,13 @@ define void @goo(%struct.s* byval nocapture readonly %a) {
entry:
%x = alloca [2 x i32], align 32
%a1 = getelementptr inbounds %struct.s* %a, i64 0, i32 0
- %0 = load i32* %a1, align 4, !tbaa !0
+ %0 = load i32* %a1, align 4
%arrayidx = getelementptr inbounds [2 x i32]* %x, i64 0, i64 0
- store i32 %0, i32* %arrayidx, align 32, !tbaa !0
+ store i32 %0, i32* %arrayidx, align 32
%b = getelementptr inbounds %struct.s* %a, i64 0, i32 1
- %1 = load i32* %b, align 4, !tbaa !0
+ %1 = load i32* %b, align 4
%arrayidx2 = getelementptr inbounds [2 x i32]* %x, i64 0, i64 1
- store i32 %1, i32* %arrayidx2, align 4, !tbaa !0
+ store i32 %1, i32* %arrayidx2, align 4
call void @bar(i32* %arrayidx)
ret void
}
@@ -74,13 +74,13 @@ define void @hoo(%struct.s* byval nocapture readonly %a) {
entry:
%x = alloca [200000 x i32], align 32
%a1 = getelementptr inbounds %struct.s* %a, i64 0, i32 0
- %0 = load i32* %a1, align 4, !tbaa !0
+ %0 = load i32* %a1, align 4
%arrayidx = getelementptr inbounds [200000 x i32]* %x, i64 0, i64 0
- store i32 %0, i32* %arrayidx, align 32, !tbaa !0
+ store i32 %0, i32* %arrayidx, align 32
%b = getelementptr inbounds %struct.s* %a, i64 0, i32 1
- %1 = load i32* %b, align 4, !tbaa !0
+ %1 = load i32* %b, align 4
%arrayidx2 = getelementptr inbounds [200000 x i32]* %x, i64 0, i64 1
- store i32 %1, i32* %arrayidx2, align 4, !tbaa !0
+ store i32 %1, i32* %arrayidx2, align 4
call void @bar(i32* %arrayidx)
ret void
}
@@ -105,13 +105,13 @@ define void @loo(%struct.s* byval nocapture readonly %a) {
entry:
%x = alloca [2 x i32], align 32
%a1 = getelementptr inbounds %struct.s* %a, i64 0, i32 0
- %0 = load i32* %a1, align 4, !tbaa !0
+ %0 = load i32* %a1, align 4
%arrayidx = getelementptr inbounds [2 x i32]* %x, i64 0, i64 0
- store i32 %0, i32* %arrayidx, align 32, !tbaa !0
+ store i32 %0, i32* %arrayidx, align 32
%b = getelementptr inbounds %struct.s* %a, i64 0, i32 1
- %1 = load i32* %b, align 4, !tbaa !0
+ %1 = load i32* %b, align 4
%arrayidx2 = getelementptr inbounds [2 x i32]* %x, i64 0, i64 1
- store i32 %1, i32* %arrayidx2, align 4, !tbaa !0
+ store i32 %1, i32* %arrayidx2, align 4
call void @bar(i32* %arrayidx)
call void asm sideeffect "", "~{f30}"() nounwind
ret void
@@ -145,7 +145,3 @@ entry:
; CHECK-FP: stfd 30, -16(30)
; CHECK-FP: blr
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/PowerPC/subsumes-pred-regs.ll b/test/CodeGen/PowerPC/subsumes-pred-regs.ll
new file mode 100644
index 0000000..97ac788
--- /dev/null
+++ b/test/CodeGen/PowerPC/subsumes-pred-regs.ll
@@ -0,0 +1,65 @@
+; RUN: llc < %s -mcpu=ppc64 | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define zeroext i1 @test1() unnamed_addr #0 align 2 {
+
+; CHECK-LABEL: @test1
+
+entry:
+ br i1 undef, label %lor.end, label %lor.rhs
+
+lor.rhs: ; preds = %entry
+ unreachable
+
+lor.end: ; preds = %entry
+ br i1 undef, label %land.rhs, label %if.then
+
+if.then: ; preds = %lor.end
+ br i1 undef, label %return, label %if.end.i24
+
+if.end.i24: ; preds = %if.then
+ %0 = load i32* undef, align 4
+ %lnot.i.i16.i23 = icmp eq i32 %0, 0
+ br i1 %lnot.i.i16.i23, label %if.end7.i37, label %test.exit27.i34
+
+test.exit27.i34: ; preds = %if.end.i24
+ br i1 undef, label %return, label %if.end7.i37
+
+if.end7.i37: ; preds = %test.exit27.i34, %if.end.i24
+ %tobool.i.i36 = icmp eq i8 undef, 0
+ br i1 %tobool.i.i36, label %return, label %if.then9.i39
+
+if.then9.i39: ; preds = %if.end7.i37
+ br i1 %lnot.i.i16.i23, label %return, label %lor.rhs.i.i49
+
+; CHECK: .LBB0_7:
+; CHECK: beq 1, .LBB0_10
+; CHECK: beq 0, .LBB0_10
+; CHECK: .LBB0_9:
+
+lor.rhs.i.i49: ; preds = %if.then9.i39
+ %cmp.i.i.i.i48 = icmp ne i64 undef, 0
+ br label %return
+
+land.rhs: ; preds = %lor.end
+ br i1 undef, label %return, label %if.end.i
+
+if.end.i: ; preds = %land.rhs
+ br i1 undef, label %return, label %if.then9.i
+
+if.then9.i: ; preds = %if.end.i
+ br i1 undef, label %return, label %lor.rhs.i.i
+
+lor.rhs.i.i: ; preds = %if.then9.i
+ %cmp.i.i.i.i = icmp ne i64 undef, 0
+ br label %return
+
+return: ; preds = %lor.rhs.i.i, %if.then9.i, %if.end.i, %land.rhs, %lor.rhs.i.i49, %if.then9.i39, %if.end7.i37, %test.exit27.i34, %if.then
+ %retval.0 = phi i1 [ false, %if.then ], [ false, %test.exit27.i34 ], [ true, %if.end7.i37 ], [ true, %if.then9.i39 ], [ %cmp.i.i.i.i48, %lor.rhs.i.i49 ], [ false, %land.rhs ], [ true, %if.end.i ], [ true, %if.then9.i ], [ %cmp.i.i.i.i, %lor.rhs.i.i ]
+ ret i1 %retval.0
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/tls-gd-obj.ll b/test/CodeGen/PowerPC/tls-gd-obj.ll
deleted file mode 100644
index 26cb6f2..0000000
--- a/test/CodeGen/PowerPC/tls-gd-obj.ll
+++ /dev/null
@@ -1,31 +0,0 @@
-; RUN: llc -mcpu=pwr7 -O0 -filetype=obj -relocation-model=pic %s -o - | \
-; RUN: llvm-readobj -r | FileCheck %s
-
-; Test correct relocation generation for thread-local storage using
-; the general dynamic model and integrated assembly.
-
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-linux-gnu"
-
-@a = thread_local global i32 0, align 4
-
-define signext i32 @main() nounwind {
-entry:
- %retval = alloca i32, align 4
- store i32 0, i32* %retval
- %0 = load i32* @a, align 4
- ret i32 %0
-}
-
-; Verify generation of R_PPC64_GOT_TLSGD16_HA, R_PPC64_GOT_TLSGD16_LO,
-; and R_PPC64_TLSGD for accessing external variable a, and R_PPC64_REL24
-; for the call to __tls_get_addr.
-;
-; CHECK: Relocations [
-; CHECK: Section (2) .rela.text {
-; CHECK: 0x{{[0-9,A-F]+}} R_PPC64_GOT_TLSGD16_HA a
-; CHECK: 0x{{[0-9,A-F]+}} R_PPC64_GOT_TLSGD16_LO a
-; CHECK: 0x{{[0-9,A-F]+}} R_PPC64_TLSGD a
-; CHECK: 0x{{[0-9,A-F]+}} R_PPC64_REL24 __tls_get_addr
-; CHECK: }
-; CHECK: ]
diff --git a/test/CodeGen/PowerPC/tls-ie-obj.ll b/test/CodeGen/PowerPC/tls-ie-obj.ll
deleted file mode 100644
index f24a94b..0000000
--- a/test/CodeGen/PowerPC/tls-ie-obj.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; RUN: llc -mcpu=pwr7 -O0 -filetype=obj %s -o - | \
-; RUN: llvm-readobj -r | FileCheck %s
-
-; Test correct relocation generation for thread-local storage
-; using the initial-exec model and integrated assembly.
-
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-linux-gnu"
-
-@a = external thread_local global i32
-
-define signext i32 @main() nounwind {
-entry:
- %retval = alloca i32, align 4
- store i32 0, i32* %retval
- %0 = load i32* @a, align 4
- ret i32 %0
-}
-
-; Verify generation of R_PPC64_GOT_TPREL16_DS and R_PPC64_TLS for
-; accessing external variable a.
-;
-; CHECK: Relocations [
-; CHECK: Section (2) .rela.text {
-; CHECK: 0x{{[0-9,A-F]+}} R_PPC64_GOT_TPREL16_HA a
-; CHECK: 0x{{[0-9,A-F]+}} R_PPC64_GOT_TPREL16_LO_DS a
-; CHECK: 0x{{[0-9,A-F]+}} R_PPC64_TLS a
-; CHECK: }
-; CHECK: ]
diff --git a/test/CodeGen/PowerPC/tls-ld-obj.ll b/test/CodeGen/PowerPC/tls-ld-obj.ll
deleted file mode 100644
index 4a7d7b3..0000000
--- a/test/CodeGen/PowerPC/tls-ld-obj.ll
+++ /dev/null
@@ -1,34 +0,0 @@
-; RUN: llc -mcpu=pwr7 -O0 -filetype=obj -relocation-model=pic %s -o - | \
-; RUN: llvm-readobj -r | FileCheck %s
-
-; Test correct relocation generation for thread-local storage using
-; the local dynamic model.
-
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-linux-gnu"
-
-@a = hidden thread_local global i32 0, align 4
-
-define signext i32 @main() nounwind {
-entry:
- %retval = alloca i32, align 4
- store i32 0, i32* %retval
- %0 = load i32* @a, align 4
- ret i32 %0
-}
-
-; Verify generation of R_PPC64_GOT_TLSLD16_HA, R_PPC64_GOT_TLSLD16_LO,
-; R_PPC64_TLSLD, R_PPC64_DTPREL16_HA, and R_PPC64_DTPREL16_LO for
-; accessing external variable a, and R_PPC64_REL24 for the call to
-; __tls_get_addr.
-;
-; CHECK: Relocations [
-; CHECK: Section (2) .rela.text {
-; CHECK: 0x{{[0-9,A-F]+}} R_PPC64_GOT_TLSLD16_HA a
-; CHECK: 0x{{[0-9,A-F]+}} R_PPC64_GOT_TLSLD16_LO a
-; CHECK: 0x{{[0-9,A-F]+}} R_PPC64_TLSLD a
-; CHECK: 0x{{[0-9,A-F]+}} R_PPC64_REL24 __tls_get_addr
-; CHECK: 0x{{[0-9,A-F]+}} R_PPC64_DTPREL16_HA a
-; CHECK: 0x{{[0-9,A-F]+}} R_PPC64_DTPREL16_LO a
-; CHECK: }
-; CHECK: ]
diff --git a/test/CodeGen/PowerPC/unal-altivec2.ll b/test/CodeGen/PowerPC/unal-altivec2.ll
new file mode 100644
index 0000000..7464675
--- /dev/null
+++ b/test/CodeGen/PowerPC/unal-altivec2.ll
@@ -0,0 +1,166 @@
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @foo(float* noalias nocapture %x, float* noalias nocapture readonly %y) #0 {
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+; CHECK-LABEL: @foo
+; CHECK: lvsl
+; CHECK: blr
+ %index = phi i64 [ 0, %entry ], [ %index.next.15, %vector.body ]
+ %0 = getelementptr inbounds float* %y, i64 %index
+ %1 = bitcast float* %0 to <4 x float>*
+ %wide.load = load <4 x float>* %1, align 4
+ %2 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load)
+ %3 = getelementptr inbounds float* %x, i64 %index
+ %4 = bitcast float* %3 to <4 x float>*
+ store <4 x float> %2, <4 x float>* %4, align 4
+ %index.next = add i64 %index, 4
+ %5 = getelementptr inbounds float* %y, i64 %index.next
+ %6 = bitcast float* %5 to <4 x float>*
+ %wide.load.1 = load <4 x float>* %6, align 4
+ %7 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.1)
+ %8 = getelementptr inbounds float* %x, i64 %index.next
+ %9 = bitcast float* %8 to <4 x float>*
+ store <4 x float> %7, <4 x float>* %9, align 4
+ %index.next.1 = add i64 %index.next, 4
+ %10 = getelementptr inbounds float* %y, i64 %index.next.1
+ %11 = bitcast float* %10 to <4 x float>*
+ %wide.load.2 = load <4 x float>* %11, align 4
+ %12 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.2)
+ %13 = getelementptr inbounds float* %x, i64 %index.next.1
+ %14 = bitcast float* %13 to <4 x float>*
+ store <4 x float> %12, <4 x float>* %14, align 4
+ %index.next.2 = add i64 %index.next.1, 4
+ %15 = getelementptr inbounds float* %y, i64 %index.next.2
+ %16 = bitcast float* %15 to <4 x float>*
+ %wide.load.3 = load <4 x float>* %16, align 4
+ %17 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.3)
+ %18 = getelementptr inbounds float* %x, i64 %index.next.2
+ %19 = bitcast float* %18 to <4 x float>*
+ store <4 x float> %17, <4 x float>* %19, align 4
+ %index.next.3 = add i64 %index.next.2, 4
+ %20 = getelementptr inbounds float* %y, i64 %index.next.3
+ %21 = bitcast float* %20 to <4 x float>*
+ %wide.load.4 = load <4 x float>* %21, align 4
+ %22 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.4)
+ %23 = getelementptr inbounds float* %x, i64 %index.next.3
+ %24 = bitcast float* %23 to <4 x float>*
+ store <4 x float> %22, <4 x float>* %24, align 4
+ %index.next.4 = add i64 %index.next.3, 4
+ %25 = getelementptr inbounds float* %y, i64 %index.next.4
+ %26 = bitcast float* %25 to <4 x float>*
+ %wide.load.5 = load <4 x float>* %26, align 4
+ %27 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.5)
+ %28 = getelementptr inbounds float* %x, i64 %index.next.4
+ %29 = bitcast float* %28 to <4 x float>*
+ store <4 x float> %27, <4 x float>* %29, align 4
+ %index.next.5 = add i64 %index.next.4, 4
+ %30 = getelementptr inbounds float* %y, i64 %index.next.5
+ %31 = bitcast float* %30 to <4 x float>*
+ %wide.load.6 = load <4 x float>* %31, align 4
+ %32 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.6)
+ %33 = getelementptr inbounds float* %x, i64 %index.next.5
+ %34 = bitcast float* %33 to <4 x float>*
+ store <4 x float> %32, <4 x float>* %34, align 4
+ %index.next.6 = add i64 %index.next.5, 4
+ %35 = getelementptr inbounds float* %y, i64 %index.next.6
+ %36 = bitcast float* %35 to <4 x float>*
+ %wide.load.7 = load <4 x float>* %36, align 4
+ %37 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.7)
+ %38 = getelementptr inbounds float* %x, i64 %index.next.6
+ %39 = bitcast float* %38 to <4 x float>*
+ store <4 x float> %37, <4 x float>* %39, align 4
+ %index.next.7 = add i64 %index.next.6, 4
+ %40 = getelementptr inbounds float* %y, i64 %index.next.7
+ %41 = bitcast float* %40 to <4 x float>*
+ %wide.load.8 = load <4 x float>* %41, align 4
+ %42 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.8)
+ %43 = getelementptr inbounds float* %x, i64 %index.next.7
+ %44 = bitcast float* %43 to <4 x float>*
+ store <4 x float> %42, <4 x float>* %44, align 4
+ %index.next.8 = add i64 %index.next.7, 4
+ %45 = getelementptr inbounds float* %y, i64 %index.next.8
+ %46 = bitcast float* %45 to <4 x float>*
+ %wide.load.9 = load <4 x float>* %46, align 4
+ %47 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.9)
+ %48 = getelementptr inbounds float* %x, i64 %index.next.8
+ %49 = bitcast float* %48 to <4 x float>*
+ store <4 x float> %47, <4 x float>* %49, align 4
+ %index.next.9 = add i64 %index.next.8, 4
+ %50 = getelementptr inbounds float* %y, i64 %index.next.9
+ %51 = bitcast float* %50 to <4 x float>*
+ %wide.load.10 = load <4 x float>* %51, align 4
+ %52 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.10)
+ %53 = getelementptr inbounds float* %x, i64 %index.next.9
+ %54 = bitcast float* %53 to <4 x float>*
+ store <4 x float> %52, <4 x float>* %54, align 4
+ %index.next.10 = add i64 %index.next.9, 4
+ %55 = getelementptr inbounds float* %y, i64 %index.next.10
+ %56 = bitcast float* %55 to <4 x float>*
+ %wide.load.11 = load <4 x float>* %56, align 4
+ %57 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.11)
+ %58 = getelementptr inbounds float* %x, i64 %index.next.10
+ %59 = bitcast float* %58 to <4 x float>*
+ store <4 x float> %57, <4 x float>* %59, align 4
+ %index.next.11 = add i64 %index.next.10, 4
+ %60 = getelementptr inbounds float* %y, i64 %index.next.11
+ %61 = bitcast float* %60 to <4 x float>*
+ %wide.load.12 = load <4 x float>* %61, align 4
+ %62 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.12)
+ %63 = getelementptr inbounds float* %x, i64 %index.next.11
+ %64 = bitcast float* %63 to <4 x float>*
+ store <4 x float> %62, <4 x float>* %64, align 4
+ %index.next.12 = add i64 %index.next.11, 4
+ %65 = getelementptr inbounds float* %y, i64 %index.next.12
+ %66 = bitcast float* %65 to <4 x float>*
+ %wide.load.13 = load <4 x float>* %66, align 4
+ %67 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.13)
+ %68 = getelementptr inbounds float* %x, i64 %index.next.12
+ %69 = bitcast float* %68 to <4 x float>*
+ store <4 x float> %67, <4 x float>* %69, align 4
+ %index.next.13 = add i64 %index.next.12, 4
+ %70 = getelementptr inbounds float* %y, i64 %index.next.13
+ %71 = bitcast float* %70 to <4 x float>*
+ %wide.load.14 = load <4 x float>* %71, align 4
+ %72 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.14)
+ %73 = getelementptr inbounds float* %x, i64 %index.next.13
+ %74 = bitcast float* %73 to <4 x float>*
+ store <4 x float> %72, <4 x float>* %74, align 4
+ %index.next.14 = add i64 %index.next.13, 4
+ %75 = getelementptr inbounds float* %y, i64 %index.next.14
+ %76 = bitcast float* %75 to <4 x float>*
+ %wide.load.15 = load <4 x float>* %76, align 4
+ %77 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.15)
+ %78 = getelementptr inbounds float* %x, i64 %index.next.14
+ %79 = bitcast float* %78 to <4 x float>*
+ store <4 x float> %77, <4 x float>* %79, align 4
+ %index.next.15 = add i64 %index.next.14, 4
+ %80 = icmp eq i64 %index.next.15, 2048
+ br i1 %80, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+ ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <4 x float> @llvm_cos_v4f32(<4 x float>) #1
+
+define <2 x double> @bar(double* %x) {
+entry:
+ %p = bitcast double* %x to <2 x double>*
+ %r = load <2 x double>* %p, align 8
+
+; CHECK-LABEL: @bar
+; CHECK-NOT: lvsl
+; CHECK: blr
+
+ ret <2 x double> %r
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/PowerPC/unal4-std.ll b/test/CodeGen/PowerPC/unal4-std.ll
index 169bd78..9f29e31 100644
--- a/test/CodeGen/PowerPC/unal4-std.ll
+++ b/test/CodeGen/PowerPC/unal4-std.ll
@@ -24,4 +24,4 @@ if.end210: ; preds = %entry
; CHECK: stdx {{[0-9]+}}, 0,
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/PowerPC/unwind-dw2-g.ll b/test/CodeGen/PowerPC/unwind-dw2-g.ll
index 2baac76..260d036 100644
--- a/test/CodeGen/PowerPC/unwind-dw2-g.ll
+++ b/test/CodeGen/PowerPC/unwind-dw2-g.ll
@@ -19,7 +19,7 @@ declare void @llvm.eh.unwind.init() #0
attributes #0 = { nounwind }
!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!8}
+!llvm.module.flags = !{!8, !11}
!0 = metadata !{i32 786449, metadata !1, i32 12, metadata !"clang version 3.4", i1 false, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, metadata !2, metadata !""} ; [ DW_TAG_compile_unit ] [/tmp/unwind-dw2.c] [DW_LANG_C99]
!1 = metadata !{metadata !"/tmp/unwind-dw2.c", metadata !"/tmp"}
@@ -27,8 +27,9 @@ attributes #0 = { nounwind }
!3 = metadata !{metadata !4}
!4 = metadata !{i32 786478, metadata !1, metadata !5, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, void ()* @foo, null, null, metadata !2, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [foo]
!5 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/tmp/unwind-dw2.c]
-!6 = metadata !{i32 786453, i32 0, i32 0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!6 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!7 = metadata !{null}
!8 = metadata !{i32 2, metadata !"Dwarf Version", i32 3}
!9 = metadata !{i32 2, i32 0, metadata !4, null}
!10 = metadata !{i32 3, i32 0, metadata !4, null}
+!11 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/PowerPC/vec-abi-align.ll b/test/CodeGen/PowerPC/vec-abi-align.ll
new file mode 100644
index 0000000..3239cf6
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec-abi-align.ll
@@ -0,0 +1,60 @@
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.s2 = type { i64, <4 x float> }
+
+@ve = external global <4 x float>
+@n = external global i64
+
+; Function Attrs: nounwind
+define void @test1(i64 %d1, i64 %d2, i64 %d3, i64 %d4, i64 %d5, i64 %d6, i64 %d7, i64 %d8, i64 %d9, <4 x float> inreg %vs.coerce) #0 {
+entry:
+ store <4 x float> %vs.coerce, <4 x float>* @ve, align 16
+ ret void
+
+; CHECK-LABEL: @test1
+; CHECK: stvx 2,
+; CHECK: blr
+}
+
+; Function Attrs: nounwind
+define void @test2(i64 %d1, i64 %d2, i64 %d3, i64 %d4, i64 %d5, i64 %d6, i64 %d7, i64 %d8, %struct.s2* byval nocapture readonly %vs) #0 {
+entry:
+ %m = getelementptr inbounds %struct.s2* %vs, i64 0, i32 0
+ %0 = load i64* %m, align 8
+ store i64 %0, i64* @n, align 8
+ %v = getelementptr inbounds %struct.s2* %vs, i64 0, i32 1
+ %1 = load <4 x float>* %v, align 16
+ store <4 x float> %1, <4 x float>* @ve, align 16
+ ret void
+
+; CHECK-LABEL: @test2
+; CHECK: ld {{[0-9]+}}, 112(1)
+; CHECK: li [[REG16:[0-9]+]], 16
+; CHECK: addi [[REGB:[0-9]+]], 1, 112
+; CHECK: lvx 2, [[REGB]], [[REG16]]
+; CHECK: blr
+}
+
+; Function Attrs: nounwind
+define void @test3(i64 %d1, i64 %d2, i64 %d3, i64 %d4, i64 %d5, i64 %d6, i64 %d7, i64 %d8, i64 %d9, %struct.s2* byval nocapture readonly %vs) #0 {
+entry:
+ %m = getelementptr inbounds %struct.s2* %vs, i64 0, i32 0
+ %0 = load i64* %m, align 8
+ store i64 %0, i64* @n, align 8
+ %v = getelementptr inbounds %struct.s2* %vs, i64 0, i32 1
+ %1 = load <4 x float>* %v, align 16
+ store <4 x float> %1, <4 x float>* @ve, align 16
+ ret void
+
+; CHECK-LABEL: @test3
+; CHECK: ld {{[0-9]+}}, 128(1)
+; CHECK: li [[REG16:[0-9]+]], 16
+; CHECK: addi [[REGB:[0-9]+]], 1, 128
+; CHECK: lvx 2, [[REGB]], [[REG16]]
+; CHECK: blr
+}
+
+attributes #0 = { nounwind }
+
diff --git a/test/CodeGen/PowerPC/vec_extload.ll b/test/CodeGen/PowerPC/vec_extload.ll
index 6373a26..8d16e15 100644
--- a/test/CodeGen/PowerPC/vec_extload.ll
+++ b/test/CodeGen/PowerPC/vec_extload.ll
@@ -5,7 +5,7 @@
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
-; Altivec does not provides an sext intruction, so it expands
+; Altivec does not provide an sext instruction, so it expands
; a set of vector stores (stvx), bytes load/sign expand/store
; (lbz/stb), and a final vector load (lvx) to load the result
; extended vector.
diff --git a/test/CodeGen/PowerPC/zero-not-run.ll b/test/CodeGen/PowerPC/zero-not-run.ll
index 04c4277..9df0d6e 100644
--- a/test/CodeGen/PowerPC/zero-not-run.ll
+++ b/test/CodeGen/PowerPC/zero-not-run.ll
@@ -24,4 +24,4 @@ for.end731: ; preds = %entry
; Function Attrs: nounwind
declare i64 @safe_mod_func_uint64_t_u_u(i64, i64) #0
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/R600/128bit-kernel-args.ll b/test/CodeGen/R600/128bit-kernel-args.ll
index 5c14270..3c4fcf7 100644
--- a/test/CodeGen/R600/128bit-kernel-args.ll
+++ b/test/CodeGen/R600/128bit-kernel-args.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
; R600-CHECK: @v4i32_kernel_arg
; R600-CHECK-DAG: MOV {{[* ]*}}T[[GPR:[0-9]]].X, KC0[3].Y
diff --git a/test/CodeGen/R600/32-bit-local-address-space.ll b/test/CodeGen/R600/32-bit-local-address-space.ll
new file mode 100644
index 0000000..7a12687
--- /dev/null
+++ b/test/CodeGen/R600/32-bit-local-address-space.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+
+; On Southern Islands GPUs the local address space(3) uses 32-bit pointers and
+; the global address space(1) uses 64-bit pointers. These tests check to make sure
+; the correct pointer size is used for the local address space.
+
+; The e{{32|64}} suffix on the instructions refers to the encoding size and not
+; the size of the operands. The operand size is denoted in the instruction name.
+; Instructions with B32, U32, and I32 in their name take 32-bit operands, while
+; instructions with B64, U64, and I64 take 64-bit operands.
+
+; CHECK-LABEL: @local_address_load
+; CHECK: V_MOV_B32_e{{32|64}} [[PTR:v[0-9]]]
+; CHECK: DS_READ_B32 [[PTR]]
+define void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+entry:
+ %0 = load i32 addrspace(3)* %in
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: @local_address_gep
+; CHECK: S_ADD_I32 [[SPTR:s[0-9]]]
+; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; CHECK: DS_READ_B32 [[VPTR]]
+define void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) {
+entry:
+ %0 = getelementptr i32 addrspace(3)* %in, i32 %offset
+ %1 = load i32 addrspace(3)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: @local_address_gep_const_offset
+; CHECK: S_ADD_I32 [[SPTR:s[0-9]]]
+; CHECK: V_MOV_B32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; CHECK: DS_READ_B32 [[VPTR]]
+define void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+entry:
+ %0 = getelementptr i32 addrspace(3)* %in, i32 1
+ %1 = load i32 addrspace(3)* %0
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: @null_32bit_lds_ptr:
+; CHECK: V_CMP_NE_I32
+; CHECK-NOT: V_CMP_NE_I32
+; CHECK: V_CNDMASK_B32
+define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
+ %cmp = icmp ne i32 addrspace(3)* %lds, null
+ %x = select i1 %cmp, i32 123, i32 456
+ store i32 %x, i32 addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: @mul_32bit_ptr:
+; CHECK: V_MUL_LO_I32
+; CHECK-NEXT: V_ADD_I32_e32
+; CHECK-NEXT: DS_READ_B32
+define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
+ %ptr = getelementptr [3 x float] addrspace(3)* %lds, i32 %tid, i32 0
+ %val = load float addrspace(3)* %ptr
+ store float %val, float addrspace(1)* %out
+ ret void
+}
+
+@g_lds = addrspace(3) global float zeroinitializer, align 4
+
+; CHECK-LABEL: @infer_ptr_alignment_global_offset:
+; CHECK: V_MOV_B32_e32 [[REG:v[0-9]+]], 0
+; CHECK: DS_READ_B32 v{{[0-9]+}}, 0, [[REG]]
+define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
+ %val = load float addrspace(3)* @g_lds
+ store float %val, float addrspace(1)* %out
+ ret void
+}
+
+
+@ptr = addrspace(3) global i32 addrspace(3)* null
+@dst = addrspace(3) global [16384 x i32] zeroinitializer
+
+; SI-LABEL: @global_ptr:
+; SI-CHECK: DS_WRITE_B32
+define void @global_ptr() nounwind {
+ store i32 addrspace(3)* getelementptr ([16384 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr
+ ret void
+}
diff --git a/test/CodeGen/R600/64bit-kernel-args.ll b/test/CodeGen/R600/64bit-kernel-args.ll
index 34a0a87..0d6bfb1 100644
--- a/test/CodeGen/R600/64bit-kernel-args.ll
+++ b/test/CodeGen/R600/64bit-kernel-args.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
; SI-CHECK: @f64_kernel_arg
-; SI-CHECK-DAG: S_LOAD_DWORDX2 SGPR{{[0-9]}}_SGPR{{[0-9]}}, SGPR0_SGPR1, 9
-; SI-CHECK-DAG: S_LOAD_DWORDX2 SGPR{{[0-9]}}_SGPR{{[0-9]}}, SGPR0_SGPR1, 11
+; SI-CHECK-DAG: S_LOAD_DWORDX2 s[{{[0-9]:[0-9]}}], s[0:1], 9
+; SI-CHECK-DAG: S_LOAD_DWORDX2 s[{{[0-9]:[0-9]}}], s[0:1], 11
; SI-CHECK: BUFFER_STORE_DWORDX2
define void @f64_kernel_arg(double addrspace(1)* %out, double %in) {
entry:
diff --git a/test/CodeGen/R600/add.ll b/test/CodeGen/R600/add.ll
index 16f7f97..3d5506b 100644
--- a/test/CodeGen/R600/add.ll
+++ b/test/CodeGen/R600/add.ll
@@ -1,39 +1,55 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
-;EG-CHECK: @test2
+;EG-CHECK-LABEL: @test1:
+;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+;SI-CHECK-LABEL: @test1:
+;SI-CHECK: V_ADD_I32_e32 [[REG:v[0-9]+]], {{v[0-9]+, v[0-9]+}}
+;SI-CHECK-NOT: [[REG]]
+;SI-CHECK: BUFFER_STORE_DWORD [[REG]],
+define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+ %a = load i32 addrspace(1)* %in
+ %b = load i32 addrspace(1)* %b_ptr
+ %result = add i32 %a, %b
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
+;EG-CHECK-LABEL: @test2:
;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI-CHECK: @test2
-;SI-CHECK: V_ADD_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_ADD_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK-LABEL: @test2:
+;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
- %a = load <2 x i32> addrspace(1) * %in
- %b = load <2 x i32> addrspace(1) * %b_ptr
+ %a = load <2 x i32> addrspace(1)* %in
+ %b = load <2 x i32> addrspace(1)* %b_ptr
%result = add <2 x i32> %a, %b
store <2 x i32> %result, <2 x i32> addrspace(1)* %out
ret void
}
-;EG-CHECK: @test4
+;EG-CHECK-LABEL: @test4:
;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG-CHECK: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI-CHECK: @test4
-;SI-CHECK: V_ADD_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_ADD_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_ADD_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_ADD_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK-LABEL: @test4:
+;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_ADD_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
- %a = load <4 x i32> addrspace(1) * %in
- %b = load <4 x i32> addrspace(1) * %b_ptr
+ %a = load <4 x i32> addrspace(1)* %in
+ %b = load <4 x i32> addrspace(1)* %b_ptr
%result = add <4 x i32> %a, %b
store <4 x i32> %result, <4 x i32> addrspace(1)* %out
ret void
diff --git a/test/CodeGen/R600/add_i64.ll b/test/CodeGen/R600/add_i64.ll
new file mode 100644
index 0000000..303a1cb
--- /dev/null
+++ b/test/CodeGen/R600/add_i64.ll
@@ -0,0 +1,59 @@
+; XFAIL: *
+; This will fail until i64 add is enabled
+
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI %s
+
+
+declare i32 @llvm.SI.tid() readnone
+
+; SI-LABEL: @test_i64_vreg:
+define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) {
+ %tid = call i32 @llvm.SI.tid() readnone
+ %a_ptr = getelementptr i64 addrspace(1)* %inA, i32 %tid
+ %b_ptr = getelementptr i64 addrspace(1)* %inB, i32 %tid
+ %a = load i64 addrspace(1)* %a_ptr
+ %b = load i64 addrspace(1)* %b_ptr
+ %result = add i64 %a, %b
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; Check that the SGPR add operand is correctly moved to a VGPR.
+; SI-LABEL: @sgpr_operand:
+define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
+ %foo = load i64 addrspace(1)* %in, align 8
+ %result = add i64 %foo, %a
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+; Swap the arguments. Check that the SGPR -> VGPR copy works with the
+; SGPR as other operand.
+;
+; SI-LABEL: @sgpr_operand_reversed:
+define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
+ %foo = load i64 addrspace(1)* %in, align 8
+ %result = add i64 %a, %foo
+ store i64 %result, i64 addrspace(1)* %out
+ ret void
+}
+
+
+; SI-LABEL: @test_v2i64_sreg:
+define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) {
+ %result = add <2 x i64> %a, %b
+ store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+; SI-LABEL: @test_v2i64_vreg:
+define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
+ %tid = call i32 @llvm.SI.tid() readnone
+ %a_ptr = getelementptr <2 x i64> addrspace(1)* %inA, i32 %tid
+ %b_ptr = getelementptr <2 x i64> addrspace(1)* %inB, i32 %tid
+ %a = load <2 x i64> addrspace(1)* %a_ptr
+ %b = load <2 x i64> addrspace(1)* %b_ptr
+ %result = add <2 x i64> %a, %b
+ store <2 x i64> %result, <2 x i64> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/address-space.ll b/test/CodeGen/R600/address-space.ll
new file mode 100644
index 0000000..1fc616a
--- /dev/null
+++ b/test/CodeGen/R600/address-space.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck %s
+
+; Test that codegenprepare understands address space sizes
+
+%struct.foo = type { [3 x float], [3 x float] }
+
+; CHECK-LABEL: @do_as_ptr_calcs:
+; CHECK: S_ADD_I32 {{s[0-9]+}},
+; CHECK: S_ADD_I32 [[SREG1:s[0-9]+]],
+; CHECK: V_MOV_B32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
+; CHECK: DS_READ_B32 [[VREG1]],
+define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
+entry:
+ %x = getelementptr inbounds %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
+ %y = getelementptr inbounds %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2
+ br label %bb32
+
+bb32:
+ %a = load float addrspace(3)* %x, align 4
+ %b = load float addrspace(3)* %y, align 4
+ %cmp = fcmp one float %a, %b
+ br i1 %cmp, label %bb34, label %bb33
+
+bb33:
+ unreachable
+
+bb34:
+ unreachable
+}
+
+
diff --git a/test/CodeGen/R600/and.ll b/test/CodeGen/R600/and.ll
index 44c21bd..ee9bc83 100644
--- a/test/CodeGen/R600/and.ll
+++ b/test/CodeGen/R600/and.ll
@@ -1,13 +1,13 @@
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
;EG-CHECK: @test2
;EG-CHECK: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG-CHECK: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;SI-CHECK: @test2
-;SI-CHECK: V_AND_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_AND_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -19,16 +19,16 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
}
;EG-CHECK: @test4
-;EG-CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: AND_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: AND_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: AND_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;SI-CHECK: @test4
-;SI-CHECK: V_AND_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_AND_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_AND_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_AND_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_AND_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/R600/array-ptr-calc-i64.ll b/test/CodeGen/R600/array-ptr-calc-i64.ll
new file mode 100644
index 0000000..652bbfe
--- /dev/null
+++ b/test/CodeGen/R600/array-ptr-calc-i64.ll
@@ -0,0 +1,18 @@
+; XFAIL: *
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI %s
+
+declare i32 @llvm.SI.tid() readnone
+
+
+; SI-LABEL: @test_array_ptr_calc(
+define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [16 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
+ %tid = call i32 @llvm.SI.tid() readnone
+ %a_ptr = getelementptr [16 x i32] addrspace(1)* %inA, i32 1, i32 %tid
+ %b_ptr = getelementptr i32 addrspace(1)* %inB, i32 %tid
+ %a = load i32 addrspace(1)* %a_ptr
+ %b = load i32 addrspace(1)* %b_ptr
+ %result = add i32 %a, %b
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
+
diff --git a/test/CodeGen/R600/atomic_load_add.ll b/test/CodeGen/R600/atomic_load_add.ll
new file mode 100644
index 0000000..0bc48a3
--- /dev/null
+++ b/test/CodeGen/R600/atomic_load_add.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+
+; R600-CHECK-LABEL: @atomic_add_local
+; R600-CHECK: LDS_ADD *
+; SI-CHECK-LABEL: @atomic_add_local
+; SI-CHECK: DS_ADD_U32_RTN 0
+define void @atomic_add_local(i32 addrspace(3)* %local) {
+entry:
+ %0 = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
+ ret void
+}
+
+; R600-CHECK-LABEL: @atomic_add_ret_local
+; R600-CHECK: LDS_ADD_RET *
+; SI-CHECK-LABEL: @atomic_add_ret_local
+; SI-CHECK: DS_ADD_U32_RTN 0
+define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+entry:
+ %0 = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/atomic_load_sub.ll b/test/CodeGen/R600/atomic_load_sub.ll
new file mode 100644
index 0000000..e4a6829
--- /dev/null
+++ b/test/CodeGen/R600/atomic_load_sub.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+
+; R600-CHECK-LABEL: @atomic_sub_local
+; R600-CHECK: LDS_SUB *
+; SI-CHECK-LABEL: @atomic_sub_local
+; SI-CHECK: DS_SUB_U32_RTN 0
+define void @atomic_sub_local(i32 addrspace(3)* %local) {
+entry:
+ %0 = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
+ ret void
+}
+
+; R600-CHECK-LABEL: @atomic_sub_ret_local
+; R600-CHECK: LDS_SUB_RET *
+; SI-CHECK-LABEL: @atomic_sub_ret_local
+; SI-CHECK: DS_SUB_U32_RTN 0
+define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+entry:
+ %0 = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/bfi_int.ll b/test/CodeGen/R600/bfi_int.ll
index cdccdfa..bbfe856 100644
--- a/test/CodeGen/R600/bfi_int.ll
+++ b/test/CodeGen/R600/bfi_int.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
; BFI_INT Definition pattern from ISA docs
; (y & x) | (z & ~x)
@@ -38,8 +38,8 @@ entry:
; R600-CHECK: @bfi_sha256_ma
; R600-CHECK: XOR_INT * [[DST:T[0-9]+\.[XYZW]]], KC0[2].Z, KC0[2].W
; R600-CHECK: BFI_INT * {{T[0-9]+\.[XYZW]}}, {{[[DST]]|PV\.[XYZW]}}, KC0[3].X, KC0[2].W
-; SI-CHECK: V_XOR_B32_e64 [[DST:VGPR[0-9]+]], {{[SV]GPR[0-9]+, VGPR[0-9]+}}
-; SI-CHECK: V_BFI_B32 {{VGPR[0-9]+}}, [[DST]], {{[SV]GPR[0-9]+, [SV]GPR[0-9]+}}
+; SI-CHECK: V_XOR_B32_e64 [[DST:v[0-9]+]], {{[sv][0-9]+, v[0-9]+}}
+; SI-CHECK: V_BFI_B32 {{v[0-9]+}}, [[DST]], {{[sv][0-9]+, [sv][0-9]+}}
define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
entry:
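As context for the BFI_INT pattern referenced in the test comments above, the following is a minimal, hypothetical IR sketch — not part of this patch, with the function name @bfi_pattern and its arguments chosen purely for illustration — of the (y & x) | (z & ~x) form that the R600/SI backend is expected to select as BFI_INT / V_BFI_B32:

; Hypothetical example (not in the patch): IR exercising the
; bitfield-insert pattern (y & x) | (z & ~x).
define void @bfi_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
entry:
  %0 = and i32 %y, %x                      ; y & x
  %1 = xor i32 %x, -1                      ; ~x
  %2 = and i32 %z, %1                      ; z & ~x
  %3 = or i32 %0, %2                       ; (y & x) | (z & ~x)
  store i32 %3, i32 addrspace(1)* %out
  ret void
}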
diff --git a/test/CodeGen/R600/big_alu.ll b/test/CodeGen/R600/big_alu.ll
new file mode 100644
index 0000000..6b68376
--- /dev/null
+++ b/test/CodeGen/R600/big_alu.ll
@@ -0,0 +1,1174 @@
+;RUN: llc < %s -march=r600 -mcpu=cedar
+;REQUIRES: asserts
+
+;This test ensures that the R600 backend handles if-conversion (ifcvt) properly
+;and does not generate ALU clauses with more than 128 instructions.
+
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7, <4 x float> inreg %reg8, <4 x float> inreg %reg9) #0 {
+main_body:
+ %0 = extractelement <4 x float> %reg0, i32 0
+ %1 = extractelement <4 x float> %reg0, i32 1
+ %2 = extractelement <4 x float> %reg0, i32 2
+ %3 = extractelement <4 x float> %reg0, i32 3
+ %4 = extractelement <4 x float> %reg1, i32 0
+ %5 = extractelement <4 x float> %reg9, i32 0
+ %6 = extractelement <4 x float> %reg8, i32 0
+ %7 = fcmp ugt float %6, 0.000000e+00
+ %8 = select i1 %7, float %4, float %5
+ %9 = extractelement <4 x float> %reg1, i32 1
+ %10 = extractelement <4 x float> %reg9, i32 1
+ %11 = extractelement <4 x float> %reg8, i32 0
+ %12 = fcmp ugt float %11, 0.000000e+00
+ %13 = select i1 %12, float %9, float %10
+ %14 = extractelement <4 x float> %reg1, i32 2
+ %15 = extractelement <4 x float> %reg9, i32 2
+ %16 = extractelement <4 x float> %reg8, i32 0
+ %17 = fcmp ugt float %16, 0.000000e+00
+ %18 = select i1 %17, float %14, float %15
+ %19 = extractelement <4 x float> %reg1, i32 3
+ %20 = extractelement <4 x float> %reg9, i32 3
+ %21 = extractelement <4 x float> %reg8, i32 0
+ %22 = extractelement <4 x float> %reg2, i32 0
+ %23 = extractelement <4 x float> %reg2, i32 1
+ %24 = extractelement <4 x float> %reg2, i32 2
+ %25 = extractelement <4 x float> %reg2, i32 3
+ %26 = extractelement <4 x float> %reg3, i32 0
+ %27 = extractelement <4 x float> %reg3, i32 1
+ %28 = extractelement <4 x float> %reg3, i32 2
+ %29 = extractelement <4 x float> %reg3, i32 3
+ %30 = extractelement <4 x float> %reg4, i32 0
+ %31 = extractelement <4 x float> %reg4, i32 1
+ %32 = extractelement <4 x float> %reg4, i32 2
+ %33 = extractelement <4 x float> %reg4, i32 3
+ %34 = extractelement <4 x float> %reg5, i32 0
+ %35 = extractelement <4 x float> %reg5, i32 1
+ %36 = extractelement <4 x float> %reg5, i32 2
+ %37 = extractelement <4 x float> %reg5, i32 3
+ %38 = extractelement <4 x float> %reg6, i32 0
+ %39 = extractelement <4 x float> %reg6, i32 1
+ %40 = extractelement <4 x float> %reg6, i32 2
+ %41 = extractelement <4 x float> %reg6, i32 3
+ %42 = extractelement <4 x float> %reg7, i32 0
+ %43 = extractelement <4 x float> %reg7, i32 1
+ %44 = extractelement <4 x float> %reg7, i32 2
+ %45 = extractelement <4 x float> %reg7, i32 3
+ %46 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
+ %47 = extractelement <4 x float> %46, i32 0
+ %48 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
+ %49 = extractelement <4 x float> %48, i32 1
+ %50 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 11)
+ %51 = extractelement <4 x float> %50, i32 2
+ %52 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 12)
+ %53 = extractelement <4 x float> %52, i32 0
+ %54 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+ %55 = extractelement <4 x float> %54, i32 0
+ %56 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+ %57 = extractelement <4 x float> %56, i32 1
+ %58 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+ %59 = extractelement <4 x float> %58, i32 2
+ %60 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 14)
+ %61 = extractelement <4 x float> %60, i32 3
+ %62 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
+ %63 = extractelement <4 x float> %62, i32 0
+ %64 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
+ %65 = extractelement <4 x float> %64, i32 1
+ %66 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 16)
+ %67 = extractelement <4 x float> %66, i32 2
+ %68 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %69 = extractelement <4 x float> %68, i32 0
+ %70 = fcmp oge float %69, 3.500000e+00
+ %71 = sext i1 %70 to i32
+ %72 = bitcast i32 %71 to float
+ %73 = bitcast float %72 to i32
+ %74 = icmp ne i32 %73, 0
+ %. = select i1 %74, float 0.000000e+00, float 0.000000e+00
+ %75 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+ %76 = extractelement <4 x float> %75, i32 0
+ %77 = fcmp oge float %76, 2.000000e+00
+ %78 = sext i1 %77 to i32
+ %79 = bitcast i32 %78 to float
+ %80 = bitcast float %79 to i32
+ %81 = icmp ne i32 %80, 0
+ br i1 %81, label %IF137, label %ENDIF136
+
+IF137: ; preds = %main_body
+ %82 = insertelement <4 x float> undef, float %30, i32 0
+ %83 = insertelement <4 x float> %82, float %31, i32 1
+ %84 = insertelement <4 x float> %83, float %32, i32 2
+ %85 = insertelement <4 x float> %84, float 0.000000e+00, i32 3
+ %86 = insertelement <4 x float> undef, float %30, i32 0
+ %87 = insertelement <4 x float> %86, float %31, i32 1
+ %88 = insertelement <4 x float> %87, float %32, i32 2
+ %89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3
+ %90 = call float @llvm.AMDGPU.dp4(<4 x float> %85, <4 x float> %89)
+ %91 = call float @llvm.AMDGPU.rsq(float %90)
+ %92 = fmul float %30, %91
+ %93 = fmul float %31, %91
+ %94 = fmul float %32, %91
+ %95 = insertelement <4 x float> undef, float %92, i32 0
+ %96 = insertelement <4 x float> %95, float %93, i32 1
+ %97 = insertelement <4 x float> %96, float %94, i32 2
+ %98 = insertelement <4 x float> %97, float 0.000000e+00, i32 3
+ %99 = insertelement <4 x float> undef, float %37, i32 0
+ %100 = insertelement <4 x float> %99, float %38, i32 1
+ %101 = insertelement <4 x float> %100, float %39, i32 2
+ %102 = insertelement <4 x float> %101, float 0.000000e+00, i32 3
+ %103 = call float @llvm.AMDGPU.dp4(<4 x float> %98, <4 x float> %102)
+ %104 = insertelement <4 x float> undef, float %92, i32 0
+ %105 = insertelement <4 x float> %104, float %93, i32 1
+ %106 = insertelement <4 x float> %105, float %94, i32 2
+ %107 = insertelement <4 x float> %106, float 0.000000e+00, i32 3
+ %108 = insertelement <4 x float> undef, float %40, i32 0
+ %109 = insertelement <4 x float> %108, float %41, i32 1
+ %110 = insertelement <4 x float> %109, float %42, i32 2
+ %111 = insertelement <4 x float> %110, float 0.000000e+00, i32 3
+ %112 = call float @llvm.AMDGPU.dp4(<4 x float> %107, <4 x float> %111)
+ %113 = fsub float -0.000000e+00, %92
+ %114 = fsub float -0.000000e+00, %93
+ %115 = fsub float -0.000000e+00, %94
+ %116 = insertelement <4 x float> undef, float %34, i32 0
+ %117 = insertelement <4 x float> %116, float %35, i32 1
+ %118 = insertelement <4 x float> %117, float %36, i32 2
+ %119 = insertelement <4 x float> %118, float 0.000000e+00, i32 3
+ %120 = insertelement <4 x float> undef, float %113, i32 0
+ %121 = insertelement <4 x float> %120, float %114, i32 1
+ %122 = insertelement <4 x float> %121, float %115, i32 2
+ %123 = insertelement <4 x float> %122, float 0.000000e+00, i32 3
+ %124 = call float @llvm.AMDGPU.dp4(<4 x float> %119, <4 x float> %123)
+ %125 = fdiv float 1.000000e+00, %124
+ %126 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+ %127 = extractelement <4 x float> %126, i32 0
+ %128 = fmul float %127, %125
+ %129 = fmul float %103, %128
+ %130 = fmul float %112, %128
+ %131 = bitcast float %. to i32
+ %132 = sitofp i32 %131 to float
+ %133 = fdiv float 1.000000e+00, %132
+ %134 = bitcast float %. to i32
+ %135 = add i32 %134, -1
+ %136 = bitcast i32 %135 to float
+ %137 = bitcast float %136 to i32
+ br label %LOOP
+
+ENDIF136: ; preds = %main_body, %ENDIF154
+ %temp68.1 = phi float [ %600, %ENDIF154 ], [ 0.000000e+00, %main_body ]
+ %temp69.0 = phi float [ %602, %ENDIF154 ], [ 0.000000e+00, %main_body ]
+ %temp70.0 = phi float [ %604, %ENDIF154 ], [ 1.000000e+00, %main_body ]
+ %138 = fmul float %26, 0x3F847AE140000000
+ %139 = fmul float %27, 0x3F847AE140000000
+ %140 = fmul float %28, 0x3F847AE140000000
+ %141 = insertelement <4 x float> undef, float %138, i32 0
+ %142 = insertelement <4 x float> %141, float %139, i32 1
+ %143 = insertelement <4 x float> %142, float %140, i32 2
+ %144 = insertelement <4 x float> %143, float 0.000000e+00, i32 3
+ %145 = extractelement <4 x float> %144, i32 0
+ %146 = extractelement <4 x float> %144, i32 1
+ %147 = extractelement <4 x float> %144, i32 2
+ %148 = extractelement <4 x float> %144, i32 3
+ %149 = insertelement <4 x float> undef, float %145, i32 0
+ %150 = insertelement <4 x float> %149, float %146, i32 1
+ %151 = insertelement <4 x float> %150, float %147, i32 2
+ %152 = insertelement <4 x float> %151, float %148, i32 3
+ %153 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %152, i32 16, i32 0, i32 3)
+ %154 = extractelement <4 x float> %153, i32 0
+ %155 = extractelement <4 x float> %153, i32 1
+ %156 = extractelement <4 x float> %153, i32 2
+ %157 = extractelement <4 x float> %153, i32 3
+ %158 = fmul float %26, 0x3F45A07B40000000
+ %159 = fmul float %27, 0x3F45A07B40000000
+ %160 = fmul float %28, 0x3F45A07B40000000
+ %161 = insertelement <4 x float> undef, float %158, i32 0
+ %162 = insertelement <4 x float> %161, float %159, i32 1
+ %163 = insertelement <4 x float> %162, float %160, i32 2
+ %164 = insertelement <4 x float> %163, float 0.000000e+00, i32 3
+ %165 = extractelement <4 x float> %164, i32 0
+ %166 = extractelement <4 x float> %164, i32 1
+ %167 = extractelement <4 x float> %164, i32 2
+ %168 = extractelement <4 x float> %164, i32 3
+ %169 = insertelement <4 x float> undef, float %165, i32 0
+ %170 = insertelement <4 x float> %169, float %166, i32 1
+ %171 = insertelement <4 x float> %170, float %167, i32 2
+ %172 = insertelement <4 x float> %171, float %168, i32 3
+ %173 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %172, i32 16, i32 0, i32 3)
+ %174 = extractelement <4 x float> %173, i32 0
+ %175 = extractelement <4 x float> %173, i32 1
+ %176 = extractelement <4 x float> %173, i32 2
+ %177 = extractelement <4 x float> %173, i32 3
+ %178 = fmul float %176, 3.000000e+03
+ %179 = fadd float %178, %28
+ %180 = fdiv float 1.000000e+00, %33
+ %181 = fmul float %32, %180
+ %182 = call float @fabs(float %181)
+ %183 = fmul float %174, 0x3FD99999A0000000
+ %184 = fadd float %183, 0x3FAEB851E0000000
+ %185 = fmul float %175, 0x3FE3333340000000
+ %186 = fadd float %185, %184
+ %187 = fmul float %176, 2.000000e+00
+ %188 = fadd float %187, %186
+ %189 = fmul float %177, 4.000000e+00
+ %190 = fadd float %189, %188
+ %191 = fmul float %154, 0x3FB99999A0000000
+ %192 = fadd float %191, %190
+ %193 = fmul float %155, 0x3FD99999A0000000
+ %194 = fadd float %193, %192
+ %195 = fmul float %156, 0x3FE99999A0000000
+ %196 = fadd float %195, %194
+ %197 = fmul float %157, 0x4000CCCCC0000000
+ %198 = fadd float %197, %196
+ %199 = fmul float 0xBE5EFB4CC0000000, %182
+ %200 = fmul float %199, %182
+ %201 = call float @llvm.AMDIL.exp.(float %200)
+ %202 = call float @llvm.AMDGPU.lrp(float %201, float %198, float 0x3FA99999A0000000)
+ %203 = fadd float %202, 0x3FF4CCCCC0000000
+ %204 = fmul float %203, 0x3FE1C71C80000000
+ %205 = call float @llvm.AMDIL.clamp.(float %204, float 0.000000e+00, float 1.000000e+00)
+ %206 = fadd float %202, 0x3FF4CCCCC0000000
+ %207 = fmul float %206, 0x3FE1C71C80000000
+ %208 = call float @llvm.AMDIL.clamp.(float %207, float 0.000000e+00, float 1.000000e+00)
+ %209 = fadd float %202, 2.000000e+00
+ %210 = fmul float %209, 0x3FD611A7A0000000
+ %211 = call float @llvm.AMDIL.clamp.(float %210, float 0.000000e+00, float 1.000000e+00)
+ %212 = fmul float 2.000000e+00, %205
+ %213 = fsub float -0.000000e+00, %212
+ %214 = fadd float 3.000000e+00, %213
+ %215 = fmul float %205, %214
+ %216 = fmul float %205, %215
+ %217 = fmul float 2.000000e+00, %208
+ %218 = fsub float -0.000000e+00, %217
+ %219 = fadd float 3.000000e+00, %218
+ %220 = fmul float %208, %219
+ %221 = fmul float %208, %220
+ %222 = fmul float 2.000000e+00, %211
+ %223 = fsub float -0.000000e+00, %222
+ %224 = fadd float 3.000000e+00, %223
+ %225 = fmul float %211, %224
+ %226 = fmul float %211, %225
+ %227 = fmul float %26, 0x3F368B5CC0000000
+ %228 = fmul float %27, 0x3F368B5CC0000000
+ %229 = insertelement <4 x float> undef, float %227, i32 0
+ %230 = insertelement <4 x float> %229, float %228, i32 1
+ %231 = insertelement <4 x float> %230, float 0.000000e+00, i32 2
+ %232 = insertelement <4 x float> %231, float 0.000000e+00, i32 3
+ %233 = extractelement <4 x float> %232, i32 0
+ %234 = extractelement <4 x float> %232, i32 1
+ %235 = insertelement <4 x float> undef, float %233, i32 0
+ %236 = insertelement <4 x float> %235, float %234, i32 1
+ %237 = insertelement <4 x float> %236, float undef, i32 2
+ %238 = insertelement <4 x float> %237, float undef, i32 3
+ %239 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %238, i32 17, i32 1, i32 2)
+ %240 = extractelement <4 x float> %239, i32 0
+ %241 = insertelement <4 x float> undef, float %240, i32 0
+ %242 = insertelement <4 x float> %241, float %228, i32 1
+ %243 = insertelement <4 x float> %242, float 0.000000e+00, i32 2
+ %244 = insertelement <4 x float> %243, float 0.000000e+00, i32 3
+ %245 = extractelement <4 x float> %244, i32 0
+ %246 = insertelement <4 x float> undef, float %245, i32 0
+ %247 = insertelement <4 x float> %246, float undef, i32 1
+ %248 = insertelement <4 x float> %247, float undef, i32 2
+ %249 = insertelement <4 x float> %248, float undef, i32 3
+ %250 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %249, i32 18, i32 2, i32 1)
+ %251 = extractelement <4 x float> %250, i32 0
+ %252 = extractelement <4 x float> %250, i32 1
+ %253 = extractelement <4 x float> %250, i32 2
+ %254 = extractelement <4 x float> %250, i32 3
+ %255 = fmul float %251, %216
+ %256 = fmul float %252, %221
+ %257 = fmul float %253, %226
+ %258 = fmul float %254, 0.000000e+00
+ %259 = fadd float %202, 0x3FF4CCCCC0000000
+ %260 = fmul float %259, 0x3FE1C71C80000000
+ %261 = call float @llvm.AMDIL.clamp.(float %260, float 0.000000e+00, float 1.000000e+00)
+ %262 = fadd float %202, 0x3FF4CCCCC0000000
+ %263 = fmul float %262, 0x3FE1C71C80000000
+ %264 = call float @llvm.AMDIL.clamp.(float %263, float 0.000000e+00, float 1.000000e+00)
+ %265 = fadd float %202, 2.000000e+00
+ %266 = fmul float %265, 0x3FD611A7A0000000
+ %267 = call float @llvm.AMDIL.clamp.(float %266, float 0.000000e+00, float 1.000000e+00)
+ %268 = fmul float 2.000000e+00, %261
+ %269 = fsub float -0.000000e+00, %268
+ %270 = fadd float 3.000000e+00, %269
+ %271 = fmul float %261, %270
+ %272 = fmul float %261, %271
+ %273 = fmul float 2.000000e+00, %264
+ %274 = fsub float -0.000000e+00, %273
+ %275 = fadd float 3.000000e+00, %274
+ %276 = fmul float %264, %275
+ %277 = fmul float %264, %276
+ %278 = fmul float 2.000000e+00, %267
+ %279 = fsub float -0.000000e+00, %278
+ %280 = fadd float 3.000000e+00, %279
+ %281 = fmul float %267, %280
+ %282 = fmul float %267, %281
+ %283 = fmul float %26, 0x3F22DFD6A0000000
+ %284 = fmul float %27, 0x3F22DFD6A0000000
+ %285 = insertelement <4 x float> undef, float %283, i32 0
+ %286 = insertelement <4 x float> %285, float %284, i32 1
+ %287 = insertelement <4 x float> %286, float 0.000000e+00, i32 2
+ %288 = insertelement <4 x float> %287, float 0.000000e+00, i32 3
+ %289 = extractelement <4 x float> %288, i32 0
+ %290 = extractelement <4 x float> %288, i32 1
+ %291 = insertelement <4 x float> undef, float %289, i32 0
+ %292 = insertelement <4 x float> %291, float %290, i32 1
+ %293 = insertelement <4 x float> %292, float undef, i32 2
+ %294 = insertelement <4 x float> %293, float undef, i32 3
+ %295 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %294, i32 19, i32 3, i32 2)
+ %296 = extractelement <4 x float> %295, i32 0
+ %297 = extractelement <4 x float> %295, i32 1
+ %298 = extractelement <4 x float> %295, i32 2
+ %299 = extractelement <4 x float> %295, i32 3
+ %300 = fmul float %296, %272
+ %301 = fmul float %297, %277
+ %302 = fmul float %298, %282
+ %303 = fmul float %299, 0.000000e+00
+ %304 = fmul float %temp68.1, %37
+ %305 = fmul float %temp68.1, %38
+ %306 = fmul float %temp68.1, %39
+ %307 = fmul float %temp69.0, %40
+ %308 = fadd float %307, %304
+ %309 = fmul float %temp69.0, %41
+ %310 = fadd float %309, %305
+ %311 = fmul float %temp69.0, %42
+ %312 = fadd float %311, %306
+ %313 = fmul float %temp70.0, %34
+ %314 = fadd float %313, %308
+ %315 = fmul float %temp70.0, %35
+ %316 = fadd float %315, %310
+ %317 = fmul float %temp70.0, %36
+ %318 = fadd float %317, %312
+ %319 = insertelement <4 x float> undef, float %314, i32 0
+ %320 = insertelement <4 x float> %319, float %316, i32 1
+ %321 = insertelement <4 x float> %320, float %318, i32 2
+ %322 = insertelement <4 x float> %321, float 0.000000e+00, i32 3
+ %323 = insertelement <4 x float> undef, float %314, i32 0
+ %324 = insertelement <4 x float> %323, float %316, i32 1
+ %325 = insertelement <4 x float> %324, float %318, i32 2
+ %326 = insertelement <4 x float> %325, float 0.000000e+00, i32 3
+ %327 = call float @llvm.AMDGPU.dp4(<4 x float> %322, <4 x float> %326)
+ %328 = call float @llvm.AMDGPU.rsq(float %327)
+ %329 = fmul float %314, %328
+ %330 = fmul float %316, %328
+ %331 = fmul float %318, %328
+ %332 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+ %333 = extractelement <4 x float> %332, i32 0
+ %334 = fsub float -0.000000e+00, %333
+ %335 = fadd float 1.000000e+00, %334
+ %336 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+ %337 = extractelement <4 x float> %336, i32 0
+ %338 = fsub float -0.000000e+00, %337
+ %339 = fadd float 1.000000e+00, %338
+ %340 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+ %341 = extractelement <4 x float> %340, i32 0
+ %342 = fsub float -0.000000e+00, %341
+ %343 = fadd float 1.000000e+00, %342
+ %344 = fsub float -0.000000e+00, %335
+ %345 = fadd float %202, %344
+ %346 = fsub float -0.000000e+00, %339
+ %347 = fadd float %202, %346
+ %348 = fadd float %347, 0xBFE3333340000000
+ %349 = fsub float -0.000000e+00, %202
+ %350 = fsub float -0.000000e+00, %343
+ %351 = fadd float %349, %350
+ %352 = insertelement <4 x float> undef, float %43, i32 0
+ %353 = insertelement <4 x float> %352, float %44, i32 1
+ %354 = insertelement <4 x float> %353, float %45, i32 2
+ %355 = insertelement <4 x float> %354, float 0.000000e+00, i32 3
+ %356 = insertelement <4 x float> undef, float %43, i32 0
+ %357 = insertelement <4 x float> %356, float %44, i32 1
+ %358 = insertelement <4 x float> %357, float %45, i32 2
+ %359 = insertelement <4 x float> %358, float 0.000000e+00, i32 3
+ %360 = call float @llvm.AMDGPU.dp4(<4 x float> %355, <4 x float> %359)
+ %361 = call float @llvm.AMDGPU.rsq(float %360)
+ %362 = fmul float %45, %361
+ %363 = call float @fabs(float %362)
+ %364 = fmul float %176, 0x3FECCCCCC0000000
+ %365 = fadd float %364, %363
+ %366 = fadd float %365, 0xBFEFAE1480000000
+ %367 = fmul float %366, 0xC023FFFFC0000000
+ %368 = call float @llvm.AMDIL.clamp.(float %367, float 0.000000e+00, float 1.000000e+00)
+ %369 = fsub float -0.000000e+00, %335
+ %370 = fadd float %202, %369
+ %371 = fadd float %370, 0x3FBEB851E0000000
+ %372 = fsub float -0.000000e+00, %339
+ %373 = fadd float %202, %372
+ %374 = fadd float %373, 0xBFE0A3D700000000
+ %375 = fsub float -0.000000e+00, %202
+ %376 = fsub float -0.000000e+00, %343
+ %377 = fadd float %375, %376
+ %378 = insertelement <4 x float> undef, float %43, i32 0
+ %379 = insertelement <4 x float> %378, float %44, i32 1
+ %380 = insertelement <4 x float> %379, float %45, i32 2
+ %381 = insertelement <4 x float> %380, float 0.000000e+00, i32 3
+ %382 = insertelement <4 x float> undef, float %43, i32 0
+ %383 = insertelement <4 x float> %382, float %44, i32 1
+ %384 = insertelement <4 x float> %383, float %45, i32 2
+ %385 = insertelement <4 x float> %384, float 0.000000e+00, i32 3
+ %386 = call float @llvm.AMDGPU.dp4(<4 x float> %381, <4 x float> %385)
+ %387 = call float @llvm.AMDGPU.rsq(float %386)
+ %388 = fmul float %45, %387
+ %389 = call float @fabs(float %388)
+ %390 = fmul float %176, 0x3FF51EB860000000
+ %391 = fadd float %390, %389
+ %392 = fadd float %391, 0xBFEFAE1480000000
+ %393 = fmul float %392, 0xC0490001A0000000
+ %394 = call float @llvm.AMDIL.clamp.(float %393, float 0.000000e+00, float 1.000000e+00)
+ %395 = fmul float 2.000000e+00, %368
+ %396 = fsub float -0.000000e+00, %395
+ %397 = fadd float 3.000000e+00, %396
+ %398 = fmul float %368, %397
+ %399 = fmul float %368, %398
+ %400 = call float @llvm.AMDGPU.lrp(float %399, float %255, float %345)
+ %401 = call float @llvm.AMDGPU.lrp(float %399, float %256, float %348)
+ %402 = call float @llvm.AMDGPU.lrp(float %399, float %257, float %351)
+ %403 = call float @llvm.AMDGPU.lrp(float %399, float %258, float 0.000000e+00)
+ %404 = fmul float 2.000000e+00, %394
+ %405 = fsub float -0.000000e+00, %404
+ %406 = fadd float 3.000000e+00, %405
+ %407 = fmul float %394, %406
+ %408 = fmul float %394, %407
+ %409 = call float @llvm.AMDGPU.lrp(float %408, float %255, float %371)
+ %410 = call float @llvm.AMDGPU.lrp(float %408, float %256, float %374)
+ %411 = call float @llvm.AMDGPU.lrp(float %408, float %257, float %377)
+ %412 = call float @llvm.AMDGPU.lrp(float %408, float %258, float 0x3FD3333340000000)
+ %413 = fcmp oge float 2.200000e+03, %179
+ %414 = sext i1 %413 to i32
+ %415 = bitcast i32 %414 to float
+ %416 = bitcast float %415 to i32
+ %417 = icmp ne i32 %416, 0
+ br i1 %417, label %IF161, label %ENDIF160
+
+LOOP: ; preds = %ENDIF139, %IF137
+ %temp88.0 = phi float [ 0.000000e+00, %IF137 ], [ %446, %ENDIF139 ]
+ %temp92.0 = phi float [ 1.000000e+00, %IF137 ], [ %.temp92.0, %ENDIF139 ]
+ %temp96.0 = phi float [ 0.000000e+00, %IF137 ], [ %477, %ENDIF139 ]
+ %418 = bitcast float %temp96.0 to i32
+ %419 = icmp sge i32 %418, %137
+ %420 = sext i1 %419 to i32
+ %421 = bitcast i32 %420 to float
+ %422 = bitcast float %421 to i32
+ %423 = icmp ne i32 %422, 0
+ br i1 %423, label %IF140, label %ENDIF139
+
+IF140: ; preds = %LOOP
+ %424 = fmul float %133, 5.000000e-01
+ %425 = fmul float %129, %temp92.0
+ %426 = fadd float %425, %22
+ %427 = fmul float %130, %temp92.0
+ %428 = fadd float %427, %23
+ %429 = insertelement <4 x float> undef, float %426, i32 0
+ %430 = insertelement <4 x float> %429, float %428, i32 1
+ %431 = insertelement <4 x float> %430, float 0.000000e+00, i32 2
+ %432 = insertelement <4 x float> %431, float 0.000000e+00, i32 3
+ %433 = extractelement <4 x float> %432, i32 0
+ %434 = extractelement <4 x float> %432, i32 1
+ %435 = insertelement <4 x float> undef, float %433, i32 0
+ %436 = insertelement <4 x float> %435, float %434, i32 1
+ %437 = insertelement <4 x float> %436, float undef, i32 2
+ %438 = insertelement <4 x float> %437, float undef, i32 3
+ %439 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %438, i32 20, i32 4, i32 2)
+ %440 = extractelement <4 x float> %439, i32 3
+ %441 = fcmp oge float %temp92.0, %440
+ %442 = sext i1 %441 to i32
+ %443 = bitcast i32 %442 to float
+ %444 = bitcast float %443 to i32
+ %445 = icmp ne i32 %444, 0
+ br i1 %445, label %IF146, label %ENDIF145
+
+ENDIF139: ; preds = %LOOP
+ %446 = fadd float %temp88.0, %133
+ %447 = fmul float %129, %446
+ %448 = fadd float %447, %22
+ %449 = fmul float %130, %446
+ %450 = fadd float %449, %23
+ %451 = insertelement <4 x float> undef, float %448, i32 0
+ %452 = insertelement <4 x float> %451, float %450, i32 1
+ %453 = insertelement <4 x float> %452, float 0.000000e+00, i32 2
+ %454 = insertelement <4 x float> %453, float 0.000000e+00, i32 3
+ %455 = extractelement <4 x float> %454, i32 0
+ %456 = extractelement <4 x float> %454, i32 1
+ %457 = insertelement <4 x float> undef, float %455, i32 0
+ %458 = insertelement <4 x float> %457, float %456, i32 1
+ %459 = insertelement <4 x float> %458, float undef, i32 2
+ %460 = insertelement <4 x float> %459, float undef, i32 3
+ %461 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %460, i32 20, i32 4, i32 2)
+ %462 = extractelement <4 x float> %461, i32 3
+ %463 = fcmp olt float 0x3FEFDF3B60000000, %temp92.0
+ %464 = sext i1 %463 to i32
+ %465 = bitcast i32 %464 to float
+ %466 = fcmp oge float %446, %462
+ %467 = sext i1 %466 to i32
+ %468 = bitcast i32 %467 to float
+ %469 = bitcast float %465 to i32
+ %470 = bitcast float %468 to i32
+ %471 = and i32 %469, %470
+ %472 = bitcast i32 %471 to float
+ %473 = bitcast float %472 to i32
+ %474 = icmp ne i32 %473, 0
+ %.temp92.0 = select i1 %474, float %446, float %temp92.0
+ %475 = bitcast float %temp96.0 to i32
+ %476 = add i32 %475, 1
+ %477 = bitcast i32 %476 to float
+ br label %LOOP
+
+IF146: ; preds = %IF140
+ %478 = fmul float 2.000000e+00, %424
+ %479 = fsub float -0.000000e+00, %478
+ %480 = fadd float %temp92.0, %479
+ br label %ENDIF145
+
+ENDIF145: ; preds = %IF140, %IF146
+ %temp88.1 = phi float [ %480, %IF146 ], [ %temp92.0, %IF140 ]
+ %481 = fadd float %temp88.1, %424
+ %482 = fmul float %424, 5.000000e-01
+ %483 = fmul float %129, %481
+ %484 = fadd float %483, %22
+ %485 = fmul float %130, %481
+ %486 = fadd float %485, %23
+ %487 = insertelement <4 x float> undef, float %484, i32 0
+ %488 = insertelement <4 x float> %487, float %486, i32 1
+ %489 = insertelement <4 x float> %488, float 0.000000e+00, i32 2
+ %490 = insertelement <4 x float> %489, float %440, i32 3
+ %491 = extractelement <4 x float> %490, i32 0
+ %492 = extractelement <4 x float> %490, i32 1
+ %493 = insertelement <4 x float> undef, float %491, i32 0
+ %494 = insertelement <4 x float> %493, float %492, i32 1
+ %495 = insertelement <4 x float> %494, float undef, i32 2
+ %496 = insertelement <4 x float> %495, float undef, i32 3
+ %497 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %496, i32 20, i32 4, i32 2)
+ %498 = extractelement <4 x float> %497, i32 3
+ %499 = fcmp oge float %481, %498
+ %500 = sext i1 %499 to i32
+ %501 = bitcast i32 %500 to float
+ %502 = bitcast float %501 to i32
+ %503 = icmp ne i32 %502, 0
+ br i1 %503, label %IF149, label %ENDIF148
+
+IF149: ; preds = %ENDIF145
+ %504 = fmul float 2.000000e+00, %482
+ %505 = fsub float -0.000000e+00, %504
+ %506 = fadd float %481, %505
+ br label %ENDIF148
+
+ENDIF148: ; preds = %ENDIF145, %IF149
+ %temp88.2 = phi float [ %506, %IF149 ], [ %481, %ENDIF145 ]
+ %temp92.2 = phi float [ %481, %IF149 ], [ %temp92.0, %ENDIF145 ]
+ %507 = fadd float %temp88.2, %482
+ %508 = fmul float %482, 5.000000e-01
+ %509 = fmul float %129, %507
+ %510 = fadd float %509, %22
+ %511 = fmul float %130, %507
+ %512 = fadd float %511, %23
+ %513 = insertelement <4 x float> undef, float %510, i32 0
+ %514 = insertelement <4 x float> %513, float %512, i32 1
+ %515 = insertelement <4 x float> %514, float 0.000000e+00, i32 2
+ %516 = insertelement <4 x float> %515, float %498, i32 3
+ %517 = extractelement <4 x float> %516, i32 0
+ %518 = extractelement <4 x float> %516, i32 1
+ %519 = insertelement <4 x float> undef, float %517, i32 0
+ %520 = insertelement <4 x float> %519, float %518, i32 1
+ %521 = insertelement <4 x float> %520, float undef, i32 2
+ %522 = insertelement <4 x float> %521, float undef, i32 3
+ %523 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %522, i32 20, i32 4, i32 2)
+ %524 = extractelement <4 x float> %523, i32 3
+ %525 = fcmp oge float %507, %524
+ %526 = sext i1 %525 to i32
+ %527 = bitcast i32 %526 to float
+ %528 = bitcast float %527 to i32
+ %529 = icmp ne i32 %528, 0
+ br i1 %529, label %IF152, label %ENDIF151
+
+IF152: ; preds = %ENDIF148
+ %530 = fmul float 2.000000e+00, %508
+ %531 = fsub float -0.000000e+00, %530
+ %532 = fadd float %507, %531
+ br label %ENDIF151
+
+ENDIF151: ; preds = %ENDIF148, %IF152
+ %temp88.3 = phi float [ %532, %IF152 ], [ %507, %ENDIF148 ]
+ %temp92.3 = phi float [ %507, %IF152 ], [ %temp92.2, %ENDIF148 ]
+ %533 = fadd float %temp88.3, %508
+ %534 = fmul float %508, 5.000000e-01
+ %535 = fmul float %129, %533
+ %536 = fadd float %535, %22
+ %537 = fmul float %130, %533
+ %538 = fadd float %537, %23
+ %539 = insertelement <4 x float> undef, float %536, i32 0
+ %540 = insertelement <4 x float> %539, float %538, i32 1
+ %541 = insertelement <4 x float> %540, float 0.000000e+00, i32 2
+ %542 = insertelement <4 x float> %541, float %524, i32 3
+ %543 = extractelement <4 x float> %542, i32 0
+ %544 = extractelement <4 x float> %542, i32 1
+ %545 = insertelement <4 x float> undef, float %543, i32 0
+ %546 = insertelement <4 x float> %545, float %544, i32 1
+ %547 = insertelement <4 x float> %546, float undef, i32 2
+ %548 = insertelement <4 x float> %547, float undef, i32 3
+ %549 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %548, i32 20, i32 4, i32 2)
+ %550 = extractelement <4 x float> %549, i32 3
+ %551 = fcmp oge float %533, %550
+ %552 = sext i1 %551 to i32
+ %553 = bitcast i32 %552 to float
+ %554 = bitcast float %553 to i32
+ %555 = icmp ne i32 %554, 0
+ br i1 %555, label %IF155, label %ENDIF154
+
+IF155: ; preds = %ENDIF151
+ %556 = fmul float 2.000000e+00, %534
+ %557 = fsub float -0.000000e+00, %556
+ %558 = fadd float %533, %557
+ br label %ENDIF154
+
+ENDIF154: ; preds = %ENDIF151, %IF155
+ %temp88.4 = phi float [ %558, %IF155 ], [ %533, %ENDIF151 ]
+ %temp92.4 = phi float [ %533, %IF155 ], [ %temp92.3, %ENDIF151 ]
+ %559 = fadd float %temp88.4, %534
+ %560 = fmul float %129, %559
+ %561 = fadd float %560, %22
+ %562 = fmul float %130, %559
+ %563 = fadd float %562, %23
+ %564 = insertelement <4 x float> undef, float %561, i32 0
+ %565 = insertelement <4 x float> %564, float %563, i32 1
+ %566 = insertelement <4 x float> %565, float 0.000000e+00, i32 2
+ %567 = insertelement <4 x float> %566, float %550, i32 3
+ %568 = extractelement <4 x float> %567, i32 0
+ %569 = extractelement <4 x float> %567, i32 1
+ %570 = insertelement <4 x float> undef, float %568, i32 0
+ %571 = insertelement <4 x float> %570, float %569, i32 1
+ %572 = insertelement <4 x float> %571, float undef, i32 2
+ %573 = insertelement <4 x float> %572, float undef, i32 3
+ %574 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %573, i32 20, i32 4, i32 2)
+ %575 = extractelement <4 x float> %574, i32 3
+ %576 = fcmp oge float %559, %575
+ %577 = sext i1 %576 to i32
+ %578 = bitcast i32 %577 to float
+ %579 = bitcast float %578 to i32
+ %580 = icmp ne i32 %579, 0
+ %.temp92.4 = select i1 %580, float %559, float %temp92.4
+ %581 = fmul float %129, %.temp92.4
+ %582 = fadd float %581, %22
+ %583 = fmul float %130, %.temp92.4
+ %584 = fadd float %583, %23
+ %585 = insertelement <4 x float> undef, float %582, i32 0
+ %586 = insertelement <4 x float> %585, float %584, i32 1
+ %587 = insertelement <4 x float> %586, float 0.000000e+00, i32 2
+ %588 = insertelement <4 x float> %587, float %575, i32 3
+ %589 = extractelement <4 x float> %588, i32 0
+ %590 = extractelement <4 x float> %588, i32 1
+ %591 = insertelement <4 x float> undef, float %589, i32 0
+ %592 = insertelement <4 x float> %591, float %590, i32 1
+ %593 = insertelement <4 x float> %592, float undef, i32 2
+ %594 = insertelement <4 x float> %593, float undef, i32 3
+ %595 = call <4 x float> @llvm.AMDGPU.tex(<4 x float> %594, i32 20, i32 4, i32 2)
+ %596 = extractelement <4 x float> %595, i32 0
+ %597 = extractelement <4 x float> %595, i32 1
+ %598 = extractelement <4 x float> %595, i32 2
+ %599 = fmul float %596, 2.000000e+00
+ %600 = fadd float %599, -1.000000e+00
+ %601 = fmul float %597, 2.000000e+00
+ %602 = fadd float %601, -1.000000e+00
+ %603 = fmul float %598, 2.000000e+00
+ %604 = fadd float %603, -1.000000e+00
+ br label %ENDIF136
+
+IF161: ; preds = %ENDIF136
+ %605 = fmul float %202, 0x3FB99999A0000000
+ %606 = fcmp uge float 0x3FE4CCCCC0000000, %605
+ %607 = select i1 %606, float 0x3FE4CCCCC0000000, float %605
+ %608 = fcmp uge float %607, 5.000000e-01
+ %609 = select i1 %608, float 5.000000e-01, float %607
+ %610 = call float @llvm.AMDGPU.lrp(float %609, float %400, float %300)
+ %611 = call float @llvm.AMDGPU.lrp(float %609, float %401, float %301)
+ %612 = call float @llvm.AMDGPU.lrp(float %609, float %402, float %302)
+ %613 = call float @llvm.AMDGPU.lrp(float %609, float %403, float %303)
+ %614 = insertelement <4 x float> undef, float %329, i32 0
+ %615 = insertelement <4 x float> %614, float %330, i32 1
+ %616 = insertelement <4 x float> %615, float %331, i32 2
+ %617 = insertelement <4 x float> %616, float 0.000000e+00, i32 3
+ %618 = insertelement <4 x float> undef, float %63, i32 0
+ %619 = insertelement <4 x float> %618, float %65, i32 1
+ %620 = insertelement <4 x float> %619, float %67, i32 2
+ %621 = insertelement <4 x float> %620, float 0.000000e+00, i32 3
+ %622 = call float @llvm.AMDGPU.dp4(<4 x float> %617, <4 x float> %621)
+ %623 = fcmp uge float 0x3FE6666660000000, %622
+ %624 = select i1 %623, float 0x3FE6666660000000, float %622
+ %625 = fmul float %8, %624
+ %626 = fmul float %13, %624
+ %627 = fmul float %18, %624
+ %628 = insertelement <4 x float> undef, float %34, i32 0
+ %629 = insertelement <4 x float> %628, float %35, i32 1
+ %630 = insertelement <4 x float> %629, float %36, i32 2
+ %631 = insertelement <4 x float> %630, float 0.000000e+00, i32 3
+ %632 = insertelement <4 x float> undef, float %63, i32 0
+ %633 = insertelement <4 x float> %632, float %65, i32 1
+ %634 = insertelement <4 x float> %633, float %67, i32 2
+ %635 = insertelement <4 x float> %634, float 0.000000e+00, i32 3
+ %636 = call float @llvm.AMDGPU.dp4(<4 x float> %631, <4 x float> %635)
+ %637 = fcmp uge float 0x3FECCCCCC0000000, %636
+ %638 = select i1 %637, float 0x3FECCCCCC0000000, float %636
+ %639 = fmul float %625, %638
+ %640 = fmul float %626, %638
+ %641 = fmul float %627, %638
+ br label %ENDIF160
+
+ENDIF160: ; preds = %ENDIF136, %IF161
+ %temp84.0 = phi float [ %610, %IF161 ], [ %255, %ENDIF136 ]
+ %temp85.0 = phi float [ %611, %IF161 ], [ %256, %ENDIF136 ]
+ %temp86.0 = phi float [ %612, %IF161 ], [ %257, %ENDIF136 ]
+ %temp87.0 = phi float [ %613, %IF161 ], [ %258, %ENDIF136 ]
+ %temp92.6 = phi float [ %639, %IF161 ], [ %415, %ENDIF136 ]
+ %temp93.0 = phi float [ %640, %IF161 ], [ 0.000000e+00, %ENDIF136 ]
+ %temp94.0 = phi float [ %641, %IF161 ], [ 0.000000e+00, %ENDIF136 ]
+ %642 = fcmp olt float 2.200000e+03, %179
+ %643 = sext i1 %642 to i32
+ %644 = bitcast i32 %643 to float
+ %645 = fcmp olt float %179, 2.300000e+03
+ %646 = sext i1 %645 to i32
+ %647 = bitcast i32 %646 to float
+ %648 = bitcast float %644 to i32
+ %649 = bitcast float %647 to i32
+ %650 = and i32 %648, %649
+ %651 = bitcast i32 %650 to float
+ %652 = bitcast float %651 to i32
+ %653 = icmp ne i32 %652, 0
+ br i1 %653, label %IF164, label %ENDIF163
+
+IF164: ; preds = %ENDIF160
+ %654 = fmul float %202, 5.000000e-01
+ %655 = fcmp uge float 0x3FE4CCCCC0000000, %654
+ %656 = select i1 %655, float 0x3FE4CCCCC0000000, float %654
+ %657 = fcmp uge float %656, 0x3FD6666660000000
+ %658 = select i1 %657, float 0x3FD6666660000000, float %656
+ %659 = call float @llvm.AMDGPU.lrp(float %658, float %400, float %300)
+ %660 = call float @llvm.AMDGPU.lrp(float %658, float %401, float %301)
+ %661 = call float @llvm.AMDGPU.lrp(float %658, float %402, float %302)
+ %662 = call float @llvm.AMDGPU.lrp(float %658, float %403, float %303)
+ %663 = insertelement <4 x float> undef, float %329, i32 0
+ %664 = insertelement <4 x float> %663, float %330, i32 1
+ %665 = insertelement <4 x float> %664, float %331, i32 2
+ %666 = insertelement <4 x float> %665, float 0.000000e+00, i32 3
+ %667 = insertelement <4 x float> undef, float %63, i32 0
+ %668 = insertelement <4 x float> %667, float %65, i32 1
+ %669 = insertelement <4 x float> %668, float %67, i32 2
+ %670 = insertelement <4 x float> %669, float 0.000000e+00, i32 3
+ %671 = call float @llvm.AMDGPU.dp4(<4 x float> %666, <4 x float> %670)
+ %672 = fcmp uge float 0x3FE6666660000000, %671
+ %673 = select i1 %672, float 0x3FE6666660000000, float %671
+ %674 = fmul float %8, %673
+ %675 = fmul float %13, %673
+ %676 = fmul float %18, %673
+ %677 = insertelement <4 x float> undef, float %34, i32 0
+ %678 = insertelement <4 x float> %677, float %35, i32 1
+ %679 = insertelement <4 x float> %678, float %36, i32 2
+ %680 = insertelement <4 x float> %679, float 0.000000e+00, i32 3
+ %681 = insertelement <4 x float> undef, float %63, i32 0
+ %682 = insertelement <4 x float> %681, float %65, i32 1
+ %683 = insertelement <4 x float> %682, float %67, i32 2
+ %684 = insertelement <4 x float> %683, float 0.000000e+00, i32 3
+ %685 = call float @llvm.AMDGPU.dp4(<4 x float> %680, <4 x float> %684)
+ %686 = fcmp uge float 0x3FECCCCCC0000000, %685
+ %687 = select i1 %686, float 0x3FECCCCCC0000000, float %685
+ %688 = fmul float %674, %687
+ %689 = fmul float %675, %687
+ %690 = fmul float %676, %687
+ br label %ENDIF163
+
+ENDIF163: ; preds = %ENDIF160, %IF164
+ %temp84.1 = phi float [ %659, %IF164 ], [ %temp84.0, %ENDIF160 ]
+ %temp85.1 = phi float [ %660, %IF164 ], [ %temp85.0, %ENDIF160 ]
+ %temp86.1 = phi float [ %661, %IF164 ], [ %temp86.0, %ENDIF160 ]
+ %temp87.1 = phi float [ %662, %IF164 ], [ %temp87.0, %ENDIF160 ]
+ %temp92.7 = phi float [ %688, %IF164 ], [ %temp92.6, %ENDIF160 ]
+ %temp93.1 = phi float [ %689, %IF164 ], [ %temp93.0, %ENDIF160 ]
+ %temp94.1 = phi float [ %690, %IF164 ], [ %temp94.0, %ENDIF160 ]
+ %691 = fcmp oge float %179, 2.300000e+03
+ %692 = sext i1 %691 to i32
+ %693 = bitcast i32 %692 to float
+ %694 = fcmp olt float %179, 2.480000e+03
+ %695 = sext i1 %694 to i32
+ %696 = bitcast i32 %695 to float
+ %697 = bitcast float %693 to i32
+ %698 = bitcast float %696 to i32
+ %699 = and i32 %697, %698
+ %700 = bitcast i32 %699 to float
+ %701 = bitcast float %700 to i32
+ %702 = icmp ne i32 %701, 0
+ br i1 %702, label %IF167, label %ENDIF166
+
+IF167: ; preds = %ENDIF163
+ %703 = fmul float %202, 5.000000e-01
+ %704 = fcmp uge float 0x3FE4CCCCC0000000, %703
+ %705 = select i1 %704, float 0x3FE4CCCCC0000000, float %703
+ %706 = fcmp uge float %705, 0x3FD3333340000000
+ %707 = select i1 %706, float 0x3FD3333340000000, float %705
+ %708 = call float @llvm.AMDGPU.lrp(float %707, float %409, float %300)
+ %709 = call float @llvm.AMDGPU.lrp(float %707, float %410, float %301)
+ %710 = call float @llvm.AMDGPU.lrp(float %707, float %411, float %302)
+ %711 = call float @llvm.AMDGPU.lrp(float %707, float %412, float %303)
+ %712 = insertelement <4 x float> undef, float %329, i32 0
+ %713 = insertelement <4 x float> %712, float %330, i32 1
+ %714 = insertelement <4 x float> %713, float %331, i32 2
+ %715 = insertelement <4 x float> %714, float 0.000000e+00, i32 3
+ %716 = insertelement <4 x float> undef, float %63, i32 0
+ %717 = insertelement <4 x float> %716, float %65, i32 1
+ %718 = insertelement <4 x float> %717, float %67, i32 2
+ %719 = insertelement <4 x float> %718, float 0.000000e+00, i32 3
+ %720 = call float @llvm.AMDGPU.dp4(<4 x float> %715, <4 x float> %719)
+ %721 = fcmp uge float 0x3FEB333340000000, %720
+ %722 = select i1 %721, float 0x3FEB333340000000, float %720
+ %723 = fmul float %8, %722
+ %724 = fmul float %13, %722
+ %725 = fmul float %18, %722
+ %726 = insertelement <4 x float> undef, float %34, i32 0
+ %727 = insertelement <4 x float> %726, float %35, i32 1
+ %728 = insertelement <4 x float> %727, float %36, i32 2
+ %729 = insertelement <4 x float> %728, float 0.000000e+00, i32 3
+ %730 = insertelement <4 x float> undef, float %63, i32 0
+ %731 = insertelement <4 x float> %730, float %65, i32 1
+ %732 = insertelement <4 x float> %731, float %67, i32 2
+ %733 = insertelement <4 x float> %732, float 0.000000e+00, i32 3
+ %734 = call float @llvm.AMDGPU.dp4(<4 x float> %729, <4 x float> %733)
+ %735 = fcmp uge float 0x3FECCCCCC0000000, %734
+ %736 = select i1 %735, float 0x3FECCCCCC0000000, float %734
+ %737 = fmul float %723, %736
+ %738 = fmul float %724, %736
+ %739 = fmul float %725, %736
+ br label %ENDIF166
+
+ENDIF166: ; preds = %ENDIF163, %IF167
+ %temp84.2 = phi float [ %708, %IF167 ], [ %temp84.1, %ENDIF163 ]
+ %temp85.2 = phi float [ %709, %IF167 ], [ %temp85.1, %ENDIF163 ]
+ %temp86.2 = phi float [ %710, %IF167 ], [ %temp86.1, %ENDIF163 ]
+ %temp87.2 = phi float [ %711, %IF167 ], [ %temp87.1, %ENDIF163 ]
+ %temp92.8 = phi float [ %737, %IF167 ], [ %temp92.7, %ENDIF163 ]
+ %temp93.2 = phi float [ %738, %IF167 ], [ %temp93.1, %ENDIF163 ]
+ %temp94.2 = phi float [ %739, %IF167 ], [ %temp94.1, %ENDIF163 ]
+ %740 = fcmp oge float %179, 2.480000e+03
+ %741 = sext i1 %740 to i32
+ %742 = bitcast i32 %741 to float
+ %743 = fcmp olt float %179, 2.530000e+03
+ %744 = sext i1 %743 to i32
+ %745 = bitcast i32 %744 to float
+ %746 = bitcast float %742 to i32
+ %747 = bitcast float %745 to i32
+ %748 = and i32 %746, %747
+ %749 = bitcast i32 %748 to float
+ %750 = bitcast float %749 to i32
+ %751 = icmp ne i32 %750, 0
+ br i1 %751, label %IF170, label %ENDIF169
+
+IF170: ; preds = %ENDIF166
+ %752 = fmul float %202, 5.000000e-01
+ %753 = fcmp uge float 0x3FE4CCCCC0000000, %752
+ %754 = select i1 %753, float 0x3FE4CCCCC0000000, float %752
+ %755 = fcmp uge float %754, 0x3FC99999A0000000
+ %756 = select i1 %755, float 0x3FC99999A0000000, float %754
+ %757 = call float @llvm.AMDGPU.lrp(float %756, float %409, float %300)
+ %758 = call float @llvm.AMDGPU.lrp(float %756, float %410, float %301)
+ %759 = call float @llvm.AMDGPU.lrp(float %756, float %411, float %302)
+ %760 = call float @llvm.AMDGPU.lrp(float %756, float %412, float %303)
+ %761 = insertelement <4 x float> undef, float %329, i32 0
+ %762 = insertelement <4 x float> %761, float %330, i32 1
+ %763 = insertelement <4 x float> %762, float %331, i32 2
+ %764 = insertelement <4 x float> %763, float 0.000000e+00, i32 3
+ %765 = insertelement <4 x float> undef, float %63, i32 0
+ %766 = insertelement <4 x float> %765, float %65, i32 1
+ %767 = insertelement <4 x float> %766, float %67, i32 2
+ %768 = insertelement <4 x float> %767, float 0.000000e+00, i32 3
+ %769 = call float @llvm.AMDGPU.dp4(<4 x float> %764, <4 x float> %768)
+ %770 = fcmp uge float 0x3FEB333340000000, %769
+ %771 = select i1 %770, float 0x3FEB333340000000, float %769
+ %772 = fmul float %8, %771
+ %773 = fmul float %13, %771
+ %774 = fmul float %18, %771
+ %775 = insertelement <4 x float> undef, float %34, i32 0
+ %776 = insertelement <4 x float> %775, float %35, i32 1
+ %777 = insertelement <4 x float> %776, float %36, i32 2
+ %778 = insertelement <4 x float> %777, float 0.000000e+00, i32 3
+ %779 = insertelement <4 x float> undef, float %63, i32 0
+ %780 = insertelement <4 x float> %779, float %65, i32 1
+ %781 = insertelement <4 x float> %780, float %67, i32 2
+ %782 = insertelement <4 x float> %781, float 0.000000e+00, i32 3
+ %783 = call float @llvm.AMDGPU.dp4(<4 x float> %778, <4 x float> %782)
+ %784 = fcmp uge float 0x3FECCCCCC0000000, %783
+ %785 = select i1 %784, float 0x3FECCCCCC0000000, float %783
+ %786 = fmul float %772, %785
+ %787 = fmul float %773, %785
+ %788 = fmul float %774, %785
+ br label %ENDIF169
+
+ENDIF169: ; preds = %ENDIF166, %IF170
+ %temp84.3 = phi float [ %757, %IF170 ], [ %temp84.2, %ENDIF166 ]
+ %temp85.3 = phi float [ %758, %IF170 ], [ %temp85.2, %ENDIF166 ]
+ %temp86.3 = phi float [ %759, %IF170 ], [ %temp86.2, %ENDIF166 ]
+ %temp87.3 = phi float [ %760, %IF170 ], [ %temp87.2, %ENDIF166 ]
+ %temp92.9 = phi float [ %786, %IF170 ], [ %temp92.8, %ENDIF166 ]
+ %temp93.3 = phi float [ %787, %IF170 ], [ %temp93.2, %ENDIF166 ]
+ %temp94.3 = phi float [ %788, %IF170 ], [ %temp94.2, %ENDIF166 ]
+ %789 = fcmp oge float %179, 2.530000e+03
+ %790 = sext i1 %789 to i32
+ %791 = bitcast i32 %790 to float
+ %792 = fcmp olt float %179, 2.670000e+03
+ %793 = sext i1 %792 to i32
+ %794 = bitcast i32 %793 to float
+ %795 = bitcast float %791 to i32
+ %796 = bitcast float %794 to i32
+ %797 = and i32 %795, %796
+ %798 = bitcast i32 %797 to float
+ %799 = bitcast float %798 to i32
+ %800 = icmp ne i32 %799, 0
+ br i1 %800, label %IF173, label %ENDIF172
+
+IF173: ; preds = %ENDIF169
+ %801 = fmul float %202, 5.000000e-01
+ %802 = fcmp uge float 0x3FE4CCCCC0000000, %801
+ %803 = select i1 %802, float 0x3FE4CCCCC0000000, float %801
+ %804 = fcmp uge float %803, 0x3FB99999A0000000
+ %805 = select i1 %804, float 0x3FB99999A0000000, float %803
+ %806 = call float @llvm.AMDGPU.lrp(float %805, float %400, float %300)
+ %807 = call float @llvm.AMDGPU.lrp(float %805, float %401, float %301)
+ %808 = call float @llvm.AMDGPU.lrp(float %805, float %402, float %302)
+ %809 = call float @llvm.AMDGPU.lrp(float %805, float %403, float %303)
+ %810 = insertelement <4 x float> undef, float %329, i32 0
+ %811 = insertelement <4 x float> %810, float %330, i32 1
+ %812 = insertelement <4 x float> %811, float %331, i32 2
+ %813 = insertelement <4 x float> %812, float 0.000000e+00, i32 3
+ %814 = insertelement <4 x float> undef, float %63, i32 0
+ %815 = insertelement <4 x float> %814, float %65, i32 1
+ %816 = insertelement <4 x float> %815, float %67, i32 2
+ %817 = insertelement <4 x float> %816, float 0.000000e+00, i32 3
+ %818 = call float @llvm.AMDGPU.dp4(<4 x float> %813, <4 x float> %817)
+ %819 = fcmp uge float 0x3FEB333340000000, %818
+ %820 = select i1 %819, float 0x3FEB333340000000, float %818
+ %821 = fmul float %8, %820
+ %822 = fmul float %13, %820
+ %823 = fmul float %18, %820
+ %824 = insertelement <4 x float> undef, float %34, i32 0
+ %825 = insertelement <4 x float> %824, float %35, i32 1
+ %826 = insertelement <4 x float> %825, float %36, i32 2
+ %827 = insertelement <4 x float> %826, float 0.000000e+00, i32 3
+ %828 = insertelement <4 x float> undef, float %63, i32 0
+ %829 = insertelement <4 x float> %828, float %65, i32 1
+ %830 = insertelement <4 x float> %829, float %67, i32 2
+ %831 = insertelement <4 x float> %830, float 0.000000e+00, i32 3
+ %832 = call float @llvm.AMDGPU.dp4(<4 x float> %827, <4 x float> %831)
+ %833 = fcmp uge float 0x3FECCCCCC0000000, %832
+ %834 = select i1 %833, float 0x3FECCCCCC0000000, float %832
+ %835 = fmul float %821, %834
+ %836 = fmul float %822, %834
+ %837 = fmul float %823, %834
+ br label %ENDIF172
+
+ENDIF172: ; preds = %ENDIF169, %IF173
+ %temp84.4 = phi float [ %806, %IF173 ], [ %temp84.3, %ENDIF169 ]
+ %temp85.4 = phi float [ %807, %IF173 ], [ %temp85.3, %ENDIF169 ]
+ %temp86.4 = phi float [ %808, %IF173 ], [ %temp86.3, %ENDIF169 ]
+ %temp87.4 = phi float [ %809, %IF173 ], [ %temp87.3, %ENDIF169 ]
+ %temp92.10 = phi float [ %835, %IF173 ], [ %temp92.9, %ENDIF169 ]
+ %temp93.4 = phi float [ %836, %IF173 ], [ %temp93.3, %ENDIF169 ]
+ %temp94.4 = phi float [ %837, %IF173 ], [ %temp94.3, %ENDIF169 ]
+ %838 = fcmp oge float %179, 2.670000e+03
+ %839 = sext i1 %838 to i32
+ %840 = bitcast i32 %839 to float
+ %841 = bitcast float %840 to i32
+ %842 = icmp ne i32 %841, 0
+ br i1 %842, label %IF176, label %ENDIF175
+
+IF176: ; preds = %ENDIF172
+ %843 = fmul float %202, 0x3FB99999A0000000
+ %844 = fcmp uge float 0.000000e+00, %843
+ %845 = select i1 %844, float 0.000000e+00, float %843
+ %846 = fcmp uge float %845, 0x3FD99999A0000000
+ %847 = select i1 %846, float 0x3FD99999A0000000, float %845
+ %848 = call float @llvm.AMDGPU.lrp(float %847, float %400, float %300)
+ %849 = call float @llvm.AMDGPU.lrp(float %847, float %401, float %301)
+ %850 = call float @llvm.AMDGPU.lrp(float %847, float %402, float %302)
+ %851 = call float @llvm.AMDGPU.lrp(float %847, float %403, float %303)
+ %852 = insertelement <4 x float> undef, float %329, i32 0
+ %853 = insertelement <4 x float> %852, float %330, i32 1
+ %854 = insertelement <4 x float> %853, float %331, i32 2
+ %855 = insertelement <4 x float> %854, float 0.000000e+00, i32 3
+ %856 = insertelement <4 x float> undef, float %63, i32 0
+ %857 = insertelement <4 x float> %856, float %65, i32 1
+ %858 = insertelement <4 x float> %857, float %67, i32 2
+ %859 = insertelement <4 x float> %858, float 0.000000e+00, i32 3
+ %860 = call float @llvm.AMDGPU.dp4(<4 x float> %855, <4 x float> %859)
+ %861 = fcmp uge float 0x3FEB333340000000, %860
+ %862 = select i1 %861, float 0x3FEB333340000000, float %860
+ %863 = fmul float %8, %862
+ %864 = fmul float %13, %862
+ %865 = fmul float %18, %862
+ %866 = insertelement <4 x float> undef, float %34, i32 0
+ %867 = insertelement <4 x float> %866, float %35, i32 1
+ %868 = insertelement <4 x float> %867, float %36, i32 2
+ %869 = insertelement <4 x float> %868, float 0.000000e+00, i32 3
+ %870 = insertelement <4 x float> undef, float %63, i32 0
+ %871 = insertelement <4 x float> %870, float %65, i32 1
+ %872 = insertelement <4 x float> %871, float %67, i32 2
+ %873 = insertelement <4 x float> %872, float 0.000000e+00, i32 3
+ %874 = call float @llvm.AMDGPU.dp4(<4 x float> %869, <4 x float> %873)
+ %875 = fcmp uge float 0x3FECCCCCC0000000, %874
+ %876 = select i1 %875, float 0x3FECCCCCC0000000, float %874
+ %877 = fmul float %863, %876
+ %878 = fmul float %864, %876
+ %879 = fmul float %865, %876
+ br label %ENDIF175
+
+ENDIF175: ; preds = %ENDIF172, %IF176
+ %temp84.5 = phi float [ %848, %IF176 ], [ %temp84.4, %ENDIF172 ]
+ %temp85.5 = phi float [ %849, %IF176 ], [ %temp85.4, %ENDIF172 ]
+ %temp86.5 = phi float [ %850, %IF176 ], [ %temp86.4, %ENDIF172 ]
+ %temp87.5 = phi float [ %851, %IF176 ], [ %temp87.4, %ENDIF172 ]
+ %temp92.11 = phi float [ %877, %IF176 ], [ %temp92.10, %ENDIF172 ]
+ %temp93.5 = phi float [ %878, %IF176 ], [ %temp93.4, %ENDIF172 ]
+ %temp94.5 = phi float [ %879, %IF176 ], [ %temp94.4, %ENDIF172 ]
+ %880 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
+ %881 = extractelement <4 x float> %880, i32 0
+ %882 = fcmp olt float %881, %179
+ %883 = sext i1 %882 to i32
+ %884 = bitcast i32 %883 to float
+ %885 = bitcast float %884 to i32
+ %886 = icmp ne i32 %885, 0
+ br i1 %886, label %IF179, label %ENDIF178
+
+IF179: ; preds = %ENDIF175
+ %887 = fadd float %202, 1.000000e+00
+ %888 = fadd float %202, 1.000000e+00
+ %889 = fadd float %202, 1.000000e+00
+ %890 = insertelement <4 x float> undef, float %43, i32 0
+ %891 = insertelement <4 x float> %890, float %44, i32 1
+ %892 = insertelement <4 x float> %891, float %45, i32 2
+ %893 = insertelement <4 x float> %892, float 0.000000e+00, i32 3
+ %894 = insertelement <4 x float> undef, float %43, i32 0
+ %895 = insertelement <4 x float> %894, float %44, i32 1
+ %896 = insertelement <4 x float> %895, float %45, i32 2
+ %897 = insertelement <4 x float> %896, float 0.000000e+00, i32 3
+ %898 = call float @llvm.AMDGPU.dp4(<4 x float> %893, <4 x float> %897)
+ %899 = call float @llvm.AMDGPU.rsq(float %898)
+ %900 = fmul float %45, %899
+ %901 = call float @fabs(float %900)
+ %902 = fmul float %176, 0x3FECCCCCC0000000
+ %903 = fadd float %902, %901
+ %904 = fadd float %903, 0xBFEFAE1480000000
+ %905 = fmul float %904, 0xC043FFFE20000000
+ %906 = call float @llvm.AMDIL.clamp.(float %905, float 0.000000e+00, float 1.000000e+00)
+ %907 = fmul float 2.000000e+00, %906
+ %908 = fsub float -0.000000e+00, %907
+ %909 = fadd float 3.000000e+00, %908
+ %910 = fmul float %906, %909
+ %911 = fmul float %906, %910
+ %912 = call float @llvm.AMDGPU.lrp(float %911, float %temp84.5, float %887)
+ %913 = call float @llvm.AMDGPU.lrp(float %911, float %temp85.5, float %888)
+ %914 = call float @llvm.AMDGPU.lrp(float %911, float %temp86.5, float %889)
+ %915 = call float @llvm.AMDGPU.lrp(float %911, float %temp87.5, float 0.000000e+00)
+ %916 = fmul float %202, 5.000000e-01
+ %917 = fcmp uge float 0x3FE4CCCCC0000000, %916
+ %918 = select i1 %917, float 0x3FE4CCCCC0000000, float %916
+ %919 = fcmp uge float %918, 0x3FE3333340000000
+ %920 = select i1 %919, float 0x3FE3333340000000, float %918
+ %921 = call float @llvm.AMDGPU.lrp(float %920, float %912, float %temp84.5)
+ %922 = call float @llvm.AMDGPU.lrp(float %920, float %913, float %temp85.5)
+ %923 = call float @llvm.AMDGPU.lrp(float %920, float %914, float %temp86.5)
+ %924 = call float @llvm.AMDGPU.lrp(float %920, float %915, float %temp87.5)
+ %925 = insertelement <4 x float> undef, float %329, i32 0
+ %926 = insertelement <4 x float> %925, float %330, i32 1
+ %927 = insertelement <4 x float> %926, float %331, i32 2
+ %928 = insertelement <4 x float> %927, float 0.000000e+00, i32 3
+ %929 = insertelement <4 x float> undef, float %63, i32 0
+ %930 = insertelement <4 x float> %929, float %65, i32 1
+ %931 = insertelement <4 x float> %930, float %67, i32 2
+ %932 = insertelement <4 x float> %931, float 0.000000e+00, i32 3
+ %933 = call float @llvm.AMDGPU.dp4(<4 x float> %928, <4 x float> %932)
+ %934 = fcmp uge float 0x3FE99999A0000000, %933
+ %935 = select i1 %934, float 0x3FE99999A0000000, float %933
+ %936 = fmul float %8, %935
+ %937 = fmul float %13, %935
+ %938 = fmul float %18, %935
+ %939 = insertelement <4 x float> undef, float %34, i32 0
+ %940 = insertelement <4 x float> %939, float %35, i32 1
+ %941 = insertelement <4 x float> %940, float %36, i32 2
+ %942 = insertelement <4 x float> %941, float 0.000000e+00, i32 3
+ %943 = insertelement <4 x float> undef, float %63, i32 0
+ %944 = insertelement <4 x float> %943, float %65, i32 1
+ %945 = insertelement <4 x float> %944, float %67, i32 2
+ %946 = insertelement <4 x float> %945, float 0.000000e+00, i32 3
+ %947 = call float @llvm.AMDGPU.dp4(<4 x float> %942, <4 x float> %946)
+ %948 = fcmp uge float 0x3FECCCCCC0000000, %947
+ %949 = select i1 %948, float 0x3FECCCCCC0000000, float %947
+ %950 = fmul float %936, %949
+ %951 = fmul float %937, %949
+ %952 = fmul float %938, %949
+ br label %ENDIF178
+
+ENDIF178: ; preds = %ENDIF175, %IF179
+ %temp84.6 = phi float [ %921, %IF179 ], [ %temp84.5, %ENDIF175 ]
+ %temp85.6 = phi float [ %922, %IF179 ], [ %temp85.5, %ENDIF175 ]
+ %temp86.6 = phi float [ %923, %IF179 ], [ %temp86.5, %ENDIF175 ]
+ %temp87.6 = phi float [ %924, %IF179 ], [ %temp87.5, %ENDIF175 ]
+ %temp92.12 = phi float [ %950, %IF179 ], [ %temp92.11, %ENDIF175 ]
+ %temp93.6 = phi float [ %951, %IF179 ], [ %temp93.5, %ENDIF175 ]
+ %temp94.6 = phi float [ %952, %IF179 ], [ %temp94.5, %ENDIF175 ]
+ %953 = fmul float %55, %temp92.12
+ %954 = fmul float %57, %temp93.6
+ %955 = fmul float %59, %temp94.6
+ %956 = fmul float %61, 0.000000e+00
+ %957 = fmul float %temp84.6, %953
+ %958 = fmul float %temp85.6, %954
+ %959 = fmul float %temp86.6, %955
+ %960 = fmul float %temp87.6, %956
+ %961 = fmul float %2, -2.000000e+00
+ %962 = fadd float %961, 1.000000e+00
+ %963 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 23)
+ %964 = extractelement <4 x float> %963, i32 2
+ %965 = fsub float -0.000000e+00, %964
+ %966 = fadd float %962, %965
+ %967 = fdiv float 1.000000e+00, %966
+ %968 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 24)
+ %969 = extractelement <4 x float> %968, i32 2
+ %970 = fmul float %969, %967
+ %971 = fsub float -0.000000e+00, %53
+ %972 = fmul float %971, %53
+ %973 = fmul float %972, %970
+ %974 = fmul float %973, %970
+ %975 = fmul float %974, 0x3FF7154760000000
+ %976 = call float @llvm.AMDIL.exp.(float %975)
+ %977 = fcmp oeq float %53, 1.000000e+00
+ %978 = sext i1 %977 to i32
+ %979 = bitcast i32 %978 to float
+ %980 = bitcast float %979 to i32
+ %981 = icmp ne i32 %980, 0
+ %.184 = select i1 %981, float 1.000000e+00, float %976
+ %982 = call float @llvm.AMDGPU.lrp(float %.184, float %957, float %47)
+ %983 = call float @llvm.AMDGPU.lrp(float %.184, float %958, float %49)
+ %984 = call float @llvm.AMDGPU.lrp(float %.184, float %959, float %51)
+ %985 = insertelement <4 x float> undef, float %982, i32 0
+ %986 = insertelement <4 x float> %985, float %983, i32 1
+ %987 = insertelement <4 x float> %986, float %984, i32 2
+ %988 = insertelement <4 x float> %987, float %960, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %988, i32 0, i32 0)
+ ret void
+}
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.rsq(float) #1
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1
+
+; Function Attrs: readonly
+declare float @fabs(float) #2
+
+; Function Attrs: readnone
+declare float @llvm.AMDIL.exp.(float) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.lrp(float, float, float) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDIL.clamp.(float, float, float) #1
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { readnone }
+attributes #2 = { readonly }
diff --git a/test/CodeGen/R600/bitcast.ll b/test/CodeGen/R600/bitcast.ll
new file mode 100644
index 0000000..bccc416
--- /dev/null
+++ b/test/CodeGen/R600/bitcast.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
+
+; This test just checks that the compiler doesn't crash.
+; CHECK-LABEL: @v32i8_to_v8i32
+; CHECK: S_ENDPGM
+
+define void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 {
+entry:
+ %1 = load <32 x i8> addrspace(2)* %0
+ %2 = bitcast <32 x i8> %1 to <8 x i32>
+ %3 = extractelement <8 x i32> %2, i32 1
+ %4 = icmp ne i32 %3, 0
+ %5 = select i1 %4, float 0.0, float 1.0
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5)
+ ret void
+}
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+
diff --git a/test/CodeGen/R600/build_vector.ll b/test/CodeGen/R600/build_vector.ll
index 9b738a2..8179de1 100644
--- a/test/CodeGen/R600/build_vector.ll
+++ b/test/CodeGen/R600/build_vector.ll
@@ -1,14 +1,14 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
; R600-CHECK: @build_vector2
; R600-CHECK: MOV
; R600-CHECK: MOV
; R600-CHECK-NOT: MOV
; SI-CHECK: @build_vector2
-; SI-CHECK-DAG: V_MOV_B32_e32 [[X:VGPR[0-9]]], 5
-; SI-CHECK-DAG: V_MOV_B32_e32 [[Y:VGPR[0-9]]], 6
-; SI-CHECK: BUFFER_STORE_DWORDX2 [[X]]_[[Y]]
+; SI-CHECK-DAG: V_MOV_B32_e32 v[[X:[0-9]]], 5
+; SI-CHECK-DAG: V_MOV_B32_e32 v[[Y:[0-9]]], 6
+; SI-CHECK: BUFFER_STORE_DWORDX2 v{{\[}}[[X]]:[[Y]]{{\]}}
define void @build_vector2 (<2 x i32> addrspace(1)* %out) {
entry:
store <2 x i32> <i32 5, i32 6>, <2 x i32> addrspace(1)* %out
@@ -22,11 +22,11 @@ entry:
; R600-CHECK: MOV
; R600-CHECK-NOT: MOV
; SI-CHECK: @build_vector4
-; SI-CHECK-DAG: V_MOV_B32_e32 [[X:VGPR[0-9]]], 5
-; SI-CHECK-DAG: V_MOV_B32_e32 [[Y:VGPR[0-9]]], 6
-; SI-CHECK-DAG: V_MOV_B32_e32 [[Z:VGPR[0-9]]], 7
-; SI-CHECK-DAG: V_MOV_B32_e32 [[W:VGPR[0-9]]], 8
-; SI-CHECK: BUFFER_STORE_DWORDX4 [[X]]_[[Y]]_[[Z]]_[[W]]
+; SI-CHECK-DAG: V_MOV_B32_e32 v[[X:[0-9]]], 5
+; SI-CHECK-DAG: V_MOV_B32_e32 v[[Y:[0-9]]], 6
+; SI-CHECK-DAG: V_MOV_B32_e32 v[[Z:[0-9]]], 7
+; SI-CHECK-DAG: V_MOV_B32_e32 v[[W:[0-9]]], 8
+; SI-CHECK: BUFFER_STORE_DWORDX4 v{{\[}}[[X]]:[[W]]{{\]}}
define void @build_vector4 (<4 x i32> addrspace(1)* %out) {
entry:
store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> addrspace(1)* %out
diff --git a/test/CodeGen/R600/combine_vloads.ll b/test/CodeGen/R600/combine_vloads.ll
new file mode 100644
index 0000000..f8ec712
--- /dev/null
+++ b/test/CodeGen/R600/combine_vloads.ll
@@ -0,0 +1,42 @@
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+
+;
+; kernel void combine_vloads(global char8* src, global char8* result) {
+; for (int i = 0; i < 1024; ++i)
+; result[i] = src[0] + src[1] + src[2] + src[3];
+; }
+;
+
+
+; Check that the loads are combined into 128-bit loads instead of many 8-bit loads
+; from the four consecutive <8 x i8> accesses in the loop body.
+; EG-LABEL: @combine_vloads:
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind {
+entry:
+ br label %for.body
+
+for.exit: ; preds = %for.body
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %i.01 = phi i32 [ 0, %entry ], [ %tmp19, %for.body ]
+ %arrayidx_v4 = bitcast <8 x i8> addrspace(1)* %src to <32 x i8> addrspace(1)*
+ %0 = bitcast <32 x i8> addrspace(1)* %arrayidx_v4 to <8 x i32> addrspace(1)*
+ %vecload2 = load <8 x i32> addrspace(1)* %0, align 32
+ %1 = bitcast <8 x i32> %vecload2 to <32 x i8>
+ %tmp5 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %tmp8 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %tmp9 = add nsw <8 x i8> %tmp5, %tmp8
+ %tmp12 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ %tmp13 = add nsw <8 x i8> %tmp9, %tmp12
+ %tmp16 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %tmp17 = add nsw <8 x i8> %tmp13, %tmp16
+ %scevgep = getelementptr <8 x i8> addrspace(1)* %result, i32 %i.01
+ %2 = bitcast <8 x i8> %tmp17 to <2 x i32>
+ %3 = bitcast <8 x i8> addrspace(1)* %scevgep to <2 x i32> addrspace(1)*
+ store <2 x i32> %2, <2 x i32> addrspace(1)* %3, align 8
+ %tmp19 = add nsw i32 %i.01, 1
+ %exitcond = icmp eq i32 %tmp19, 1024
+ br i1 %exitcond, label %for.exit, label %for.body
+}
diff --git a/test/CodeGen/R600/complex-folding.ll b/test/CodeGen/R600/complex-folding.ll
new file mode 100644
index 0000000..99f0d99
--- /dev/null
+++ b/test/CodeGen/R600/complex-folding.ll
@@ -0,0 +1,19 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @main
+; CHECK-NOT: MOV
+define void @main(<4 x float> inreg %reg0) #0 {
+entry:
+ %0 = extractelement <4 x float> %reg0, i32 0
+ %1 = call float @fabs(float %0)
+ %2 = fptoui float %1 to i32
+ %3 = bitcast i32 %2 to float
+ %4 = insertelement <4 x float> undef, float %3, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %4, i32 0, i32 0)
+ ret void
+}
+
+declare float @fabs(float ) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" } \ No newline at end of file
diff --git a/test/CodeGen/R600/elf.ll b/test/CodeGen/R600/elf.ll
index f460f13..9385150 100644
--- a/test/CodeGen/R600/elf.ll
+++ b/test/CodeGen/R600/elf.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=r600 -mcpu=SI -filetype=obj | llvm-readobj -s - | FileCheck --check-prefix=ELF-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=SI -o - | FileCheck --check-prefix=CONFIG-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs -filetype=obj | llvm-readobj -s - | FileCheck --check-prefix=ELF-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG-CHECK %s
; ELF-CHECK: Format: ELF32
; ELF-CHECK: Name: .AMDGPU.config
diff --git a/test/CodeGen/R600/extload.ll b/test/CodeGen/R600/extload.ll
new file mode 100644
index 0000000..aa660b3
--- /dev/null
+++ b/test/CodeGen/R600/extload.ll
@@ -0,0 +1,51 @@
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+
+; EG-LABEL: @anyext_load_i8:
+; EG: AND_INT
+; EG-NEXT: 255
+define void @anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
+ %cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)*
+ %load = load i32 addrspace(1)* %cast, align 1
+ %x = bitcast i32 %load to <4 x i8>
+ %castOut = bitcast i8 addrspace(1)* %out to <4 x i8> addrspace(1)*
+ store <4 x i8> %x, <4 x i8> addrspace(1)* %castOut, align 1
+ ret void
+}
+
+; EG-LABEL: @anyext_load_i16:
+; EG: AND_INT
+; EG: LSHL
+; EG: 65535
+define void @anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
+ %cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)*
+ %load = load i32 addrspace(1)* %cast, align 1
+ %x = bitcast i32 %load to <2 x i16>
+ %castOut = bitcast i16 addrspace(1)* %out to <2 x i16> addrspace(1)*
+ store <2 x i16> %x, <2 x i16> addrspace(1)* %castOut, align 1
+ ret void
+}
+
+; EG-LABEL: @anyext_load_lds_i8:
+; EG: AND_INT
+; EG-NEXT: 255
+define void @anyext_load_lds_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
+ %cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)*
+ %load = load i32 addrspace(3)* %cast, align 1
+ %x = bitcast i32 %load to <4 x i8>
+ %castOut = bitcast i8 addrspace(3)* %out to <4 x i8> addrspace(3)*
+ store <4 x i8> %x, <4 x i8> addrspace(3)* %castOut, align 1
+ ret void
+}
+
+; EG-LABEL: @anyext_load_lds_i16:
+; EG: AND_INT
+; EG: LSHL
+; EG: 65535
+define void @anyext_load_lds_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
+ %cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)*
+ %load = load i32 addrspace(3)* %cast, align 1
+ %x = bitcast i32 %load to <2 x i16>
+ %castOut = bitcast i16 addrspace(3)* %out to <2 x i16> addrspace(3)*
+ store <2 x i16> %x, <2 x i16> addrspace(3)* %castOut, align 1
+ ret void
+}
diff --git a/test/CodeGen/R600/fabs.ll b/test/CodeGen/R600/fabs.ll
index 78ffd57..a5f5df9 100644
--- a/test/CodeGen/R600/fabs.ll
+++ b/test/CodeGen/R600/fabs.ll
@@ -1,15 +1,15 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
; DAGCombiner will transform:
; (fabs (f32 bitcast (i32 a))) => (f32 bitcast (and (i32 a), 0x7FFFFFFF))
; unless isFabsFree returns true
-; R600-CHECK: @fabs_free
+; R600-CHECK-LABEL: @fabs_free
; R600-CHECK-NOT: AND
; R600-CHECK: |PV.{{[XYZW]}}|
-; SI-CHECK: @fabs_free
-; SI-CHECK: V_ADD_F32_e64 VGPR{{[0-9]}}, SGPR{{[0-9]}}, 0, 1, 0, 0, 0
+; SI-CHECK-LABEL: @fabs_free
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 1, 0, 0, 0
define void @fabs_free(float addrspace(1)* %out, i32 %in) {
entry:
@@ -19,4 +19,36 @@ entry:
ret void
}
+; R600-CHECK-LABEL: @fabs_v2
+; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
+; SI-CHECK-LABEL: @fabs_v2
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 1, 0, 0, 0
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 1, 0, 0, 0
+define void @fabs_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+ %0 = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK-LABEL: @fabs_v4
+; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
+; R600-CHECK: |{{(PV|T[0-9])\.[XYZW]}}|
+; SI-CHECK-LABEL: @fabs_v4
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 1, 0, 0, 0
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 1, 0, 0, 0
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 1, 0, 0, 0
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 1, 0, 0, 0
+define void @fabs_v4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+entry:
+ %0 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
+ store <4 x float> %0, <4 x float> addrspace(1)* %out
+ ret void
+}
+
declare float @fabs(float ) readnone
+declare <2 x float> @llvm.fabs.v2f32(<2 x float> ) readnone
+declare <4 x float> @llvm.fabs.v4f32(<4 x float> ) readnone
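Editor's note on the DAGCombiner fold referenced in fabs.ll above: when the target does not report fabs as free, the combine rewrites a fabs of a bitcast integer into a mask of the sign bit. A minimal IR-level sketch of the two equivalent forms, using a hypothetical argument %a purely for illustration (not part of the test):

  ; what the source looks like before the combine
  define float @fabs_of_bitcast(i32 %a) {
    %f = bitcast i32 %a to float
    %r = call float @fabs(float %f)
    ret float %r
  }
  declare float @fabs(float) readnone

  ; what the combine would produce on a target without free fabs
  define float @fabs_combined(i32 %a) {
    %m = and i32 %a, 2147483647      ; 0x7FFFFFFF clears only the sign bit
    %r = bitcast i32 %m to float
    ret float %r
  }

The R600 checks assert that no AND appears, because the hardware has a free |src| source modifier; the SI checks accept the V_ADD_F32_e64 form with the abs bit set.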
diff --git a/test/CodeGen/R600/fadd.ll b/test/CodeGen/R600/fadd.ll
index 97dbe44..f467bb7 100644
--- a/test/CodeGen/R600/fadd.ll
+++ b/test/CodeGen/R600/fadd.ll
@@ -1,23 +1,23 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
-; CHECK: @fadd_f32
-; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-define void @fadd_f32() {
- %r0 = call float @llvm.R600.load.input(i32 0)
- %r1 = call float @llvm.R600.load.input(i32 1)
- %r2 = fadd float %r0, %r1
- call void @llvm.AMDGPU.store.output(float %r2, i32 0)
+; R600-CHECK: @fadd_f32
+; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
+; SI-CHECK: @fadd_f32
+; SI-CHECK: V_ADD_F32
+define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fadd float %a, %b
+ store float %0, float addrspace(1)* %out
ret void
}
-declare float @llvm.R600.load.input(i32) readnone
-
-declare void @llvm.AMDGPU.store.output(float, i32)
-
-; CHECK: @fadd_v2f32
-; CHECK-DAG: ADD * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
-; CHECK-DAG: ADD * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
+; R600-CHECK: @fadd_v2f32
+; R600-CHECK-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
+; R600-CHECK-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
+; SI-CHECK: @fadd_v2f32
+; SI-CHECK: V_ADD_F32
+; SI-CHECK: V_ADD_F32
define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
entry:
%0 = fadd <2 x float> %a, %b
@@ -25,12 +25,16 @@ entry:
ret void
}
-; CHECK: @fadd_v4f32
-; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
+; R600-CHECK: @fadd_v4f32
+; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; SI-CHECK: @fadd_v4f32
+; SI-CHECK: V_ADD_F32
+; SI-CHECK: V_ADD_F32
+; SI-CHECK: V_ADD_F32
+; SI-CHECK: V_ADD_F32
define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float> addrspace(1) * %in
diff --git a/test/CodeGen/R600/fadd64.ll b/test/CodeGen/R600/fadd64.ll
index 130302f..48cd3cf 100644
--- a/test/CodeGen/R600/fadd64.ll
+++ b/test/CodeGen/R600/fadd64.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
; CHECK: @fadd_f64
-; CHECK: V_ADD_F64 {{VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+; CHECK: V_ADD_F64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}
define void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
diff --git a/test/CodeGen/R600/fcmp-cnd.ll b/test/CodeGen/R600/fcmp-cnd.ll
index 7373a21..1d4e323 100644
--- a/test/CodeGen/R600/fcmp-cnd.ll
+++ b/test/CodeGen/R600/fcmp-cnd.ll
@@ -2,7 +2,7 @@
;Not checking arguments 2 and 3 to CNDE, because they may change between
;registers and literal.x depending on what the optimizer does.
-;CHECK: CNDE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
entry:
diff --git a/test/CodeGen/R600/fcmp.ll b/test/CodeGen/R600/fcmp.ll
index dc3a779..c76a758 100644
--- a/test/CodeGen/R600/fcmp.ll
+++ b/test/CodeGen/R600/fcmp.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
; CHECK: @fcmp_sext
-; CHECK: SETE_DX10 * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: SETE_DX10 T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
define void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) {
entry:
diff --git a/test/CodeGen/R600/fcmp64.ll b/test/CodeGen/R600/fcmp64.ll
index 8f2513b..bcc7a8c 100644
--- a/test/CodeGen/R600/fcmp64.ll
+++ b/test/CodeGen/R600/fcmp64.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
; CHECK: @flt_f64
-; CHECK: V_CMP_LT_F64_e64 {{SGPR[0-9]+_SGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+; CHECK: V_CMP_LT_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define void @flt_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
@@ -14,7 +14,7 @@ define void @flt_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
}
; CHECK: @fle_f64
-; CHECK: V_CMP_LE_F64_e64 {{SGPR[0-9]+_SGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+; CHECK: V_CMP_LE_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define void @fle_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
@@ -27,7 +27,7 @@ define void @fle_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
}
; CHECK: @fgt_f64
-; CHECK: V_CMP_GT_F64_e64 {{SGPR[0-9]+_SGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+; CHECK: V_CMP_GT_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define void @fgt_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
@@ -40,7 +40,7 @@ define void @fgt_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
}
; CHECK: @fge_f64
-; CHECK: V_CMP_GE_F64_e64 {{SGPR[0-9]+_SGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+; CHECK: V_CMP_GE_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define void @fge_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
@@ -53,7 +53,7 @@ define void @fge_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
}
; CHECK: @fne_f64
-; CHECK: V_CMP_NEQ_F64_e64 {{SGPR[0-9]+_SGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+; CHECK: V_CMP_NEQ_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
@@ -66,7 +66,7 @@ define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
}
; CHECK: @feq_f64
-; CHECK: V_CMP_EQ_F64_e64 {{SGPR[0-9]+_SGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+; CHECK: V_CMP_EQ_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
define void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
diff --git a/test/CodeGen/R600/fconst64.ll b/test/CodeGen/R600/fconst64.ll
index 2402a9c..5c5ee7e 100644
--- a/test/CodeGen/R600/fconst64.ll
+++ b/test/CodeGen/R600/fconst64.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
; CHECK: @fconst_f64
-; CHECK: V_MOV_B32_e32 {{VGPR[0-9]+}}, 0.000000e+00
-; CHECK-NEXT: V_MOV_B32_e32 {{VGPR[0-9]+}}, 2.312500e+00
+; CHECK: V_MOV_B32_e32 {{v[0-9]+}}, 0.000000e+00
+; CHECK-NEXT: V_MOV_B32_e32 {{v[0-9]+}}, 2.312500e+00
define void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
%r1 = load double addrspace(1)* %in
diff --git a/test/CodeGen/R600/fdiv.ll b/test/CodeGen/R600/fdiv.ll
index 6798eac..3d21524 100644
--- a/test/CodeGen/R600/fdiv.ll
+++ b/test/CodeGen/R600/fdiv.ll
@@ -1,14 +1,20 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
; These tests check that fdiv is expanded correctly and also test that the
; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate
; instruction groups.
-; CHECK: @fdiv_v2f32
-; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
-; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
-; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
-; CHECK-DAG: MUL_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
+; R600-CHECK: @fdiv_v2f32
+; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
+; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
+; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
+; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS
+; SI-CHECK: @fdiv_v2f32
+; SI-CHECK-DAG: V_RCP_F32
+; SI-CHECK-DAG: V_MUL_F32
+; SI-CHECK-DAG: V_RCP_F32
+; SI-CHECK-DAG: V_MUL_F32
define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
entry:
%0 = fdiv <2 x float> %a, %b
@@ -16,16 +22,24 @@ entry:
ret void
}
-; CHECK: @fdiv_v4f32
-; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; CHECK-DAG: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-; CHECK-DAG: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
-
+; R600-CHECK: @fdiv_v4f32
+; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; R600-CHECK-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
+; SI-CHECK: @fdiv_v4f32
+; SI-CHECK-DAG: V_RCP_F32
+; SI-CHECK-DAG: V_MUL_F32
+; SI-CHECK-DAG: V_RCP_F32
+; SI-CHECK-DAG: V_MUL_F32
+; SI-CHECK-DAG: V_RCP_F32
+; SI-CHECK-DAG: V_MUL_F32
+; SI-CHECK-DAG: V_RCP_F32
+; SI-CHECK-DAG: V_MUL_F32
define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float> addrspace(1) * %in
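A note on the expansion described in the fdiv.ll comment above: R600 has no native floating-point divide, so each component of an fdiv is split into a reciprocal followed by a multiply. A rough pseudo-assembly sketch of what the checks match, with register names invented for illustration:

  ; %q = fdiv float %a, %b  becomes, per component:
  RECIP_IEEE  T0.X, %b         ; 1.0 / %b, a transcendental op
  MUL_IEEE    T1.X, %a, PS     ; %a * (1.0 / %b), reading the previous trans result via PS

Because the MUL_IEEE reads the RECIP_IEEE result through PS, the two instructions must land in different instruction groups, which is why the test doubles as a scheduler check.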
diff --git a/test/CodeGen/R600/fdiv64.ll b/test/CodeGen/R600/fdiv64.ll
index 76c5ca3..79b5c8b 100644
--- a/test/CodeGen/R600/fdiv64.ll
+++ b/test/CodeGen/R600/fdiv64.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
; CHECK: @fdiv_f64
-; CHECK: V_RCP_F64_e32 {{VGPR[0-9]+_VGPR[0-9]+}}
-; CHECK: V_MUL_F64 {{VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+; CHECK: V_RCP_F64_e32 {{v\[[0-9]+:[0-9]+\]}}
+; CHECK: V_MUL_F64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
diff --git a/test/CodeGen/R600/floor.ll b/test/CodeGen/R600/floor.ll
index 877d69a..67e86c4 100644
--- a/test/CodeGen/R600/floor.ll
+++ b/test/CodeGen/R600/floor.ll
@@ -2,15 +2,15 @@
;CHECK: FLOOR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @test() {
- %r0 = call float @llvm.R600.load.input(i32 0)
+define void @test(<4 x float> inreg %reg0) #0 {
+ %r0 = extractelement <4 x float> %reg0, i32 0
%r1 = call float @floor(float %r0)
- call void @llvm.AMDGPU.store.output(float %r1, i32 0)
+ %vec = insertelement <4 x float> undef, float %r1, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
-declare float @llvm.R600.load.input(i32) readnone
-
-declare void @llvm.AMDGPU.store.output(float, i32)
-
declare float @floor(float) readonly
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" } \ No newline at end of file
diff --git a/test/CodeGen/R600/fma.ll b/test/CodeGen/R600/fma.ll
new file mode 100644
index 0000000..51e9d29
--- /dev/null
+++ b/test/CodeGen/R600/fma.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
+
+; CHECK: @fma_f32
+; CHECK: V_FMA_F32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
+
+define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+ float addrspace(1)* %in2, float addrspace(1)* %in3) {
+ %r0 = load float addrspace(1)* %in1
+ %r1 = load float addrspace(1)* %in2
+ %r2 = load float addrspace(1)* %in3
+ %r3 = tail call float @llvm.fma.f32(float %r0, float %r1, float %r2)
+ store float %r3, float addrspace(1)* %out
+ ret void
+}
+
+declare float @llvm.fma.f32(float, float, float)
+
+; CHECK: @fma_f64
+; CHECK: V_FMA_F64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+
+define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+ double addrspace(1)* %in2, double addrspace(1)* %in3) {
+ %r0 = load double addrspace(1)* %in1
+ %r1 = load double addrspace(1)* %in2
+ %r2 = load double addrspace(1)* %in3
+ %r3 = tail call double @llvm.fma.f64(double %r0, double %r1, double %r2)
+ store double %r3, double addrspace(1)* %out
+ ret void
+}
+
+declare double @llvm.fma.f64(double, double, double)
diff --git a/test/CodeGen/R600/fmad.ll b/test/CodeGen/R600/fmad.ll
index 75e65d8..935e351 100644
--- a/test/CodeGen/R600/fmad.ll
+++ b/test/CodeGen/R600/fmad.ll
@@ -2,18 +2,18 @@
;CHECK: MULADD_IEEE * {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @test() {
- %r0 = call float @llvm.R600.load.input(i32 0)
- %r1 = call float @llvm.R600.load.input(i32 1)
- %r2 = call float @llvm.R600.load.input(i32 2)
+define void @test(<4 x float> inreg %reg0) #0 {
+ %r0 = extractelement <4 x float> %reg0, i32 0
+ %r1 = extractelement <4 x float> %reg0, i32 1
+ %r2 = extractelement <4 x float> %reg0, i32 2
%r3 = fmul float %r0, %r1
- %r4 = fadd float %r3, %r2
- call void @llvm.AMDGPU.store.output(float %r4, i32 0)
+ %r4 = fadd float %r3, %r2
+ %vec = insertelement <4 x float> undef, float %r4, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
-declare float @llvm.R600.load.input(i32) readnone
-
-declare void @llvm.AMDGPU.store.output(float, i32)
-
declare float @fabs(float ) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" } \ No newline at end of file
diff --git a/test/CodeGen/R600/fmax.ll b/test/CodeGen/R600/fmax.ll
index 8b704e5..d7127f4 100644
--- a/test/CodeGen/R600/fmax.ll
+++ b/test/CodeGen/R600/fmax.ll
@@ -2,15 +2,16 @@
;CHECK: MAX * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @test() {
- %r0 = call float @llvm.R600.load.input(i32 0)
- %r1 = call float @llvm.R600.load.input(i32 1)
- %r2 = fcmp uge float %r0, %r1
+define void @test(<4 x float> inreg %reg0) #0 {
+ %r0 = extractelement <4 x float> %reg0, i32 0
+ %r1 = extractelement <4 x float> %reg0, i32 1
+ %r2 = fcmp oge float %r0, %r1
%r3 = select i1 %r2, float %r0, float %r1
- call void @llvm.AMDGPU.store.output(float %r3, i32 0)
+ %vec = insertelement <4 x float> undef, float %r3, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
-declare float @llvm.R600.load.input(i32) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-declare void @llvm.AMDGPU.store.output(float, i32)
+attributes #0 = { "ShaderType"="0" } \ No newline at end of file
diff --git a/test/CodeGen/R600/fmin.ll b/test/CodeGen/R600/fmin.ll
index 5e34b7c..defa8c0 100644
--- a/test/CodeGen/R600/fmin.ll
+++ b/test/CodeGen/R600/fmin.ll
@@ -2,15 +2,16 @@
;CHECK: MIN * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @test() {
- %r0 = call float @llvm.R600.load.input(i32 0)
- %r1 = call float @llvm.R600.load.input(i32 1)
+define void @test(<4 x float> inreg %reg0) #0 {
+ %r0 = extractelement <4 x float> %reg0, i32 0
+ %r1 = extractelement <4 x float> %reg0, i32 1
%r2 = fcmp uge float %r0, %r1
%r3 = select i1 %r2, float %r1, float %r0
- call void @llvm.AMDGPU.store.output(float %r3, i32 0)
+ %vec = insertelement <4 x float> undef, float %r3, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
-declare float @llvm.R600.load.input(i32) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-declare void @llvm.AMDGPU.store.output(float, i32)
+attributes #0 = { "ShaderType"="0" } \ No newline at end of file
diff --git a/test/CodeGen/R600/fmul.ll b/test/CodeGen/R600/fmul.ll
index 6ef3a11..2a7825f 100644
--- a/test/CodeGen/R600/fmul.ll
+++ b/test/CodeGen/R600/fmul.ll
@@ -1,23 +1,27 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
-; CHECK: @fmul_f32
-; CHECK: MUL_IEEE * {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-define void @fmul_f32() {
- %r0 = call float @llvm.R600.load.input(i32 0)
- %r1 = call float @llvm.R600.load.input(i32 1)
- %r2 = fmul float %r0, %r1
- call void @llvm.AMDGPU.store.output(float %r2, i32 0)
- ret void
+; R600-CHECK: @fmul_f32
+; R600-CHECK: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
+; SI-CHECK: @fmul_f32
+; SI-CHECK: V_MUL_F32
+define void @fmul_f32(float addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fmul float %a, %b
+ store float %0, float addrspace(1)* %out
+ ret void
}
declare float @llvm.R600.load.input(i32) readnone
declare void @llvm.AMDGPU.store.output(float, i32)
-; CHECK: @fmul_v2f32
-; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW]}}
-; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW]}}
+; R600-CHECK: @fmul_v2f32
+; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
+; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
+; SI-CHECK: @fmul_v2f32
+; SI-CHECK: V_MUL_F32
+; SI-CHECK: V_MUL_F32
define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
entry:
%0 = fmul <2 x float> %a, %b
@@ -25,12 +29,16 @@ entry:
ret void
}
-; CHECK: @fmul_v4f32
-; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
+; R600-CHECK: @fmul_v4f32
+; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; SI-CHECK: @fmul_v4f32
+; SI-CHECK: V_MUL_F32
+; SI-CHECK: V_MUL_F32
+; SI-CHECK: V_MUL_F32
+; SI-CHECK: V_MUL_F32
define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float> addrspace(1) * %in
diff --git a/test/CodeGen/R600/fmul.v4f32.ll b/test/CodeGen/R600/fmul.v4f32.ll
deleted file mode 100644
index 74a58f7..0000000
--- a/test/CodeGen/R600/fmul.v4f32.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;CHECK: MUL_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-
-define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
- %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
- %a = load <4 x float> addrspace(1) * %in
- %b = load <4 x float> addrspace(1) * %b_ptr
- %result = fmul <4 x float> %a, %b
- store <4 x float> %result, <4 x float> addrspace(1)* %out
- ret void
-}
diff --git a/test/CodeGen/R600/fmul64.ll b/test/CodeGen/R600/fmul64.ll
index 8a57d4a..7c7bf04 100644
--- a/test/CodeGen/R600/fmul64.ll
+++ b/test/CodeGen/R600/fmul64.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
; CHECK: @fmul_f64
-; CHECK: V_MUL_F64 {{VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+; CHECK: V_MUL_F64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
diff --git a/test/CodeGen/R600/fmuladd.ll b/test/CodeGen/R600/fmuladd.ll
new file mode 100644
index 0000000..48944f6
--- /dev/null
+++ b/test/CodeGen/R600/fmuladd.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
+
+; CHECK: @fmuladd_f32
+; CHECK: V_MAD_F32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
+
+define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+ float addrspace(1)* %in2, float addrspace(1)* %in3) {
+ %r0 = load float addrspace(1)* %in1
+ %r1 = load float addrspace(1)* %in2
+ %r2 = load float addrspace(1)* %in3
+ %r3 = tail call float @llvm.fmuladd.f32(float %r0, float %r1, float %r2)
+ store float %r3, float addrspace(1)* %out
+ ret void
+}
+
+declare float @llvm.fmuladd.f32(float, float, float)
+
+; CHECK: @fmuladd_f64
+; CHECK: V_FMA_F64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
+
+define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+ double addrspace(1)* %in2, double addrspace(1)* %in3) {
+ %r0 = load double addrspace(1)* %in1
+ %r1 = load double addrspace(1)* %in2
+ %r2 = load double addrspace(1)* %in3
+ %r3 = tail call double @llvm.fmuladd.f64(double %r0, double %r1, double %r2)
+ store double %r3, double addrspace(1)* %out
+ ret void
+}
+
+declare double @llvm.fmuladd.f64(double, double, double)
diff --git a/test/CodeGen/R600/fneg.ll b/test/CodeGen/R600/fneg.ll
index 799db0c..9446aa8 100644
--- a/test/CodeGen/R600/fneg.ll
+++ b/test/CodeGen/R600/fneg.ll
@@ -1,8 +1,23 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
-; CHECK: @fneg_v2
-; CHECK: -PV
-; CHECK: -PV
+; R600-CHECK-LABEL: @fneg
+; R600-CHECK: -PV
+; SI-CHECK-LABEL: @fneg
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 0, 0, 0, 1
+define void @fneg(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = fsub float -0.000000e+00, %in
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK-LABEL: @fneg_v2
+; R600-CHECK: -PV
+; R600-CHECK: -PV
+; SI-CHECK-LABEL: @fneg_v2
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 0, 0, 0, 1
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 0, 0, 0, 1
define void @fneg_v2(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) {
entry:
%0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
@@ -10,11 +25,16 @@ entry:
ret void
}
-; CHECK: @fneg_v4
-; CHECK: -PV
-; CHECK: -PV
-; CHECK: -PV
-; CHECK: -PV
+; R600-CHECK-LABEL: @fneg_v4
+; R600-CHECK: -PV
+; R600-CHECK: -T
+; R600-CHECK: -PV
+; R600-CHECK: -PV
+; SI-CHECK-LABEL: @fneg_v4
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 0, 0, 0, 1
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 0, 0, 0, 1
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 0, 0, 0, 1
+; SI-CHECK: V_ADD_F32_e64 v{{[0-9]}}, s{{[0-9]}}, 0, 0, 0, 0, 1
define void @fneg_v4(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) {
entry:
%0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
@@ -26,9 +46,12 @@ entry:
; (fneg (f32 bitcast (i32 a))) => (f32 bitcast (xor (i32 a), 0x80000000))
; unless the target returns true for isNegFree()
-; CHECK-NOT: XOR
-; CHECK: -KC0[2].Z
-
+; R600-CHECK-LABEL: @fneg_free
+; R600-CHECK-NOT: XOR
+; R600-CHECK: -KC0[2].Z
+; SI-CHECK-LABEL: @fneg_free
+; XXX: We could use V_ADD_F32_e64 with the negate bit here instead.
+; SI-CHECK: V_SUB_F32_e64 v{{[0-9]}}, 0.000000e+00, s{{[0-9]}}, 0, 0, 0, 0
define void @fneg_free(float addrspace(1)* %out, i32 %in) {
entry:
%0 = bitcast i32 %in to float
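The fneg_free case above mirrors fabs_free: unless isNegFree() returns true, the combiner negates through the integer domain by flipping the sign bit. A minimal sketch of the two forms, again with a hypothetical argument %a:

  ; before the combine
  define float @fneg_of_bitcast(i32 %a) {
    %f = bitcast i32 %a to float
    %r = fsub float -0.000000e+00, %f
    ret float %r
  }

  ; after the combine on a target without free fneg
  define float @fneg_combined(i32 %a) {
    %m = xor i32 %a, -2147483648     ; 0x80000000 flips only the sign bit
    %r = bitcast i32 %m to float
    ret float %r
  }

The R600 checks assert the XOR is not emitted because a negated source operand (-KC0[2].Z) is free; the SI checks currently accept a V_SUB_F32, as the XXX comment notes.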
diff --git a/test/CodeGen/R600/fp64_to_sint.ll b/test/CodeGen/R600/fp64_to_sint.ll
new file mode 100644
index 0000000..185e21c
--- /dev/null
+++ b/test/CodeGen/R600/fp64_to_sint.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
+
+; CHECK: @fp64_to_sint
+; CHECK: V_CVT_I32_F64_e32
+define void @fp64_to_sint(i32 addrspace(1)* %out, double %in) {
+ %result = fptosi double %in to i32
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/fp_to_sint.ll b/test/CodeGen/R600/fp_to_sint.ll
index 6471270..8302b4f 100644
--- a/test/CodeGen/R600/fp_to_sint.ll
+++ b/test/CodeGen/R600/fp_to_sint.ll
@@ -1,9 +1,9 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
; R600-CHECK: @fp_to_sint_v2i32
-; R600-CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; R600-CHECK: FLT_TO_INT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
; SI-CHECK: @fp_to_sint_v2i32
; SI-CHECK: V_CVT_I32_F32_e32
; SI-CHECK: V_CVT_I32_F32_e32
@@ -14,10 +14,10 @@ define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
}
; R600-CHECK: @fp_to_sint_v4i32
-; R600-CHECK: FLT_TO_INT {{[* ]*}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; R600-CHECK: FLT_TO_INT {{[* ]*}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; R600-CHECK: FLT_TO_INT {{[* ]*}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; R600-CHECK: FLT_TO_INT {{[* ]*}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW]}}
+; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
; SI-CHECK: @fp_to_sint_v4i32
; SI-CHECK: V_CVT_I32_F32_e32
; SI-CHECK: V_CVT_I32_F32_e32
diff --git a/test/CodeGen/R600/fp_to_uint.ll b/test/CodeGen/R600/fp_to_uint.ll
index 2a365f9..77db43b 100644
--- a/test/CodeGen/R600/fp_to_uint.ll
+++ b/test/CodeGen/R600/fp_to_uint.ll
@@ -1,8 +1,12 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
-; CHECK: @fp_to_uint_v2i32
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: @fp_to_uint_v2i32
+; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; SI-CHECK: @fp_to_uint_v2i32
+; SI-CHECK: V_CVT_U32_F32_e32
+; SI-CHECK: V_CVT_U32_F32_e32
define void @fp_to_uint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
%result = fptoui <2 x float> %in to <2 x i32>
@@ -10,11 +14,16 @@ define void @fp_to_uint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
ret void
}
-; CHECK: @fp_to_uint_v4i32
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-; CHECK: FLT_TO_UINT * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: @fp_to_uint_v4i32
+; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; R600-CHECK: FLT_TO_UINT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
+; SI-CHECK: @fp_to_uint_v4i32
+; SI-CHECK: V_CVT_U32_F32_e32
+; SI-CHECK: V_CVT_U32_F32_e32
+; SI-CHECK: V_CVT_U32_F32_e32
+; SI-CHECK: V_CVT_U32_F32_e32
define void @fp_to_uint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%value = load <4 x float> addrspace(1) * %in
diff --git a/test/CodeGen/R600/fpext.ll b/test/CodeGen/R600/fpext.ll
new file mode 100644
index 0000000..143ee79
--- /dev/null
+++ b/test/CodeGen/R600/fpext.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
+
+; CHECK: @fpext
+; CHECK: V_CVT_F64_F32_e32
+define void @fpext(double addrspace(1)* %out, float %in) {
+ %result = fpext float %in to double
+ store double %result, double addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/fptrunc.ll b/test/CodeGen/R600/fptrunc.ll
new file mode 100644
index 0000000..20a8c00
--- /dev/null
+++ b/test/CodeGen/R600/fptrunc.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
+
+; CHECK: @fptrunc
+; CHECK: V_CVT_F32_F64_e32
+define void @fptrunc(float addrspace(1)* %out, double %in) {
+ %result = fptrunc double %in to float
+ store float %result, float addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/fsqrt.ll b/test/CodeGen/R600/fsqrt.ll
index 2613805..ae50b17 100644
--- a/test/CodeGen/R600/fsqrt.ll
+++ b/test/CodeGen/R600/fsqrt.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
; CHECK: @fsqrt_f32
-; CHECK: V_SQRT_F32_e32 {{VGPR[0-9]+, VGPR[0-9]+}}
+; CHECK: V_SQRT_F32_e32 {{v[0-9]+, v[0-9]+}}
define void @fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%r0 = load float addrspace(1)* %in
@@ -11,7 +11,7 @@ define void @fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
}
; CHECK: @fsqrt_f64
-; CHECK: V_SQRT_F64_e32 {{VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}
+; CHECK: V_SQRT_F64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
define void @fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
%r0 = load double addrspace(1)* %in
diff --git a/test/CodeGen/R600/fsub.ll b/test/CodeGen/R600/fsub.ll
index 0fc5860..4f74efb 100644
--- a/test/CodeGen/R600/fsub.ll
+++ b/test/CodeGen/R600/fsub.ll
@@ -1,23 +1,27 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
-; CHECK: @fsub_f32
-; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
-
-define void @fsub_f32() {
- %r0 = call float @llvm.R600.load.input(i32 0)
- %r1 = call float @llvm.R600.load.input(i32 1)
- %r2 = fsub float %r0, %r1
- call void @llvm.AMDGPU.store.output(float %r2, i32 0)
- ret void
+; R600-CHECK: @fsub_f32
+; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, -KC0[2].W
+; SI-CHECK: @fsub_f32
+; SI-CHECK: V_SUB_F32
+define void @fsub_f32(float addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fsub float %a, %b
+ store float %0, float addrspace(1)* %out
+ ret void
}
declare float @llvm.R600.load.input(i32) readnone
declare void @llvm.AMDGPU.store.output(float, i32)
-; CHECK: @fsub_v2f32
-; CHECK-DAG: ADD * T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z
-; CHECK-DAG: ADD * T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y
+; R600-CHECK: @fsub_v2f32
+; R600-CHECK-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z
+; R600-CHECK-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y
+; SI-CHECK: @fsub_v2f32
+; SI-CHECK: V_SUB_F32
+; SI-CHECK: V_SUB_F32
define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
entry:
%0 = fsub <2 x float> %a, %b
@@ -25,11 +29,16 @@ entry:
ret void
}
-; CHECK: @fsub_v4f32
-; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
-; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
-; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
-; CHECK: ADD * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; R600-CHECK: @fsub_v4f32
+; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; R600-CHECK: ADD {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+; SI-CHECK: @fsub_v4f32
+; SI-CHECK: V_SUB_F32
+; SI-CHECK: V_SUB_F32
+; SI-CHECK: V_SUB_F32
+; SI-CHECK: V_SUB_F32
define void @fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
%a = load <4 x float> addrspace(1) * %in
diff --git a/test/CodeGen/R600/fsub64.ll b/test/CodeGen/R600/fsub64.ll
index fa59dcc..1445a20 100644
--- a/test/CodeGen/R600/fsub64.ll
+++ b/test/CodeGen/R600/fsub64.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
; CHECK: @fsub_f64
-; CHECK: V_ADD_F64 {{VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+, VGPR[0-9]+_VGPR[0-9]+}}, 0, 0, 0, 0, 2
+; CHECK: V_ADD_F64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}, 0, 0, 0, 0, 2
define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2) {
diff --git a/test/CodeGen/R600/gep-address-space.ll b/test/CodeGen/R600/gep-address-space.ll
new file mode 100644
index 0000000..4ea21dd
--- /dev/null
+++ b/test/CodeGen/R600/gep-address-space.ll
@@ -0,0 +1,40 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck %s
+
+define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind {
+; CHECK-LABEL: @use_gep_address_space:
+; CHECK: S_ADD_I32
+ %p = getelementptr [1024 x i32] addrspace(3)* %array, i16 0, i16 16
+ store i32 99, i32 addrspace(3)* %p
+ ret void
+}
+
+define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind {
+; CHECK-LABEL: @gep_as_vector_v4:
+; CHECK: S_ADD_I32
+; CHECK: S_ADD_I32
+; CHECK: S_ADD_I32
+; CHECK: S_ADD_I32
+ %p = getelementptr <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
+ %p0 = extractelement <4 x i32 addrspace(3)*> %p, i32 0
+ %p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1
+ %p2 = extractelement <4 x i32 addrspace(3)*> %p, i32 2
+ %p3 = extractelement <4 x i32 addrspace(3)*> %p, i32 3
+ store i32 99, i32 addrspace(3)* %p0
+ store i32 99, i32 addrspace(3)* %p1
+ store i32 99, i32 addrspace(3)* %p2
+ store i32 99, i32 addrspace(3)* %p3
+ ret void
+}
+
+define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind {
+; CHECK-LABEL: @gep_as_vector_v2:
+; CHECK: S_ADD_I32
+; CHECK: S_ADD_I32
+ %p = getelementptr <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> <i16 16, i16 16>
+ %p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0
+ %p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1
+ store i32 99, i32 addrspace(3)* %p0
+ store i32 99, i32 addrspace(3)* %p1
+ ret void
+}
+
diff --git a/test/CodeGen/R600/icmp-select-sete-reverse-args.ll b/test/CodeGen/R600/icmp-select-sete-reverse-args.ll
index e3005fe..71705a6 100644
--- a/test/CodeGen/R600/icmp-select-sete-reverse-args.ll
+++ b/test/CodeGen/R600/icmp-select-sete-reverse-args.ll
@@ -3,7 +3,7 @@
;Test that a select with reversed True/False values is correctly lowered
;to a SETNE_INT. There should only be one SETNE_INT instruction.
-;CHECK: SETNE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: SETNE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;CHECK-NOT: SETNE_INT
define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
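To spell out the fold this test relies on: a select whose true/false operands are swapped relative to its compare is the same as selecting on the inverted predicate. A small illustrative function (names are made up, not taken from the test):

  define i32 @select_reversed(i32 %a, i32 %b, i32 %x, i32 %y) {
    %c = icmp eq i32 %a, %b
    %r = select i1 %c, i32 %y, i32 %x   ; operands reversed w.r.t. the compare
    ret i32 %r
  }

This is equivalent to comparing for inequality and selecting (%x, %y) in the natural order, so the backend can emit a single SETNE_INT rather than a compare plus an extra conditional move, which is what the CHECK/CHECK-NOT pair verifies.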
diff --git a/test/CodeGen/R600/imm.ll b/test/CodeGen/R600/imm.ll
index 979efb0..b047315 100644
--- a/test/CodeGen/R600/imm.ll
+++ b/test/CodeGen/R600/imm.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
; Use a 64-bit value with lo bits that can be represented as an inline constant
; CHECK: @i64_imm_inline_lo
-; CHECK: S_MOV_B32 [[LO:SGPR[0-9]+]], 5
-; CHECK: V_MOV_B32_e32 [[LO_VGPR:VGPR[0-9]+]], [[LO]]
-; CHECK: BUFFER_STORE_DWORDX2 [[LO_VGPR]]_
+; CHECK: S_MOV_B32 [[LO:s[0-9]+]], 5
+; CHECK: V_MOV_B32_e32 v[[LO_VGPR:[0-9]+]], [[LO]]
+; CHECK: BUFFER_STORE_DWORDX2 v{{\[}}[[LO_VGPR]]:
define void @i64_imm_inline_lo(i64 addrspace(1) *%out) {
entry:
store i64 1311768464867721221, i64 addrspace(1) *%out ; 0x1234567800000005
@@ -13,9 +13,9 @@ entry:
; Use a 64-bit value with hi bits that can be represented as an inline constant
; CHECK: @i64_imm_inline_hi
-; CHECK: S_MOV_B32 [[HI:SGPR[0-9]+]], 5
-; CHECK: V_MOV_B32_e32 [[HI_VGPR:VGPR[0-9]+]], [[HI]]
-; CHECK: BUFFER_STORE_DWORDX2 {{VGPR[0-9]+}}_[[HI_VGPR]]
+; CHECK: S_MOV_B32 [[HI:s[0-9]+]], 5
+; CHECK: V_MOV_B32_e32 v[[HI_VGPR:[0-9]+]], [[HI]]
+; CHECK: BUFFER_STORE_DWORDX2 v{{\[[0-9]+:}}[[HI_VGPR]]
define void @i64_imm_inline_hi(i64 addrspace(1) *%out) {
entry:
store i64 21780256376, i64 addrspace(1) *%out ; 0x0000000512345678
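The two constants in imm.ll are chosen so that exactly one 32-bit half fits SI's inline-constant range. Working through the values: 1311768464867721221 is 0x1234567800000005, so the low half is 5 (an inline constant, hence the plain S_MOV_B32 ..., 5 in the checks) while the high half 0x12345678 needs a 32-bit literal; 21780256376 is 0x0000000512345678, so the roles of the two halves are swapped.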
diff --git a/test/CodeGen/R600/indirect-addressing-si.ll b/test/CodeGen/R600/indirect-addressing-si.ll
index ba5de22..169d69b 100644
--- a/test/CodeGen/R600/indirect-addressing-si.ll
+++ b/test/CodeGen/R600/indirect-addressing-si.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
; Tests for indirect addressing on SI, which is implemented using dynamic
; indexing of vectors.
; CHECK: extract_w_offset
-; CHECK: S_MOV_B32 M0
+; CHECK: S_MOV_B32 m0
; CHECK-NEXT: V_MOVRELS_B32_e32
define void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
entry:
@@ -15,7 +15,7 @@ entry:
}
; CHECK: extract_wo_offset
-; CHECK: S_MOV_B32 M0
+; CHECK: S_MOV_B32 m0
; CHECK-NEXT: V_MOVRELS_B32_e32
define void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
entry:
@@ -25,7 +25,7 @@ entry:
}
; CHECK: insert_w_offset
-; CHECK: S_MOV_B32 M0
+; CHECK: S_MOV_B32 m0
; CHECK-NEXT: V_MOVRELD_B32_e32
define void @insert_w_offset(float addrspace(1)* %out, i32 %in) {
entry:
@@ -37,7 +37,7 @@ entry:
}
; CHECK: insert_wo_offset
-; CHECK: S_MOV_B32 M0
+; CHECK: S_MOV_B32 m0
; CHECK-NEXT: V_MOVRELD_B32_e32
define void @insert_wo_offset(float addrspace(1)* %out, i32 %in) {
entry:
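For context on the "dynamic indexing of vectors" mentioned above: SI has no directly register-indexed access to a vector element, so the backend materialises the index in the m0 register and then uses the relative-move instructions, which address a VGPR at (base + m0). A rough sketch of the lowering for a dynamic extractelement, with made-up register numbers:

  S_MOV_B32         m0, s4     ; s4 is assumed to hold the runtime index
  V_MOVRELS_B32_e32 v0, v2     ; reads v[2 + m0]

Dynamic inserts use V_MOVRELD_B32_e32 instead, which writes v[dst + m0]; that split is what the S_MOV_B32 m0 / V_MOVRELS / V_MOVRELD CHECK-NEXT pairs in the hunks above are matching.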
diff --git a/test/CodeGen/R600/insert_vector_elt.ll b/test/CodeGen/R600/insert_vector_elt.ll
new file mode 100644
index 0000000..05aecce
--- /dev/null
+++ b/test/CodeGen/R600/insert_vector_elt.ll
@@ -0,0 +1,16 @@
+; XFAIL: *
+; RUN: llc < %s -march=r600 -mcpu=redwood -o %t
+
+define void @var_insert(<4 x i32> addrspace(1)* %out, <4 x i32> %x, i32 %val, i32 %idx) nounwind {
+entry:
+ %tmp3 = insertelement <4 x i32> %x, i32 %val, i32 %idx ; <<4 x i32>> [#uses=1]
+ store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+define void @var_extract(i32 addrspace(1)* %out, <4 x i32> %x, i32 %idx) nounwind {
+entry:
+ %tmp3 = extractelement <4 x i32> %x, i32 %idx ; <<i32>> [#uses=1]
+ store i32 %tmp3, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/jump-address.ll b/test/CodeGen/R600/jump-address.ll
index 26c298b..ae9c8bb 100644
--- a/test/CodeGen/R600/jump-address.ll
+++ b/test/CodeGen/R600/jump-address.ll
@@ -1,6 +1,6 @@
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-; CHECK: JUMP @5
+; CHECK: JUMP @3
; CHECK: EXPORT
; CHECK-NOT: EXPORT
diff --git a/test/CodeGen/R600/kcache-fold.ll b/test/CodeGen/R600/kcache-fold.ll
index 3d70e4b..0baa3cd 100644
--- a/test/CodeGen/R600/kcache-fold.ll
+++ b/test/CodeGen/R600/kcache-fold.ll
@@ -1,7 +1,7 @@
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
; CHECK: @main1
-; CHECK: MOV T{{[0-9]+\.[XYZW], KC0}}
+; CHECK: MOV * T{{[0-9]+\.[XYZW], KC0}}
define void @main1() {
main_body:
%0 = load <4 x float> addrspace(8)* null
@@ -10,7 +10,7 @@ main_body:
%3 = extractelement <4 x float> %2, i32 0
%4 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
%5 = extractelement <4 x float> %4, i32 0
- %6 = fcmp ult float %1, 0.000000e+00
+ %6 = fcmp ogt float %1, 0.000000e+00
%7 = select i1 %6, float %3, float %5
%8 = load <4 x float> addrspace(8)* null
%9 = extractelement <4 x float> %8, i32 1
@@ -18,7 +18,7 @@ main_body:
%11 = extractelement <4 x float> %10, i32 1
%12 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
%13 = extractelement <4 x float> %12, i32 1
- %14 = fcmp ult float %9, 0.000000e+00
+ %14 = fcmp ogt float %9, 0.000000e+00
%15 = select i1 %14, float %11, float %13
%16 = load <4 x float> addrspace(8)* null
%17 = extractelement <4 x float> %16, i32 2
@@ -26,7 +26,7 @@ main_body:
%19 = extractelement <4 x float> %18, i32 2
%20 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
%21 = extractelement <4 x float> %20, i32 2
- %22 = fcmp ult float %17, 0.000000e+00
+ %22 = fcmp ogt float %17, 0.000000e+00
%23 = select i1 %22, float %19, float %21
%24 = load <4 x float> addrspace(8)* null
%25 = extractelement <4 x float> %24, i32 3
@@ -34,7 +34,7 @@ main_body:
%27 = extractelement <4 x float> %26, i32 3
%28 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
%29 = extractelement <4 x float> %28, i32 3
- %30 = fcmp ult float %25, 0.000000e+00
+ %30 = fcmp ogt float %25, 0.000000e+00
%31 = select i1 %30, float %27, float %29
%32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00)
%33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00)
@@ -58,7 +58,7 @@ main_body:
%3 = extractelement <4 x float> %2, i32 0
%4 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
%5 = extractelement <4 x float> %4, i32 1
- %6 = fcmp ult float %1, 0.000000e+00
+ %6 = fcmp ogt float %1, 0.000000e+00
%7 = select i1 %6, float %3, float %5
%8 = load <4 x float> addrspace(8)* null
%9 = extractelement <4 x float> %8, i32 1
@@ -66,7 +66,7 @@ main_body:
%11 = extractelement <4 x float> %10, i32 0
%12 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
%13 = extractelement <4 x float> %12, i32 1
- %14 = fcmp ult float %9, 0.000000e+00
+ %14 = fcmp ogt float %9, 0.000000e+00
%15 = select i1 %14, float %11, float %13
%16 = load <4 x float> addrspace(8)* null
%17 = extractelement <4 x float> %16, i32 2
@@ -74,7 +74,7 @@ main_body:
%19 = extractelement <4 x float> %18, i32 3
%20 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
%21 = extractelement <4 x float> %20, i32 2
- %22 = fcmp ult float %17, 0.000000e+00
+ %22 = fcmp ogt float %17, 0.000000e+00
%23 = select i1 %22, float %19, float %21
%24 = load <4 x float> addrspace(8)* null
%25 = extractelement <4 x float> %24, i32 3
@@ -82,7 +82,7 @@ main_body:
%27 = extractelement <4 x float> %26, i32 3
%28 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
%29 = extractelement <4 x float> %28, i32 2
- %30 = fcmp ult float %25, 0.000000e+00
+ %30 = fcmp ogt float %25, 0.000000e+00
%31 = select i1 %30, float %27, float %29
%32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00)
%33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00)
diff --git a/test/CodeGen/R600/kernel-args.ll b/test/CodeGen/R600/kernel-args.ll
new file mode 100644
index 0000000..247e316
--- /dev/null
+++ b/test/CodeGen/R600/kernel-args.ll
@@ -0,0 +1,455 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+
+; EG-CHECK-LABEL: @i8_arg
+; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI-CHECK-LABEL: @i8_arg
+; SI-CHECK: BUFFER_LOAD_UBYTE
+
+define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
+entry:
+ %0 = zext i8 %in to i32
+ store i32 %0, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; EG-CHECK-LABEL: @i8_zext_arg
+; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI-CHECK-LABEL: @i8_zext_arg
+; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 11
+
+define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
+entry:
+ %0 = zext i8 %in to i32
+ store i32 %0, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; EG-CHECK-LABEL: @i8_sext_arg
+; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI-CHECK-LABEL: @i8_sext_arg
+; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 11
+
+define void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
+entry:
+ %0 = sext i8 %in to i32
+ store i32 %0, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; EG-CHECK-LABEL: @i16_arg
+; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI-CHECK-LABEL: @i16_arg
+; SI-CHECK: BUFFER_LOAD_USHORT
+
+define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
+entry:
+ %0 = zext i16 %in to i32
+ store i32 %0, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; EG-CHECK-LABEL: @i16_zext_arg
+; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI-CHECK-LABEL: @i16_zext_arg
+; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 11
+
+define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
+entry:
+ %0 = zext i16 %in to i32
+ store i32 %0, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; EG-CHECK-LABEL: @i16_sext_arg
+; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; SI-CHECK-LABEL: @i16_sext_arg
+; SI-CHECK: S_LOAD_DWORD s{{[0-9]}}, s[0:1], 11
+
+define void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
+entry:
+ %0 = sext i16 %in to i32
+ store i32 %0, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; EG-CHECK-LABEL: @i32_arg
+; EG-CHECK: T{{[0-9]\.[XYZW]}}, KC0[2].Z
+; SI-CHECK-LABEL: @i32_arg
+; S_LOAD_DWORD s{{[0-9]}}, s[0:1], 11
+define void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
+entry:
+ store i32 %in, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; EG-CHECK-LABEL: @f32_arg
+; EG-CHECK: T{{[0-9]\.[XYZW]}}, KC0[2].Z
+; SI-CHECK-LABEL: @f32_arg
+; S_LOAD_DWORD s{{[0-9]}}, s[0:1], 11
+define void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
+entry:
+ store float %in, float addrspace(1)* %out, align 4
+ ret void
+}
+
+; EG-CHECK-LABEL: @v2i8_arg
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; SI-CHECK-LABEL: @v2i8_arg
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+define void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
+entry:
+ store <2 x i8> %in, <2 x i8> addrspace(1)* %out
+ ret void
+}
+
+; EG-CHECK-LABEL: @v2i16_arg
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; SI-CHECK-LABEL: @v2i16_arg
+; SI-CHECK-DAG: BUFFER_LOAD_USHORT
+; SI-CHECK-DAG: BUFFER_LOAD_USHORT
+define void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
+entry:
+ store <2 x i16> %in, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; EG-CHECK-LABEL: @v2i32_arg
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
+; SI-CHECK-LABEL: @v2i32_arg
+; SI-CHECK: S_LOAD_DWORDX2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 11
+define void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
+entry:
+ store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
+ ret void
+}
+
+; EG-CHECK-LABEL: @v2f32_arg
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
+; SI-CHECK-LABEL: @v2f32_arg
+; SI-CHECK: S_LOAD_DWORDX2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 11
+define void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
+entry:
+ store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
+ ret void
+}
+
+; EG-CHECK-LABEL: @v3i8_arg
+; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
+; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
+; VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
+; SI-CHECK-LABEL: @v3i8_arg
+define void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
+entry:
+ store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
+ ret void
+}
+
+; EG-CHECK-LABEL: @v3i16_arg
+; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
+; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
+; VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
+; SI-CHECK-LABEL: @v3i16_arg
+define void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
+entry:
+ store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+; EG-CHECK-LABEL: @v3i32_arg
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
+; SI-CHECK-LABEL: @v3i32_arg
+; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 13
+define void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
+entry:
+ store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
+ ret void
+}
+
+; EG-CHECK-LABEL: @v3f32_arg
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
+; SI-CHECK-LABEL: @v3f32_arg
+; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 13
+define void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
+entry:
+ store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
+ ret void
+}
+
+; EG-CHECK-LABEL: @v4i8_arg
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; SI-CHECK-LABEL: @v4i8_arg
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+define void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
+entry:
+ store <4 x i8> %in, <4 x i8> addrspace(1)* %out
+ ret void
+}
+
+; EG-CHECK-LABEL: @v4i16_arg
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; SI-CHECK-LABEL: @v4i16_arg
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+define void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
+entry:
+ store <4 x i16> %in, <4 x i16> addrspace(1)* %out
+ ret void
+}
+
+; EG-CHECK-LABEL: @v4i32_arg
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
+; SI-CHECK-LABEL: @v4i32_arg
+; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 13
+define void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
+entry:
+ store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
+ ret void
+}
+
+; EG-CHECK-LABEL: @v4f32_arg
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
+; SI-CHECK-LABEL: @v4f32_arg
+; SI-CHECK: S_LOAD_DWORDX4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 13
+define void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
+entry:
+ store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
+ ret void
+}
+
+; EG-CHECK-LABEL: @v8i8_arg
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; SI-CHECK-LABEL: @v8i8_arg
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+define void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
+entry:
+ store <8 x i8> %in, <8 x i8> addrspace(1)* %out
+ ret void
+}
+
+; EG-CHECK-LABEL: @v8i16_arg
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; SI-CHECK-LABEL: @v8i16_arg
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+define void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
+entry:
+ store <8 x i16> %in, <8 x i16> addrspace(1)* %out
+ ret void
+}
+
+; EG-CHECK-LABEL: @v8i32_arg
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
+; SI-CHECK-LABEL: @v8i32_arg
+; SI-CHECK: S_LOAD_DWORDX8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 17
+define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
+entry:
+ store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
+ ret void
+}
+
+; EG-CHECK-LABEL: @v8f32_arg
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
+; SI-CHECK-LABEL: @v8f32_arg
+; SI-CHECK: S_LOAD_DWORDX8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 17
+define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
+entry:
+ store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
+ ret void
+}
+
+; EG-CHECK-LABEL: @v16i8_arg
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; EG-CHECK: VTX_READ_8
+; SI-CHECK-LABEL: @v16i8_arg
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+define void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
+entry:
+ store <16 x i8> %in, <16 x i8> addrspace(1)* %out
+ ret void
+}
+
+; EG-CHECK-LABEL: @v16i16_arg
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; EG-CHECK: VTX_READ_16
+; SI-CHECK-LABEL: @v16i16_arg
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+define void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
+entry:
+ store <16 x i16> %in, <16 x i16> addrspace(1)* %out
+ ret void
+}
+
+; EG-CHECK-LABEL: @v16i32_arg
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
+; SI-CHECK-LABEL: @v16i32_arg
+; SI-CHECK: S_LOAD_DWORDX16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 25
+define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
+entry:
+ store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
+ ret void
+}
+
+; EG-CHECK-LABEL: @v16f32_arg
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
+; EG-CHECK-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
+; SI-CHECK-LABEL: @v16f32_arg
+; SI-CHECK: S_LOAD_DWORDX16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 25
+define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
+entry:
+ store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
+ ret void
+}
diff --git a/test/CodeGen/R600/lds-output-queue.ll b/test/CodeGen/R600/lds-output-queue.ll
new file mode 100644
index 0000000..63a4332
--- /dev/null
+++ b/test/CodeGen/R600/lds-output-queue.ll
@@ -0,0 +1,99 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s
+;
+; This test checks that the lds input queue is empty at the end of
+; the ALU clause.
+
+; CHECK-LABEL: @lds_input_queue
+; CHECK: LDS_READ_RET * OQAP
+; CHECK-NOT: ALU clause
+; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
+
+@local_mem = internal addrspace(3) unnamed_addr global [2 x i32] [i32 1, i32 2], align 4
+
+define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) {
+entry:
+ %0 = getelementptr inbounds [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
+ %1 = load i32 addrspace(3)* %0
+ call void @llvm.AMDGPU.barrier.local()
+
+ ; This will start a new clause for the vertex fetch
+ %2 = load i32 addrspace(1)* %in
+ %3 = add i32 %1, %2
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
+
+declare void @llvm.AMDGPU.barrier.local()
+
+; The machine scheduler does not do proper alias analysis and assumes that
+; loads from global values (Note that a global value is different from a
+; value from global memory. A global value is a value that is declared
+; outside of a function; it can reside in any address space) alias with
+; all other loads.
+;
+; This is a problem for scheduling the reads from the local data share (lds).
+; These reads are implemented using two instructions. The first copies the
+; data from lds into the lds output queue, and the second moves the data from
+; the input queue into main memory. These two instructions don't have to be
+; scheduled one after the other, but they do need to be scheduled in the same
+; clause. The aliasing problem mentioned above causes problems when there is a
+; load from global memory which immediately follows a load from a global value that
+; has been declared in the local memory space:
+;
+; %0 = getelementptr inbounds [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
+; %1 = load i32 addrspace(3)* %0
+; %2 = load i32 addrspace(1)* %in
+;
+; The instruction selection phase will generate ISA that looks like this:
+; %OQAP = LDS_READ_RET
+; %vreg0 = MOV %OQAP
+; %vreg1 = VTX_READ_32
+; %vreg2 = ADD_INT %vreg1, %vreg0
+;
+; The bottom scheduler will schedule the two ALU instructions first:
+;
+; UNSCHEDULED:
+; %OQAP = LDS_READ_RET
+; %vreg1 = VTX_READ_32
+;
+; SCHEDULED:
+;
+; %vreg0 = MOV %OQAP
+; %vreg2 = ADD_INT %vreg1, %vreg0
+;
+; The lack of proper aliasing causes the local memory read (LDS_READ_RET)
+; to be treated as having a chain dependency on the global memory read
+; (VTX_READ_32), so the global memory read will always be scheduled first.
+; This will give us a final program which looks like this:
+;
+; Alu clause:
+; %OQAP = LDS_READ_RET
+; VTX clause:
+; %vreg1 = VTX_READ_32
+; Alu clause:
+; %vreg0 = MOV %OQAP
+; %vreg2 = ADD_INT %vreg1, %vreg0
+;
+; This is an illegal program because the OQAP def and use now occur in
+; different ALU clauses.
+;
+; This test checks this scenario and makes sure it doesn't result in an
+; illegal program. For now, we have fixed this issue by merging the
+; LDS_READ_RET and MOV together during instruction selection and then
+; expanding them after scheduling. Once the scheduler has better alias
+; analysis, we should be able to keep these instructions separate before
+; scheduling.
+;
+; CHECK-LABEL: @local_global_alias
+; CHECK: LDS_READ_RET
+; CHECK-NOT: ALU clause
+; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
+define void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+ %0 = getelementptr inbounds [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0
+ %1 = load i32 addrspace(3)* %0
+ %2 = load i32 addrspace(1)* %in
+ %3 = add i32 %2, %1
+ store i32 %3, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/lds-size.ll b/test/CodeGen/R600/lds-size.ll
new file mode 100644
index 0000000..2185180
--- /dev/null
+++ b/test/CodeGen/R600/lds-size.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; This test makes sure we do not double count global values when they are
+; used in different basic blocks.
+
+; CHECK-LABEL: @test
+; CHECK: .long 166120
+; CHECK-NEXT: .long 1
+@lds = internal addrspace(3) unnamed_addr global i32 zeroinitializer, align 4
+
+define void @test(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %0 = icmp eq i32 %cond, 0
+ br i1 %0, label %if, label %else
+
+if:
+ store i32 1, i32 addrspace(3)* @lds
+ br label %endif
+
+else:
+ store i32 2, i32 addrspace(3)* @lds
+ br label %endif
+
+endif:
+ ret void
+}
diff --git a/test/CodeGen/R600/lit.local.cfg b/test/CodeGen/R600/lit.local.cfg
index 36ee493..2d8930a 100644
--- a/test/CodeGen/R600/lit.local.cfg
+++ b/test/CodeGen/R600/lit.local.cfg
@@ -1,13 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
-def getRoot(config):
- if not config.parent:
- return config
- return getRoot(config.parent)
-
-root = getRoot(config)
-
-targets = set(root.targets_to_build.split())
+targets = set(config.root.targets_to_build.split())
if not 'R600' in targets:
config.unsupported = True
-
diff --git a/test/CodeGen/R600/literals.ll b/test/CodeGen/R600/literals.ll
index 77b168e..47191e0 100644
--- a/test/CodeGen/R600/literals.ll
+++ b/test/CodeGen/R600/literals.ll
@@ -7,7 +7,8 @@
; ADD_INT literal.x KC0[2].Z, 5
; CHECK: @i32_literal
-; CHECK: ADD_INT * T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
+; CHECK: ADD_INT {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
+; CHECK-NEXT: LSHR
; CHECK-NEXT: 5
define void @i32_literal(i32 addrspace(1)* %out, i32 %in) {
entry:
@@ -23,7 +24,8 @@ entry:
; ADD literal.x KC0[2].Z, 5.0
; CHECK: @float_literal
-; CHECK: ADD * T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
+; CHECK: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
+; CHECK-NEXT: LSHR
; CHECK-NEXT: 1084227584(5.0
define void @float_literal(float addrspace(1)* %out, float %in) {
entry:
@@ -31,3 +33,32 @@ entry:
store float %0, float addrspace(1)* %out
ret void
}
+
+; Make sure inline literals are folded into REG_SEQUENCE instructions.
+; CHECK: @inline_literal_reg_sequence
+; CHECK: MOV {{\** *}}T[[GPR:[0-9]]].X, 0.0
+; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Y, 0.0
+; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Z, 0.0
+; CHECK-NEXT: MOV {{\** *}}T[[GPR]].W, 0.0
+
+define void @inline_literal_reg_sequence(<4 x i32> addrspace(1)* %out) {
+entry:
+ store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; CHECK: @inline_literal_dot4
+; CHECK: DOT4 T[[GPR:[0-9]]].X, 1.0
+; CHECK-NEXT: DOT4 T[[GPR]].Y (MASKED), 1.0
+; CHECK-NEXT: DOT4 T[[GPR]].Z (MASKED), 1.0
+; CHECK-NEXT: DOT4 * T[[GPR]].W (MASKED), 1.0
+define void @inline_literal_dot4(float addrspace(1)* %out) {
+entry:
+ %0 = call float @llvm.AMDGPU.dp4(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+
+attributes #1 = { readnone }
diff --git a/test/CodeGen/R600/llvm.AMDGPU.imax.ll b/test/CodeGen/R600/llvm.AMDGPU.imax.ll
index 3e854c8..1336f4e 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.imax.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.imax.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
;CHECK: V_MAX_I32_e32
diff --git a/test/CodeGen/R600/llvm.AMDGPU.imin.ll b/test/CodeGen/R600/llvm.AMDGPU.imin.ll
index e227bf8..3435ea4 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.imin.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.imin.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
;CHECK: V_MIN_I32_e32
diff --git a/test/CodeGen/R600/llvm.AMDGPU.mul.ll b/test/CodeGen/R600/llvm.AMDGPU.mul.ll
index cc0732b..83b56a5 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.mul.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.mul.ll
@@ -2,16 +2,16 @@
;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @test() {
- %r0 = call float @llvm.R600.load.input(i32 0)
- %r1 = call float @llvm.R600.load.input(i32 1)
+define void @test(<4 x float> inreg %reg0) #0 {
+ %r0 = extractelement <4 x float> %reg0, i32 0
+ %r1 = extractelement <4 x float> %reg0, i32 1
%r2 = call float @llvm.AMDGPU.mul( float %r0, float %r1)
- call void @llvm.AMDGPU.store.output(float %r2, i32 0)
+ %vec = insertelement <4 x float> undef, float %r2, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
-declare float @llvm.R600.load.input(i32) readnone
-
-declare void @llvm.AMDGPU.store.output(float, i32)
-
declare float @llvm.AMDGPU.mul(float ,float ) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
\ No newline at end of file
diff --git a/test/CodeGen/R600/llvm.AMDGPU.trunc.ll b/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
index 7627783..e6bb2c4 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.trunc.ll
@@ -1,8 +1,8 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
; R600-CHECK: @amdgpu_trunc
-; R600-CHECK: TRUNC * T{{[0-9]+\.[XYZW]}}, KC0[2].Z
+; R600-CHECK: TRUNC T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI-CHECK: @amdgpu_trunc
; SI-CHECK: V_TRUNC_F32
diff --git a/test/CodeGen/R600/llvm.AMDGPU.umax.ll b/test/CodeGen/R600/llvm.AMDGPU.umax.ll
index 7699c04..4cfa133 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.umax.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.umax.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
;CHECK: V_MAX_U32_e32
diff --git a/test/CodeGen/R600/llvm.AMDGPU.umin.ll b/test/CodeGen/R600/llvm.AMDGPU.umin.ll
index a911ad9..14af051 100644
--- a/test/CodeGen/R600/llvm.AMDGPU.umin.ll
+++ b/test/CodeGen/R600/llvm.AMDGPU.umin.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
;CHECK: V_MIN_U32_e32
diff --git a/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll b/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll
index e45722c..0438ecc 100644
--- a/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll
+++ b/test/CodeGen/R600/llvm.SI.fs.interp.constant.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
;CHECK: S_MOV_B32
;CHECK-NEXT: V_INTERP_MOV_F32
diff --git a/test/CodeGen/R600/llvm.SI.imageload.ll b/test/CodeGen/R600/llvm.SI.imageload.ll
index 0adcdfc..59e00f0 100644
--- a/test/CodeGen/R600/llvm.SI.imageload.ll
+++ b/test/CodeGen/R600/llvm.SI.imageload.ll
@@ -1,15 +1,15 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
-;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 15, 0, 0, -1
-;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+_VGPR[0-9]+}}, 3, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+}}, 2, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+}}, 1, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+}}, 4, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+}}, 8, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+_VGPR[0-9]+}}, 5, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+_VGPR[0-9]+}}, 12, 0, 0, -1
-;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 7, 0, 0, 0
-;CHECK-DAG: IMAGE_LOAD_MIP {{VGPR[0-9]+}}, 8, 0, 0, -1
+;CHECK-DAG: IMAGE_LOAD {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1
+;CHECK-DAG: IMAGE_LOAD_MIP {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0
+;CHECK-DAG: IMAGE_LOAD_MIP {{v[0-9]+}}, 2, 0, 0, 0
+;CHECK-DAG: IMAGE_LOAD_MIP {{v[0-9]+}}, 1, 0, 0, 0
+;CHECK-DAG: IMAGE_LOAD_MIP {{v[0-9]+}}, 4, 0, 0, 0
+;CHECK-DAG: IMAGE_LOAD_MIP {{v[0-9]+}}, 8, 0, 0, 0
+;CHECK-DAG: IMAGE_LOAD_MIP {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0
+;CHECK-DAG: IMAGE_LOAD_MIP {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1
+;CHECK-DAG: IMAGE_LOAD_MIP {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0
+;CHECK-DAG: IMAGE_LOAD_MIP {{v[0-9]+}}, 8, 0, 0, -1
define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
%v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
@@ -23,25 +23,25 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
%v15 = insertelement <4 x i32> undef, i32 %a4, i32 2
%v16 = insertelement <4 x i32> undef, i32 %a4, i32 3
%res1 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v1,
- <8 x i32> undef, i32 1)
+ <32 x i8> undef, i32 1)
%res2 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v2,
- <8 x i32> undef, i32 2)
+ <32 x i8> undef, i32 2)
%res3 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v3,
- <8 x i32> undef, i32 3)
+ <32 x i8> undef, i32 3)
%res4 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v4,
- <8 x i32> undef, i32 4)
+ <32 x i8> undef, i32 4)
%res5 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v5,
- <8 x i32> undef, i32 5)
+ <32 x i8> undef, i32 5)
%res6 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v6,
- <8 x i32> undef, i32 6)
+ <32 x i8> undef, i32 6)
%res10 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v10,
- <8 x i32> undef, i32 10)
+ <32 x i8> undef, i32 10)
%res11 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v11,
- <8 x i32> undef, i32 11)
+ <32 x i8> undef, i32 11)
%res15 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v15,
- <8 x i32> undef, i32 15)
+ <32 x i8> undef, i32 15)
%res16 = call <4 x i32> @llvm.SI.imageload.(<4 x i32> %v16,
- <8 x i32> undef, i32 16)
+ <32 x i8> undef, i32 16)
%e1 = extractelement <4 x i32> %res1, i32 0
%e2 = extractelement <4 x i32> %res2, i32 1
%e3 = extractelement <4 x i32> %res3, i32 2
@@ -82,6 +82,50 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
ret void
}
-declare <4 x i32> @llvm.SI.imageload.(<4 x i32>, <8 x i32>, i32) readnone
+; Test that coordinates are stored in vgprs and not sgprs
+; CHECK: vgpr_coords
+; CHECK: IMAGE_LOAD_MIP {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, 0, 0, 0, 0, 0, {{v\[[0-9]+:[0-9]+\]}}
+define void @vgpr_coords(float addrspace(2)* addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+ %20 = getelementptr float addrspace(2)* addrspace(2)* %0, i32 0
+ %21 = load float addrspace(2)* addrspace(2)* %20, !tbaa !2
+ %22 = getelementptr float addrspace(2)* %21, i32 0
+ %23 = load float addrspace(2)* %22, !tbaa !2, !invariant.load !1
+ %24 = getelementptr float addrspace(2)* %21, i32 1
+ %25 = load float addrspace(2)* %24, !tbaa !2, !invariant.load !1
+ %26 = getelementptr float addrspace(2)* %21, i32 4
+ %27 = load float addrspace(2)* %26, !tbaa !2, !invariant.load !1
+ %28 = getelementptr <32 x i8> addrspace(2)* %2, i32 0
+ %29 = load <32 x i8> addrspace(2)* %28, !tbaa !2
+ %30 = bitcast float %27 to i32
+ %31 = bitcast float %23 to i32
+ %32 = bitcast float %25 to i32
+ %33 = insertelement <4 x i32> undef, i32 %31, i32 0
+ %34 = insertelement <4 x i32> %33, i32 %32, i32 1
+ %35 = insertelement <4 x i32> %34, i32 %30, i32 2
+ %36 = insertelement <4 x i32> %35, i32 undef, i32 3
+ %37 = call <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32> %36, <32 x i8> %29, i32 2)
+ %38 = extractelement <4 x i32> %37, i32 0
+ %39 = extractelement <4 x i32> %37, i32 1
+ %40 = extractelement <4 x i32> %37, i32 2
+ %41 = extractelement <4 x i32> %37, i32 3
+ %42 = bitcast i32 %38 to float
+ %43 = bitcast i32 %39 to float
+ %44 = bitcast i32 %40 to float
+ %45 = bitcast i32 %41 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %42, float %43, float %44, float %45)
+ ret void
+}
+
+declare <4 x i32> @llvm.SI.imageload.(<4 x i32>, <32 x i8>, i32) readnone
+; Function Attrs: nounwind readnone
+declare <4 x i32> @llvm.SI.imageload.v4i32(<4 x i32>, <32 x i8>, i32) #1
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
+
+!0 = metadata !{metadata !"const", null}
+!1 = metadata !{}
+!2 = metadata !{metadata !0, metadata !0, i64 0, i32 1}
diff --git a/test/CodeGen/R600/llvm.SI.resinfo.ll b/test/CodeGen/R600/llvm.SI.resinfo.ll
index eb31514..af3afc1 100644
--- a/test/CodeGen/R600/llvm.SI.resinfo.ll
+++ b/test/CodeGen/R600/llvm.SI.resinfo.ll
@@ -1,40 +1,40 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 15, 0, 0, -1
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+_VGPR[0-9]+}}, 3, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+}}, 2, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+}}, 1, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+}}, 4, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+}}, 8, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+_VGPR[0-9]+}}, 5, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+_VGPR[0-9]+}}, 9, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+_VGPR[0-9]+}}, 6, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+_VGPR[0-9]+}}, 10, 0, 0, -1
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+_VGPR[0-9]+}}, 12, 0, 0, -1
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 7, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 11, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 13, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 14, 0, 0, 0
-;CHECK: IMAGE_GET_RESINFO {{VGPR[0-9]+}}, 8, 0, 0, -1
+;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 15, 0, 0, -1
+;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 3, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v[0-9]+}}, 2, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v[0-9]+}}, 1, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v[0-9]+}}, 4, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v[0-9]+}}, 8, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 5, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 9, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 6, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 10, 0, 0, -1
+;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 12, 0, 0, -1
+;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 7, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 11, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 13, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v\[[0-9]+:[0-9]+\]}}, 14, 0, 0, 0
+;CHECK: IMAGE_GET_RESINFO {{v[0-9]+}}, 8, 0, 0, -1
define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8,
i32 %a9, i32 %a10, i32 %a11, i32 %a12, i32 %a13, i32 %a14, i32 %a15, i32 %a16) {
- %res1 = call <4 x i32> @llvm.SI.resinfo(i32 %a1, <8 x i32> undef, i32 1)
- %res2 = call <4 x i32> @llvm.SI.resinfo(i32 %a2, <8 x i32> undef, i32 2)
- %res3 = call <4 x i32> @llvm.SI.resinfo(i32 %a3, <8 x i32> undef, i32 3)
- %res4 = call <4 x i32> @llvm.SI.resinfo(i32 %a4, <8 x i32> undef, i32 4)
- %res5 = call <4 x i32> @llvm.SI.resinfo(i32 %a5, <8 x i32> undef, i32 5)
- %res6 = call <4 x i32> @llvm.SI.resinfo(i32 %a6, <8 x i32> undef, i32 6)
- %res7 = call <4 x i32> @llvm.SI.resinfo(i32 %a7, <8 x i32> undef, i32 7)
- %res8 = call <4 x i32> @llvm.SI.resinfo(i32 %a8, <8 x i32> undef, i32 8)
- %res9 = call <4 x i32> @llvm.SI.resinfo(i32 %a9, <8 x i32> undef, i32 9)
- %res10 = call <4 x i32> @llvm.SI.resinfo(i32 %a10, <8 x i32> undef, i32 10)
- %res11 = call <4 x i32> @llvm.SI.resinfo(i32 %a11, <8 x i32> undef, i32 11)
- %res12 = call <4 x i32> @llvm.SI.resinfo(i32 %a12, <8 x i32> undef, i32 12)
- %res13 = call <4 x i32> @llvm.SI.resinfo(i32 %a13, <8 x i32> undef, i32 13)
- %res14 = call <4 x i32> @llvm.SI.resinfo(i32 %a14, <8 x i32> undef, i32 14)
- %res15 = call <4 x i32> @llvm.SI.resinfo(i32 %a15, <8 x i32> undef, i32 15)
- %res16 = call <4 x i32> @llvm.SI.resinfo(i32 %a16, <8 x i32> undef, i32 16)
+ %res1 = call <4 x i32> @llvm.SI.resinfo(i32 %a1, <32 x i8> undef, i32 1)
+ %res2 = call <4 x i32> @llvm.SI.resinfo(i32 %a2, <32 x i8> undef, i32 2)
+ %res3 = call <4 x i32> @llvm.SI.resinfo(i32 %a3, <32 x i8> undef, i32 3)
+ %res4 = call <4 x i32> @llvm.SI.resinfo(i32 %a4, <32 x i8> undef, i32 4)
+ %res5 = call <4 x i32> @llvm.SI.resinfo(i32 %a5, <32 x i8> undef, i32 5)
+ %res6 = call <4 x i32> @llvm.SI.resinfo(i32 %a6, <32 x i8> undef, i32 6)
+ %res7 = call <4 x i32> @llvm.SI.resinfo(i32 %a7, <32 x i8> undef, i32 7)
+ %res8 = call <4 x i32> @llvm.SI.resinfo(i32 %a8, <32 x i8> undef, i32 8)
+ %res9 = call <4 x i32> @llvm.SI.resinfo(i32 %a9, <32 x i8> undef, i32 9)
+ %res10 = call <4 x i32> @llvm.SI.resinfo(i32 %a10, <32 x i8> undef, i32 10)
+ %res11 = call <4 x i32> @llvm.SI.resinfo(i32 %a11, <32 x i8> undef, i32 11)
+ %res12 = call <4 x i32> @llvm.SI.resinfo(i32 %a12, <32 x i8> undef, i32 12)
+ %res13 = call <4 x i32> @llvm.SI.resinfo(i32 %a13, <32 x i8> undef, i32 13)
+ %res14 = call <4 x i32> @llvm.SI.resinfo(i32 %a14, <32 x i8> undef, i32 14)
+ %res15 = call <4 x i32> @llvm.SI.resinfo(i32 %a15, <32 x i8> undef, i32 15)
+ %res16 = call <4 x i32> @llvm.SI.resinfo(i32 %a16, <32 x i8> undef, i32 16)
%e1 = extractelement <4 x i32> %res1, i32 0
%e2 = extractelement <4 x i32> %res2, i32 1
%e3 = extractelement <4 x i32> %res3, i32 2
@@ -105,6 +105,6 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7,
ret void
}
-declare <4 x i32> @llvm.SI.resinfo(i32, <8 x i32>, i32) readnone
+declare <4 x i32> @llvm.SI.resinfo(i32, <32 x i8>, i32) readnone
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/R600/llvm.SI.sample-masked.ll b/test/CodeGen/R600/llvm.SI.sample-masked.ll
new file mode 100644
index 0000000..e5e4ec4
--- /dev/null
+++ b/test/CodeGen/R600/llvm.SI.sample-masked.ll
@@ -0,0 +1,93 @@
+;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+
+; CHECK-LABEL: @v1
+; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 13
+define void @v1(i32 %a1) {
+entry:
+ %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+ %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %2 = extractelement <4 x float> %1, i32 0
+ %3 = extractelement <4 x float> %1, i32 2
+ %4 = extractelement <4 x float> %1, i32 3
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4)
+ ret void
+}
+
+; CHECK-LABEL: @v2
+; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 11
+define void @v2(i32 %a1) {
+entry:
+ %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+ %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %2 = extractelement <4 x float> %1, i32 0
+ %3 = extractelement <4 x float> %1, i32 1
+ %4 = extractelement <4 x float> %1, i32 3
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4)
+ ret void
+}
+
+; CHECK-LABEL: @v3
+; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 14
+define void @v3(i32 %a1) {
+entry:
+ %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+ %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %2 = extractelement <4 x float> %1, i32 1
+ %3 = extractelement <4 x float> %1, i32 2
+ %4 = extractelement <4 x float> %1, i32 3
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4)
+ ret void
+}
+
+; CHECK-LABEL: @v4
+; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 7
+define void @v4(i32 %a1) {
+entry:
+ %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+ %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %2 = extractelement <4 x float> %1, i32 0
+ %3 = extractelement <4 x float> %1, i32 1
+ %4 = extractelement <4 x float> %1, i32 2
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4)
+ ret void
+}
+
+; CHECK-LABEL: @v5
+; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 10
+define void @v5(i32 %a1) {
+entry:
+ %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+ %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %2 = extractelement <4 x float> %1, i32 1
+ %3 = extractelement <4 x float> %1, i32 3
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3)
+ ret void
+}
+
+; CHECK-LABEL: @v6
+; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 6
+define void @v6(i32 %a1) {
+entry:
+ %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+ %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %2 = extractelement <4 x float> %1, i32 1
+ %3 = extractelement <4 x float> %1, i32 2
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3)
+ ret void
+}
+
+; CHECK-LABEL: @v7
+; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 9
+define void @v7(i32 %a1) {
+entry:
+ %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+ %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %2 = extractelement <4 x float> %1, i32 0
+ %3 = extractelement <4 x float> %1, i32 3
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3)
+ ret void
+}
+
+declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/R600/llvm.SI.sample.ll b/test/CodeGen/R600/llvm.SI.sample.ll
index 7655996..d41737c 100644
--- a/test/CodeGen/R600/llvm.SI.sample.ll
+++ b/test/CodeGen/R600/llvm.SI.sample.ll
@@ -1,21 +1,21 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
-;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 15
-;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+}}, 3
-;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+}}, 2
-;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+}}, 1
-;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+}}, 4
-;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+}}, 8
-;CHECK-DAG: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+}}, 5
-;CHECK-DAG: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+}}, 9
-;CHECK-DAG: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+}}, 6
-;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+}}, 10
-;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+}}, 12
-;CHECK-DAG: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 7
-;CHECK-DAG: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 11
-;CHECK-DAG: IMAGE_SAMPLE_C {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 13
-;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 14
-;CHECK-DAG: IMAGE_SAMPLE {{VGPR[0-9]+}}, 8
+;CHECK-DAG: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 15
+;CHECK-DAG: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 3
+;CHECK-DAG: IMAGE_SAMPLE {{v[0-9]+}}, 2
+;CHECK-DAG: IMAGE_SAMPLE {{v[0-9]+}}, 1
+;CHECK-DAG: IMAGE_SAMPLE {{v[0-9]+}}, 4
+;CHECK-DAG: IMAGE_SAMPLE {{v[0-9]+}}, 8
+;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 5
+;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 9
+;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 6
+;CHECK-DAG: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 10
+;CHECK-DAG: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 12
+;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 7
+;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 11
+;CHECK-DAG: IMAGE_SAMPLE_C {{v\[[0-9]+:[0-9]+\]}}, 13
+;CHECK-DAG: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 14
+;CHECK-DAG: IMAGE_SAMPLE {{v[0-9]+}}, 8
define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
%v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
@@ -35,37 +35,37 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
%v15 = insertelement <4 x i32> undef, i32 %a4, i32 2
%v16 = insertelement <4 x i32> undef, i32 %a4, i32 3
%res1 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v1,
- <8 x i32> undef, <4 x i32> undef, i32 1)
+ <32 x i8> undef, <16 x i8> undef, i32 1)
%res2 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v2,
- <8 x i32> undef, <4 x i32> undef, i32 2)
+ <32 x i8> undef, <16 x i8> undef, i32 2)
%res3 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v3,
- <8 x i32> undef, <4 x i32> undef, i32 3)
+ <32 x i8> undef, <16 x i8> undef, i32 3)
%res4 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v4,
- <8 x i32> undef, <4 x i32> undef, i32 4)
+ <32 x i8> undef, <16 x i8> undef, i32 4)
%res5 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v5,
- <8 x i32> undef, <4 x i32> undef, i32 5)
+ <32 x i8> undef, <16 x i8> undef, i32 5)
%res6 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v6,
- <8 x i32> undef, <4 x i32> undef, i32 6)
+ <32 x i8> undef, <16 x i8> undef, i32 6)
%res7 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v7,
- <8 x i32> undef, <4 x i32> undef, i32 7)
+ <32 x i8> undef, <16 x i8> undef, i32 7)
%res8 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v8,
- <8 x i32> undef, <4 x i32> undef, i32 8)
+ <32 x i8> undef, <16 x i8> undef, i32 8)
%res9 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v9,
- <8 x i32> undef, <4 x i32> undef, i32 9)
+ <32 x i8> undef, <16 x i8> undef, i32 9)
%res10 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v10,
- <8 x i32> undef, <4 x i32> undef, i32 10)
+ <32 x i8> undef, <16 x i8> undef, i32 10)
%res11 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v11,
- <8 x i32> undef, <4 x i32> undef, i32 11)
+ <32 x i8> undef, <16 x i8> undef, i32 11)
%res12 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v12,
- <8 x i32> undef, <4 x i32> undef, i32 12)
+ <32 x i8> undef, <16 x i8> undef, i32 12)
%res13 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v13,
- <8 x i32> undef, <4 x i32> undef, i32 13)
+ <32 x i8> undef, <16 x i8> undef, i32 13)
%res14 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v14,
- <8 x i32> undef, <4 x i32> undef, i32 14)
+ <32 x i8> undef, <16 x i8> undef, i32 14)
%res15 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v15,
- <8 x i32> undef, <4 x i32> undef, i32 15)
+ <32 x i8> undef, <16 x i8> undef, i32 15)
%res16 = call <4 x float> @llvm.SI.sample.(<4 x i32> %v16,
- <8 x i32> undef, <4 x i32> undef, i32 16)
+ <32 x i8> undef, <16 x i8> undef, i32 16)
%e1 = extractelement <4 x float> %res1, i32 0
%e2 = extractelement <4 x float> %res2, i32 1
%e3 = extractelement <4 x float> %res3, i32 2
@@ -135,6 +135,23 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
ret void
}
-declare <4 x float> @llvm.SI.sample.(<4 x i32>, <8 x i32>, <4 x i32>, i32) readnone
+; CHECK: @v1
+; CHECK: IMAGE_SAMPLE {{v\[[0-9]+:[0-9]+\]}}, 15
+define void @v1(i32 %a1) {
+entry:
+ %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+ %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %2 = extractelement <4 x float> %1, i32 0
+ %3 = extractelement <4 x float> %1, i32 1
+ %4 = extractelement <4 x float> %1, i32 2
+ %5 = extractelement <4 x float> %1, i32 3
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %5)
+ ret void
+}
+
+
+declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone
+
+declare <4 x float> @llvm.SI.sample.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/R600/llvm.SI.sampled.ll b/test/CodeGen/R600/llvm.SI.sampled.ll
index 3b05551..21ac725 100644
--- a/test/CodeGen/R600/llvm.SI.sampled.ll
+++ b/test/CodeGen/R600/llvm.SI.sampled.ll
@@ -1,21 +1,21 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
-;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 15
-;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 3
-;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 2
-;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 1
-;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 4
-;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 8
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 5
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 9
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+}}, 6
-;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 10
-;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+}}, 12
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 7
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 11
-;CHECK-DAG: IMAGE_SAMPLE_C_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 13
-;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+_VGPR[0-9]+_VGPR[0-9]+}}, 14
-;CHECK-DAG: IMAGE_SAMPLE_D {{VGPR[0-9]+}}, 8
+;CHECK-DAG: IMAGE_SAMPLE_D {{v\[[0-9]+:[0-9]+\]}}, 15
+;CHECK-DAG: IMAGE_SAMPLE_D {{v\[[0-9]+:[0-9]+\]}}, 3
+;CHECK-DAG: IMAGE_SAMPLE_D {{v[0-9]+}}, 2
+;CHECK-DAG: IMAGE_SAMPLE_D {{v[0-9]+}}, 1
+;CHECK-DAG: IMAGE_SAMPLE_D {{v[0-9]+}}, 4
+;CHECK-DAG: IMAGE_SAMPLE_D {{v[0-9]+}}, 8
+;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 5
+;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 9
+;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 6
+;CHECK-DAG: IMAGE_SAMPLE_D {{v\[[0-9]+:[0-9]+\]}}, 10
+;CHECK-DAG: IMAGE_SAMPLE_D {{v\[[0-9]+:[0-9]+\]}}, 12
+;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 7
+;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 11
+;CHECK-DAG: IMAGE_SAMPLE_C_D {{v\[[0-9]+:[0-9]+\]}}, 13
+;CHECK-DAG: IMAGE_SAMPLE_D {{v\[[0-9]+:[0-9]+\]}}, 14
+;CHECK-DAG: IMAGE_SAMPLE_D {{v[0-9]+}}, 8
define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
%v1 = insertelement <4 x i32> undef, i32 %a1, i32 0
@@ -35,37 +35,37 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
%v15 = insertelement <4 x i32> undef, i32 %a4, i32 2
%v16 = insertelement <4 x i32> undef, i32 %a4, i32 3
%res1 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v1,
- <8 x i32> undef, <4 x i32> undef, i32 1)
+ <32 x i8> undef, <16 x i8> undef, i32 1)
%res2 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v2,
- <8 x i32> undef, <4 x i32> undef, i32 2)
+ <32 x i8> undef, <16 x i8> undef, i32 2)
%res3 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v3,
- <8 x i32> undef, <4 x i32> undef, i32 3)
+ <32 x i8> undef, <16 x i8> undef, i32 3)
%res4 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v4,
- <8 x i32> undef, <4 x i32> undef, i32 4)
+ <32 x i8> undef, <16 x i8> undef, i32 4)
%res5 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v5,
- <8 x i32> undef, <4 x i32> undef, i32 5)
+ <32 x i8> undef, <16 x i8> undef, i32 5)
%res6 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v6,
- <8 x i32> undef, <4 x i32> undef, i32 6)
+ <32 x i8> undef, <16 x i8> undef, i32 6)
%res7 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v7,
- <8 x i32> undef, <4 x i32> undef, i32 7)
+ <32 x i8> undef, <16 x i8> undef, i32 7)
%res8 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v8,
- <8 x i32> undef, <4 x i32> undef, i32 8)
+ <32 x i8> undef, <16 x i8> undef, i32 8)
%res9 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v9,
- <8 x i32> undef, <4 x i32> undef, i32 9)
+ <32 x i8> undef, <16 x i8> undef, i32 9)
%res10 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v10,
- <8 x i32> undef, <4 x i32> undef, i32 10)
+ <32 x i8> undef, <16 x i8> undef, i32 10)
%res11 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v11,
- <8 x i32> undef, <4 x i32> undef, i32 11)
+ <32 x i8> undef, <16 x i8> undef, i32 11)
%res12 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v12,
- <8 x i32> undef, <4 x i32> undef, i32 12)
+ <32 x i8> undef, <16 x i8> undef, i32 12)
%res13 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v13,
- <8 x i32> undef, <4 x i32> undef, i32 13)
+ <32 x i8> undef, <16 x i8> undef, i32 13)
%res14 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v14,
- <8 x i32> undef, <4 x i32> undef, i32 14)
+ <32 x i8> undef, <16 x i8> undef, i32 14)
%res15 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v15,
- <8 x i32> undef, <4 x i32> undef, i32 15)
+ <32 x i8> undef, <16 x i8> undef, i32 15)
%res16 = call <4 x float> @llvm.SI.sampled.(<4 x i32> %v16,
- <8 x i32> undef, <4 x i32> undef, i32 16)
+ <32 x i8> undef, <16 x i8> undef, i32 16)
%e1 = extractelement <4 x float> %res1, i32 0
%e2 = extractelement <4 x float> %res2, i32 1
%e3 = extractelement <4 x float> %res3, i32 2
@@ -135,6 +135,6 @@ define void @test(i32 %a1, i32 %a2, i32 %a3, i32 %a4) {
ret void
}
-declare <4 x float> @llvm.SI.sampled.(<4 x i32>, <8 x i32>, <4 x i32>, i32) readnone
+declare <4 x float> @llvm.SI.sampled.(<4 x i32>, <32 x i8>, <16 x i8>, i32) readnone
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/R600/llvm.SI.tbuffer.store.ll b/test/CodeGen/R600/llvm.SI.tbuffer.store.ll
new file mode 100644
index 0000000..fa7c3ca
--- /dev/null
+++ b/test/CodeGen/R600/llvm.SI.tbuffer.store.ll
@@ -0,0 +1,44 @@
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: @test1
+;CHECK: TBUFFER_STORE_FORMAT_XYZW {{v\[[0-9]+:[0-9]+\]}}, 32, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define void @test1(i32 %a1, i32 %vaddr) {
+ %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+ call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
+ i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1,
+ i32 1, i32 0)
+ ret void
+}
+
+;CHECK-LABEL: @test2
+;CHECK: TBUFFER_STORE_FORMAT_XYZ {{v\[[0-9]+:[0-9]+\]}}, 24, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define void @test2(i32 %a1, i32 %vaddr) {
+ %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+ call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
+ i32 3, i32 %vaddr, i32 0, i32 24, i32 13, i32 4, i32 1, i32 0, i32 1,
+ i32 1, i32 0)
+ ret void
+}
+
+;CHECK-LABEL: @test3
+;CHECK: TBUFFER_STORE_FORMAT_XY {{v\[[0-9]+:[0-9]+\]}}, 16, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define void @test3(i32 %a1, i32 %vaddr) {
+ %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0
+ call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata,
+ i32 2, i32 %vaddr, i32 0, i32 16, i32 11, i32 4, i32 1, i32 0, i32 1,
+ i32 1, i32 0)
+ ret void
+}
+
+;CHECK-LABEL: @test4
+;CHECK: TBUFFER_STORE_FORMAT_X {{v[0-9]+}}, 8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define void @test4(i32 %vdata, i32 %vaddr) {
+ call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata,
+ i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1,
+ i32 1, i32 0)
+ ret void
+}
+
+declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.SI.tbuffer.store.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
diff --git a/test/CodeGen/R600/llvm.SI.tid.ll b/test/CodeGen/R600/llvm.SI.tid.ll
index 238d9f2..fe17304 100644
--- a/test/CodeGen/R600/llvm.SI.tid.ll
+++ b/test/CodeGen/R600/llvm.SI.tid.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
;CHECK: V_MBCNT_LO_U32_B32_e64
;CHECK: V_MBCNT_HI_U32_B32_e32
diff --git a/test/CodeGen/R600/llvm.cos.ll b/test/CodeGen/R600/llvm.cos.ll
index 8fb4559..aaf2305 100644
--- a/test/CodeGen/R600/llvm.cos.ll
+++ b/test/CodeGen/R600/llvm.cos.ll
@@ -5,15 +5,15 @@
;CHECK: ADD *
;CHECK: COS * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-define void @test() {
- %r0 = call float @llvm.R600.load.input(i32 0)
+define void @test(<4 x float> inreg %reg0) #0 {
+ %r0 = extractelement <4 x float> %reg0, i32 0
%r1 = call float @llvm.cos.f32(float %r0)
- call void @llvm.AMDGPU.store.output(float %r1, i32 0)
+ %vec = insertelement <4 x float> undef, float %r1, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
declare float @llvm.cos.f32(float) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-declare float @llvm.R600.load.input(i32) readnone
-
-declare void @llvm.AMDGPU.store.output(float, i32)
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/llvm.floor.ll b/test/CodeGen/R600/llvm.floor.ll
new file mode 100644
index 0000000..f7071cd
--- /dev/null
+++ b/test/CodeGen/R600/llvm.floor.ll
@@ -0,0 +1,54 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+
+; R600-CHECK: @f32
+; R600-CHECK: FLOOR
+; SI-CHECK: @f32
+; SI-CHECK: V_FLOOR_F32_e32
+define void @f32(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = call float @llvm.floor.f32(float %in)
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK: @v2f32
+; R600-CHECK: FLOOR
+; R600-CHECK: FLOOR
+; SI-CHECK: @v2f32
+; SI-CHECK: V_FLOOR_F32_e32
+; SI-CHECK: V_FLOOR_F32_e32
+define void @v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+ %0 = call <2 x float> @llvm.floor.v2f32(<2 x float> %in)
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK: @v4f32
+; R600-CHECK: FLOOR
+; R600-CHECK: FLOOR
+; R600-CHECK: FLOOR
+; R600-CHECK: FLOOR
+; SI-CHECK: @v4f32
+; SI-CHECK: V_FLOOR_F32_e32
+; SI-CHECK: V_FLOOR_F32_e32
+; SI-CHECK: V_FLOOR_F32_e32
+; SI-CHECK: V_FLOOR_F32_e32
+define void @v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+entry:
+ %0 = call <4 x float> @llvm.floor.v4f32(<4 x float> %in)
+ store <4 x float> %0, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; Function Attrs: nounwind readonly
+declare float @llvm.floor.f32(float) #0
+
+; Function Attrs: nounwind readonly
+declare <2 x float> @llvm.floor.v2f32(<2 x float>) #0
+
+; Function Attrs: nounwind readonly
+declare <4 x float> @llvm.floor.v4f32(<4 x float>) #0
+
+attributes #0 = { nounwind readonly }
diff --git a/test/CodeGen/R600/llvm.pow.ll b/test/CodeGen/R600/llvm.pow.ll
index 0f51cf4..b587d2b 100644
--- a/test/CodeGen/R600/llvm.pow.ll
+++ b/test/CodeGen/R600/llvm.pow.ll
@@ -4,16 +4,16 @@
;CHECK: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}
;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-define void @test() {
- %r0 = call float @llvm.R600.load.input(i32 0)
- %r1 = call float @llvm.R600.load.input(i32 1)
+define void @test(<4 x float> inreg %reg0) #0 {
+ %r0 = extractelement <4 x float> %reg0, i32 0
+ %r1 = extractelement <4 x float> %reg0, i32 1
%r2 = call float @llvm.pow.f32( float %r0, float %r1)
- call void @llvm.AMDGPU.store.output(float %r2, i32 0)
+ %vec = insertelement <4 x float> undef, float %r2, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
-declare float @llvm.R600.load.input(i32) readnone
-
-declare void @llvm.AMDGPU.store.output(float, i32)
-
declare float @llvm.pow.f32(float ,float ) readonly
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/llvm.rint.ll b/test/CodeGen/R600/llvm.rint.ll
new file mode 100644
index 0000000..c174b33
--- /dev/null
+++ b/test/CodeGen/R600/llvm.rint.ll
@@ -0,0 +1,54 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
+
+; R600-CHECK: @f32
+; R600-CHECK: RNDNE
+; SI-CHECK: @f32
+; SI-CHECK: V_RNDNE_F32_e32
+define void @f32(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = call float @llvm.rint.f32(float %in)
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK: @v2f32
+; R600-CHECK: RNDNE
+; R600-CHECK: RNDNE
+; SI-CHECK: @v2f32
+; SI-CHECK: V_RNDNE_F32_e32
+; SI-CHECK: V_RNDNE_F32_e32
+define void @v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+ %0 = call <2 x float> @llvm.rint.v2f32(<2 x float> %in)
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK: @v4f32
+; R600-CHECK: RNDNE
+; R600-CHECK: RNDNE
+; R600-CHECK: RNDNE
+; R600-CHECK: RNDNE
+; SI-CHECK: @v4f32
+; SI-CHECK: V_RNDNE_F32_e32
+; SI-CHECK: V_RNDNE_F32_e32
+; SI-CHECK: V_RNDNE_F32_e32
+; SI-CHECK: V_RNDNE_F32_e32
+define void @v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+entry:
+ %0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %in)
+ store <4 x float> %0, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+; Function Attrs: nounwind readonly
+declare float @llvm.rint.f32(float) #0
+
+; Function Attrs: nounwind readonly
+declare <2 x float> @llvm.rint.v2f32(<2 x float>) #0
+
+; Function Attrs: nounwind readonly
+declare <4 x float> @llvm.rint.v4f32(<4 x float>) #0
+
+attributes #0 = { nounwind readonly }
diff --git a/test/CodeGen/R600/llvm.round.ll b/test/CodeGen/R600/llvm.round.ll
new file mode 100644
index 0000000..e06d45d
--- /dev/null
+++ b/test/CodeGen/R600/llvm.round.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600 --check-prefix=FUNC
+
+; FUNC-LABEL: @f32
+; R600: FRACT
+; R600-DAG: ADD
+; R600-DAG: CEIL
+; R600-DAG: FLOOR
+; R600: CNDGE
+define void @f32(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = call float @llvm.round.f32(float %in)
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+; The vector tests are really difficult to verify, since it can be hard to
+; predict how the scheduler will order the instructions. We already have
+; a test for the scalar case, so the vector tests just check that the
+; compiler doesn't crash.
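A minimal sketch (not part of the committed test) of what order-insensitive per-element checks could look like, assuming the scalar FRACT/CNDGE pattern simply repeats once for each vector element; it is left out on purpose because, per the note above, even that instruction mix is not guaranteed for the vector expansion:

  ; FUNC-LABEL: @v2f32
  ; R600-DAG: FRACT
  ; R600-DAG: FRACT
  ; R600-DAG: CNDGE
  ; R600-DAG: CNDGE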
+
+; FUNC-LABEL: @v2f32
+; R600: CF_END
+define void @v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+ %0 = call <2 x float> @llvm.round.v2f32(<2 x float> %in)
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @v4f32
+; R600: CF_END
+define void @v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+entry:
+ %0 = call <4 x float> @llvm.round.v4f32(<4 x float> %in)
+ store <4 x float> %0, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+declare float @llvm.round.f32(float)
+declare <2 x float> @llvm.round.v2f32(<2 x float>)
+declare <4 x float> @llvm.round.v4f32(<4 x float>)
diff --git a/test/CodeGen/R600/llvm.sin.ll b/test/CodeGen/R600/llvm.sin.ll
index e94c2ba..9eb9983 100644
--- a/test/CodeGen/R600/llvm.sin.ll
+++ b/test/CodeGen/R600/llvm.sin.ll
@@ -5,15 +5,15 @@
;CHECK: ADD *
;CHECK: SIN * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
-define void @test() {
- %r0 = call float @llvm.R600.load.input(i32 0)
+define void @test(<4 x float> inreg %reg0) #0 {
+ %r0 = extractelement <4 x float> %reg0, i32 0
%r1 = call float @llvm.sin.f32( float %r0)
- call void @llvm.AMDGPU.store.output(float %r1, i32 0)
+ %vec = insertelement <4 x float> undef, float %r1, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
declare float @llvm.sin.f32(float) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-declare float @llvm.R600.load.input(i32) readnone
-
-declare void @llvm.AMDGPU.store.output(float, i32)
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/llvm.sqrt.ll b/test/CodeGen/R600/llvm.sqrt.ll
new file mode 100644
index 0000000..0d0d186
--- /dev/null
+++ b/test/CodeGen/R600/llvm.sqrt.ll
@@ -0,0 +1,54 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+
+; R600-CHECK-LABEL: @sqrt_f32
+; R600-CHECK: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z
+; R600-CHECK: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS
+; SI-CHECK-LABEL: @sqrt_f32
+; SI-CHECK: V_SQRT_F32_e32
+define void @sqrt_f32(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = call float @llvm.sqrt.f32(float %in)
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK-LABEL: @sqrt_v2f32
+; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].W
+; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS
+; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X
+; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS
+; SI-CHECK-LABEL: @sqrt_v2f32
+; SI-CHECK: V_SQRT_F32_e32
+; SI-CHECK: V_SQRT_F32_e32
+define void @sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+ %0 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK-LABEL: @sqrt_v4f32
+; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Y
+; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Y, PS
+; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Z
+; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Z, PS
+; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].W
+; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS
+; R600-CHECK-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X
+; R600-CHECK-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS
+; SI-CHECK-LABEL: @sqrt_v4f32
+; SI-CHECK: V_SQRT_F32_e32
+; SI-CHECK: V_SQRT_F32_e32
+; SI-CHECK: V_SQRT_F32_e32
+; SI-CHECK: V_SQRT_F32_e32
+define void @sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+entry:
+ %0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
+ store <4 x float> %0, <4 x float> addrspace(1)* %out
+ ret void
+}
+
+declare float @llvm.sqrt.f32(float %in)
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
diff --git a/test/CodeGen/R600/load-input-fold.ll b/test/CodeGen/R600/load-input-fold.ll
index aff2a6e..ca86d0e 100644
--- a/test/CodeGen/R600/load-input-fold.ll
+++ b/test/CodeGen/R600/load-input-fold.ll
@@ -1,20 +1,20 @@
;RUN: llc < %s -march=r600 -mcpu=cayman
;REQUIRES: asserts
-define void @main() #0 {
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 {
main_body:
- %0 = call float @llvm.R600.load.input(i32 4)
- %1 = call float @llvm.R600.load.input(i32 5)
- %2 = call float @llvm.R600.load.input(i32 6)
- %3 = call float @llvm.R600.load.input(i32 7)
- %4 = call float @llvm.R600.load.input(i32 8)
- %5 = call float @llvm.R600.load.input(i32 9)
- %6 = call float @llvm.R600.load.input(i32 10)
- %7 = call float @llvm.R600.load.input(i32 11)
- %8 = call float @llvm.R600.load.input(i32 12)
- %9 = call float @llvm.R600.load.input(i32 13)
- %10 = call float @llvm.R600.load.input(i32 14)
- %11 = call float @llvm.R600.load.input(i32 15)
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 1
+ %2 = extractelement <4 x float> %reg1, i32 2
+ %3 = extractelement <4 x float> %reg1, i32 3
+ %4 = extractelement <4 x float> %reg2, i32 0
+ %5 = extractelement <4 x float> %reg2, i32 1
+ %6 = extractelement <4 x float> %reg2, i32 2
+ %7 = extractelement <4 x float> %reg2, i32 3
+ %8 = extractelement <4 x float> %reg3, i32 0
+ %9 = extractelement <4 x float> %reg3, i32 1
+ %10 = extractelement <4 x float> %reg3, i32 2
+ %11 = extractelement <4 x float> %reg3, i32 3
%12 = load <4 x float> addrspace(8)* null
%13 = extractelement <4 x float> %12, i32 0
%14 = fmul float %0, %13
@@ -96,9 +96,6 @@ main_body:
}
; Function Attrs: readnone
-declare float @llvm.R600.load.input(i32) #1
-
-; Function Attrs: readnone
declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
; Function Attrs: readonly
diff --git a/test/CodeGen/R600/load.ll b/test/CodeGen/R600/load.ll
index f478ef5..e4492d7 100644
--- a/test/CodeGen/R600/load.ll
+++ b/test/CodeGen/R600/load.ll
@@ -1,17 +1,17 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK %s
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
;===------------------------------------------------------------------------===;
; GLOBAL ADDRESS SPACE
;===------------------------------------------------------------------------===;
; Load an i8 value from the global address space.
-; R600-CHECK: @load_i8
+; R600-CHECK-LABEL: @load_i8
; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: @load_i8
-; SI-CHECK: BUFFER_LOAD_UBYTE VGPR{{[0-9]+}},
+; SI-CHECK-LABEL: @load_i8
+; SI-CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}},
define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
%1 = load i8 addrspace(1)* %in
%2 = zext i8 %1 to i32
@@ -19,13 +19,13 @@ define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
ret void
}
-; R600-CHECK: @load_i8_sext
+; R600-CHECK-LABEL: @load_i8_sext
; R600-CHECK: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
; R600-CHECK: 24
; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
; R600-CHECK: 24
-; SI-CHECK: @load_i8_sext
+; SI-CHECK-LABEL: @load_i8_sext
; SI-CHECK: BUFFER_LOAD_SBYTE
define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
@@ -35,10 +35,98 @@ entry:
ret void
}
+; R600-CHECK-LABEL: @load_v2i8
+; R600-CHECK: VTX_READ_8
+; R600-CHECK: VTX_READ_8
+; SI-CHECK-LABEL: @load_v2i8
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
+entry:
+ %0 = load <2 x i8> addrspace(1)* %in
+ %1 = zext <2 x i8> %0 to <2 x i32>
+ store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK-LABEL: @load_v2i8_sext
+; R600-CHECK-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; R600-CHECK-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
+; R600-CHECK-DAG: 24
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_X_CHAN]]
+; R600-CHECK-DAG: 24
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Y_CHAN:[XYZW]]], [[DST_Y]]
+; R600-CHECK-DAG: 24
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
+; R600-CHECK-DAG: 24
+; SI-CHECK-LABEL: @load_v2i8_sext
+; SI-CHECK: BUFFER_LOAD_SBYTE
+; SI-CHECK: BUFFER_LOAD_SBYTE
+define void @load_v2i8_sext(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
+entry:
+ %0 = load <2 x i8> addrspace(1)* %in
+ %1 = sext <2 x i8> %0 to <2 x i32>
+ store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK-LABEL: @load_v4i8
+; R600-CHECK: VTX_READ_8
+; R600-CHECK: VTX_READ_8
+; R600-CHECK: VTX_READ_8
+; R600-CHECK: VTX_READ_8
+; SI-CHECK-LABEL: @load_v4i8
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+; SI-CHECK: BUFFER_LOAD_UBYTE
+define void @load_v4i8(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
+entry:
+ %0 = load <4 x i8> addrspace(1)* %in
+ %1 = zext <4 x i8> %0 to <4 x i32>
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK-LABEL: @load_v4i8_sext
+; R600-CHECK-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; R600-CHECK-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; R600-CHECK-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
+; R600-CHECK-DAG: VTX_READ_8 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
+; R600-CHECK-DAG: 24
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_X_CHAN]]
+; R600-CHECK-DAG: 24
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Y_CHAN:[XYZW]]], [[DST_Y]]
+; R600-CHECK-DAG: 24
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
+; R600-CHECK-DAG: 24
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Z_CHAN:[XYZW]]], [[DST_Z]]
+; R600-CHECK-DAG: 24
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Z_CHAN]]
+; R600-CHECK-DAG: 24
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_W_CHAN:[XYZW]]], [[DST_W]]
+; R600-CHECK-DAG: 24
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_W_CHAN]]
+; R600-CHECK-DAG: 24
+; SI-CHECK-LABEL: @load_v4i8_sext
+; SI-CHECK: BUFFER_LOAD_SBYTE
+; SI-CHECK: BUFFER_LOAD_SBYTE
+; SI-CHECK: BUFFER_LOAD_SBYTE
+; SI-CHECK: BUFFER_LOAD_SBYTE
+define void @load_v4i8_sext(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
+entry:
+ %0 = load <4 x i8> addrspace(1)* %in
+ %1 = sext <4 x i8> %0 to <4 x i32>
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
; Load an i16 value from the global address space.
-; R600-CHECK: @load_i16
+; R600-CHECK-LABEL: @load_i16
; R600-CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: @load_i16
+; SI-CHECK-LABEL: @load_i16
; SI-CHECK: BUFFER_LOAD_USHORT
define void @load_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
entry:
@@ -48,13 +136,13 @@ entry:
ret void
}
-; R600-CHECK: @load_i16_sext
+; R600-CHECK-LABEL: @load_i16_sext
; R600-CHECK: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
; R600-CHECK: 16
; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
; R600-CHECK: 16
-; SI-CHECK: @load_i16_sext
+; SI-CHECK-LABEL: @load_i16_sext
; SI-CHECK: BUFFER_LOAD_SSHORT
define void @load_i16_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
entry:
@@ -64,12 +152,100 @@ entry:
ret void
}
+; R600-CHECK-LABEL: @load_v2i16
+; R600-CHECK: VTX_READ_16
+; R600-CHECK: VTX_READ_16
+; SI-CHECK-LABEL: @load_v2i16
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+define void @load_v2i16(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+entry:
+ %0 = load <2 x i16> addrspace(1)* %in
+ %1 = zext <2 x i16> %0 to <2 x i32>
+ store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK-LABEL: @load_v2i16_sext
+; R600-CHECK-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; R600-CHECK-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
+; R600-CHECK-DAG: 16
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_X_CHAN]]
+; R600-CHECK-DAG: 16
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Y_CHAN:[XYZW]]], [[DST_Y]]
+; R600-CHECK-DAG: 16
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
+; R600-CHECK-DAG: 16
+; SI-CHECK-LABEL: @load_v2i16_sext
+; SI-CHECK: BUFFER_LOAD_SSHORT
+; SI-CHECK: BUFFER_LOAD_SSHORT
+define void @load_v2i16_sext(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+entry:
+ %0 = load <2 x i16> addrspace(1)* %in
+ %1 = sext <2 x i16> %0 to <2 x i32>
+ store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK-LABEL: @load_v4i16
+; R600-CHECK: VTX_READ_16
+; R600-CHECK: VTX_READ_16
+; R600-CHECK: VTX_READ_16
+; R600-CHECK: VTX_READ_16
+; SI-CHECK-LABEL: @load_v4i16
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+; SI-CHECK: BUFFER_LOAD_USHORT
+define void @load_v4i16(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+entry:
+ %0 = load <4 x i16> addrspace(1)* %in
+ %1 = zext <4 x i16> %0 to <4 x i32>
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK-LABEL: @load_v4i16_sext
+; R600-CHECK-DAG: VTX_READ_16 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
+; R600-CHECK-DAG: VTX_READ_16 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
+; R600-CHECK-DAG: VTX_READ_16 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
+; R600-CHECK-DAG: VTX_READ_16 [[DST_W:T[0-9]\.[XYZW]]], [[DST_W]]
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
+; R600-CHECK-DAG: 16
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_X_CHAN]]
+; R600-CHECK-DAG: 16
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Y_CHAN:[XYZW]]], [[DST_Y]]
+; R600-CHECK-DAG: 16
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
+; R600-CHECK-DAG: 16
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_Z_CHAN:[XYZW]]], [[DST_Z]]
+; R600-CHECK-DAG: 16
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Z_CHAN]]
+; R600-CHECK-DAG: 16
+; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_W_CHAN:[XYZW]]], [[DST_W]]
+; R600-CHECK-DAG: 16
+; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_W_CHAN]]
+; R600-CHECK-DAG: 16
+; SI-CHECK-LABEL: @load_v4i16_sext
+; SI-CHECK: BUFFER_LOAD_SSHORT
+; SI-CHECK: BUFFER_LOAD_SSHORT
+; SI-CHECK: BUFFER_LOAD_SSHORT
+; SI-CHECK: BUFFER_LOAD_SSHORT
+define void @load_v4i16_sext(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+entry:
+ %0 = load <4 x i16> addrspace(1)* %in
+ %1 = sext <4 x i16> %0 to <4 x i32>
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
; load an i32 value from the global address space.
-; R600-CHECK: @load_i32
+; R600-CHECK-LABEL: @load_i32
; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-; SI-CHECK: @load_i32
-; SI-CHECK: BUFFER_LOAD_DWORD VGPR{{[0-9]+}}
+; SI-CHECK-LABEL: @load_i32
+; SI-CHECK: BUFFER_LOAD_DWORD v{{[0-9]+}}
define void @load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%0 = load i32 addrspace(1)* %in
@@ -78,11 +254,11 @@ entry:
}
; load a f32 value from the global address space.
-; R600-CHECK: @load_f32
+; R600-CHECK-LABEL: @load_f32
; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-; SI-CHECK: @load_f32
-; SI-CHECK: BUFFER_LOAD_DWORD VGPR{{[0-9]+}}
+; SI-CHECK-LABEL: @load_f32
+; SI-CHECK: BUFFER_LOAD_DWORD v{{[0-9]+}}
define void @load_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
entry:
%0 = load float addrspace(1)* %in
@@ -91,10 +267,10 @@ entry:
}
; load a v2f32 value from the global address space
-; R600-CHECK: @load_v2f32
+; R600-CHECK-LABEL: @load_v2f32
; R600-CHECK: VTX_READ_64
-; SI-CHECK: @load_v2f32
+; SI-CHECK-LABEL: @load_v2f32
; SI-CHECK: BUFFER_LOAD_DWORDX2
define void @load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
entry:
@@ -103,11 +279,11 @@ entry:
ret void
}
-; R600-CHECK: @load_i64
-; R600-CHECK: RAT
-; R600-CHECK: RAT
+; R600-CHECK-LABEL: @load_i64
+; R600-CHECK: MEM_RAT
+; R600-CHECK: MEM_RAT
-; SI-CHECK: @load_i64
+; SI-CHECK-LABEL: @load_i64
; SI-CHECK: BUFFER_LOAD_DWORDX2
define void @load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
entry:
@@ -116,15 +292,13 @@ entry:
ret void
}
-; R600-CHECK: @load_i64_sext
-; R600-CHECK: RAT
-; R600-CHECK: RAT
+; R600-CHECK-LABEL: @load_i64_sext
+; R600-CHECK: MEM_RAT
+; R600-CHECK: MEM_RAT
; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal.x
; R600-CHECK: 31
-; SI-CHECK: @load_i64_sext
-; SI-CHECK: BUFFER_LOAD_DWORDX2 [[VAL:VGPR[0-9]_VGPR[0-9]]]
-; SI-CHECK: V_LSHL_B64 [[LSHL:VGPR[0-9]_VGPR[0-9]]], [[VAL]], 32
-; SI-CHECK: V_ASHR_I64 VGPR{{[0-9]}}_VGPR{{[0-9]}}, [[LSHL]], 32
+; SI-CHECK-LABEL: @load_i64_sext
+; SI-CHECK: BUFFER_LOAD_DWORDX2 [[VAL:v\[[0-9]:[0-9]\]]]
define void @load_i64_sext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
@@ -134,9 +308,9 @@ entry:
ret void
}
-; R600-CHECK: @load_i64_zext
-; R600-CHECK: RAT
-; R600-CHECK: RAT
+; R600-CHECK-LABEL: @load_i64_zext
+; R600-CHECK: MEM_RAT
+; R600-CHECK: MEM_RAT
define void @load_i64_zext(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%0 = load i32 addrspace(1)* %in
@@ -150,14 +324,14 @@ entry:
;===------------------------------------------------------------------------===;
; Load a sign-extended i8 value
-; R600-CHECK: @load_const_i8_sext
+; R600-CHECK-LABEL: @load_const_i8_sext
; R600-CHECK: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
; R600-CHECK: 24
; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
; R600-CHECK: 24
-; SI-CHECK: @load_const_i8_sext
-; SI-CHECK: BUFFER_LOAD_SBYTE VGPR{{[0-9]+}},
+; SI-CHECK-LABEL: @load_const_i8_sext
+; SI-CHECK: BUFFER_LOAD_SBYTE v{{[0-9]+}},
define void @load_const_i8_sext(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
entry:
%0 = load i8 addrspace(2)* %in
@@ -167,10 +341,10 @@ entry:
}
; Load an aligned i8 value
-; R600-CHECK: @load_const_i8_aligned
+; R600-CHECK-LABEL: @load_const_i8_aligned
; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: @load_const_i8_aligned
-; SI-CHECK: BUFFER_LOAD_UBYTE VGPR{{[0-9]+}},
+; SI-CHECK-LABEL: @load_const_i8_aligned
+; SI-CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}},
define void @load_const_i8_aligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
entry:
%0 = load i8 addrspace(2)* %in
@@ -180,10 +354,10 @@ entry:
}
; Load an un-aligned i8 value
-; R600-CHECK: @load_const_i8_unaligned
+; R600-CHECK-LABEL: @load_const_i8_unaligned
; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: @load_const_i8_unaligned
-; SI-CHECK: BUFFER_LOAD_UBYTE VGPR{{[0-9]+}},
+; SI-CHECK-LABEL: @load_const_i8_unaligned
+; SI-CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}},
define void @load_const_i8_unaligned(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
entry:
%0 = getelementptr i8 addrspace(2)* %in, i32 1
@@ -194,13 +368,13 @@ entry:
}
; Load a sign-extended i16 value
-; R600-CHECK: @load_const_i16_sext
+; R600-CHECK-LABEL: @load_const_i16_sext
; R600-CHECK: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]]
; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
; R600-CHECK: 16
; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
; R600-CHECK: 16
-; SI-CHECK: @load_const_i16_sext
+; SI-CHECK-LABEL: @load_const_i16_sext
; SI-CHECK: BUFFER_LOAD_SSHORT
define void @load_const_i16_sext(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
entry:
@@ -211,9 +385,9 @@ entry:
}
; Load an aligned i16 value
-; R600-CHECK: @load_const_i16_aligned
+; R600-CHECK-LABEL: @load_const_i16_aligned
; R600-CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: @load_const_i16_aligned
+; SI-CHECK-LABEL: @load_const_i16_aligned
; SI-CHECK: BUFFER_LOAD_USHORT
define void @load_const_i16_aligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
entry:
@@ -224,9 +398,9 @@ entry:
}
; Load an un-aligned i16 value
-; R600-CHECK: @load_const_i16_unaligned
+; R600-CHECK-LABEL: @load_const_i16_unaligned
; R600-CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: @load_const_i16_unaligned
+; SI-CHECK-LABEL: @load_const_i16_unaligned
; SI-CHECK: BUFFER_LOAD_USHORT
define void @load_const_i16_unaligned(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
entry:
@@ -238,11 +412,11 @@ entry:
}
; Load an i32 value from the constant address space.
-; R600-CHECK: @load_const_addrspace_i32
+; R600-CHECK-LABEL: @load_const_addrspace_i32
; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-; SI-CHECK: @load_const_addrspace_i32
-; SI-CHECK: S_LOAD_DWORD SGPR{{[0-9]+}}
+; SI-CHECK-LABEL: @load_const_addrspace_i32
+; SI-CHECK: S_LOAD_DWORD s{{[0-9]+}}
define void @load_const_addrspace_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%0 = load i32 addrspace(2)* %in
@@ -251,14 +425,259 @@ entry:
}
; Load a f32 value from the constant address space.
-; R600-CHECK: @load_const_addrspace_f32
+; R600-CHECK-LABEL: @load_const_addrspace_f32
; R600-CHECK: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-; SI-CHECK: @load_const_addrspace_f32
-; SI-CHECK: S_LOAD_DWORD SGPR{{[0-9]+}}
+; SI-CHECK-LABEL: @load_const_addrspace_f32
+; SI-CHECK: S_LOAD_DWORD s{{[0-9]+}}
define void @load_const_addrspace_f32(float addrspace(1)* %out, float addrspace(2)* %in) {
%1 = load float addrspace(2)* %in
store float %1, float addrspace(1)* %out
ret void
}
+;===------------------------------------------------------------------------===;
+; LOCAL ADDRESS SPACE
+;===------------------------------------------------------------------------===;
+
+; Load an i8 value from the local address space.
+; R600-CHECK-LABEL: @load_i8_local
+; R600-CHECK: LDS_UBYTE_READ_RET
+; SI-CHECK-LABEL: @load_i8_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_U8
+define void @load_i8_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
+ %1 = load i8 addrspace(3)* %in
+ %2 = zext i8 %1 to i32
+ store i32 %2, i32 addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK-LABEL: @load_i8_sext_local
+; R600-CHECK: LDS_UBYTE_READ_RET
+; R600-CHECK: ASHR
+; SI-CHECK-LABEL: @load_i8_sext_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_I8
+define void @load_i8_sext_local(i32 addrspace(1)* %out, i8 addrspace(3)* %in) {
+entry:
+ %0 = load i8 addrspace(3)* %in
+ %1 = sext i8 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK-LABEL: @load_v2i8_local
+; R600-CHECK: LDS_UBYTE_READ_RET
+; R600-CHECK: LDS_UBYTE_READ_RET
+; SI-CHECK-LABEL: @load_v2i8_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_U8
+; SI-CHECK: DS_READ_U8
+define void @load_v2i8_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) {
+entry:
+ %0 = load <2 x i8> addrspace(3)* %in
+ %1 = zext <2 x i8> %0 to <2 x i32>
+ store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK-LABEL: @load_v2i8_sext_local
+; R600-CHECK-DAG: LDS_UBYTE_READ_RET
+; R600-CHECK-DAG: LDS_UBYTE_READ_RET
+; R600-CHECK-DAG: ASHR
+; R600-CHECK-DAG: ASHR
+; SI-CHECK-LABEL: @load_v2i8_sext_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_I8
+; SI-CHECK: DS_READ_I8
+define void @load_v2i8_sext_local(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(3)* %in) {
+entry:
+ %0 = load <2 x i8> addrspace(3)* %in
+ %1 = sext <2 x i8> %0 to <2 x i32>
+ store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK-LABEL: @load_v4i8_local
+; R600-CHECK: LDS_UBYTE_READ_RET
+; R600-CHECK: LDS_UBYTE_READ_RET
+; R600-CHECK: LDS_UBYTE_READ_RET
+; R600-CHECK: LDS_UBYTE_READ_RET
+; SI-CHECK-LABEL: @load_v4i8_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_U8
+; SI-CHECK: DS_READ_U8
+; SI-CHECK: DS_READ_U8
+; SI-CHECK: DS_READ_U8
+define void @load_v4i8_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) {
+entry:
+ %0 = load <4 x i8> addrspace(3)* %in
+ %1 = zext <4 x i8> %0 to <4 x i32>
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK-LABEL: @load_v4i8_sext_local
+; R600-CHECK-DAG: LDS_UBYTE_READ_RET
+; R600-CHECK-DAG: LDS_UBYTE_READ_RET
+; R600-CHECK-DAG: LDS_UBYTE_READ_RET
+; R600-CHECK-DAG: LDS_UBYTE_READ_RET
+; R600-CHECK-DAG: ASHR
+; R600-CHECK-DAG: ASHR
+; R600-CHECK-DAG: ASHR
+; R600-CHECK-DAG: ASHR
+; SI-CHECK-LABEL: @load_v4i8_sext_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_I8
+; SI-CHECK: DS_READ_I8
+; SI-CHECK: DS_READ_I8
+; SI-CHECK: DS_READ_I8
+define void @load_v4i8_sext_local(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(3)* %in) {
+entry:
+ %0 = load <4 x i8> addrspace(3)* %in
+ %1 = sext <4 x i8> %0 to <4 x i32>
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; Load an i16 value from the local address space.
+; R600-CHECK-LABEL: @load_i16_local
+; R600-CHECK: LDS_USHORT_READ_RET
+; SI-CHECK-LABEL: @load_i16_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_U16
+define void @load_i16_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) {
+entry:
+ %0 = load i16 addrspace(3)* %in
+ %1 = zext i16 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK-LABEL: @load_i16_sext_local
+; R600-CHECK: LDS_USHORT_READ_RET
+; R600-CHECK: ASHR
+; SI-CHECK-LABEL: @load_i16_sext_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_I16
+define void @load_i16_sext_local(i32 addrspace(1)* %out, i16 addrspace(3)* %in) {
+entry:
+ %0 = load i16 addrspace(3)* %in
+ %1 = sext i16 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK-LABEL: @load_v2i16_local
+; R600-CHECK: LDS_USHORT_READ_RET
+; R600-CHECK: LDS_USHORT_READ_RET
+; SI-CHECK-LABEL: @load_v2i16_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_U16
+; SI-CHECK: DS_READ_U16
+define void @load_v2i16_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) {
+entry:
+ %0 = load <2 x i16> addrspace(3)* %in
+ %1 = zext <2 x i16> %0 to <2 x i32>
+ store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK-LABEL: @load_v2i16_sext_local
+; R600-CHECK-DAG: LDS_USHORT_READ_RET
+; R600-CHECK-DAG: LDS_USHORT_READ_RET
+; R600-CHECK-DAG: ASHR
+; R600-CHECK-DAG: ASHR
+; SI-CHECK-LABEL: @load_v2i16_sext_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_I16
+; SI-CHECK: DS_READ_I16
+define void @load_v2i16_sext_local(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(3)* %in) {
+entry:
+ %0 = load <2 x i16> addrspace(3)* %in
+ %1 = sext <2 x i16> %0 to <2 x i32>
+ store <2 x i32> %1, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK-LABEL: @load_v4i16_local
+; R600-CHECK: LDS_USHORT_READ_RET
+; R600-CHECK: LDS_USHORT_READ_RET
+; R600-CHECK: LDS_USHORT_READ_RET
+; R600-CHECK: LDS_USHORT_READ_RET
+; SI-CHECK-LABEL: @load_v4i16_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_U16
+; SI-CHECK: DS_READ_U16
+; SI-CHECK: DS_READ_U16
+; SI-CHECK: DS_READ_U16
+define void @load_v4i16_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) {
+entry:
+ %0 = load <4 x i16> addrspace(3)* %in
+ %1 = zext <4 x i16> %0 to <4 x i32>
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; R600-CHECK-LABEL: @load_v4i16_sext_local
+; R600-CHECK-DAG: LDS_USHORT_READ_RET
+; R600-CHECK-DAG: LDS_USHORT_READ_RET
+; R600-CHECK-DAG: LDS_USHORT_READ_RET
+; R600-CHECK-DAG: LDS_USHORT_READ_RET
+; R600-CHECK-DAG: ASHR
+; R600-CHECK-DAG: ASHR
+; R600-CHECK-DAG: ASHR
+; R600-CHECK-DAG: ASHR
+; SI-CHECK-LABEL: @load_v4i16_sext_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_I16
+; SI-CHECK: DS_READ_I16
+; SI-CHECK: DS_READ_I16
+; SI-CHECK: DS_READ_I16
+define void @load_v4i16_sext_local(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(3)* %in) {
+entry:
+ %0 = load <4 x i16> addrspace(3)* %in
+ %1 = sext <4 x i16> %0 to <4 x i32>
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+; Load an i32 value from the local address space.
+; R600-CHECK-LABEL: @load_i32_local
+; R600-CHECK: LDS_READ_RET
+; SI-CHECK-LABEL: @load_i32_local
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_READ_B32
+define void @load_i32_local(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+entry:
+ %0 = load i32 addrspace(3)* %in
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
+
+; Load a f32 value from the local address space.
+; R600-CHECK-LABEL: @load_f32_local
+; R600-CHECK: LDS_READ_RET
+; SI-CHECK-LABEL: @load_f32_local
+; SI-CHECK: DS_READ_B32
+define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) {
+entry:
+ %0 = load float addrspace(3)* %in
+ store float %0, float addrspace(1)* %out
+ ret void
+}
+
+; load a v2f32 value from the local address space
+; R600-CHECK-LABEL: @load_v2f32_local
+; R600-CHECK: LDS_READ_RET
+; R600-CHECK: LDS_READ_RET
+; SI-CHECK-LABEL: @load_v2f32_local
+; SI-CHECK: DS_READ_B32
+; SI-CHECK: DS_READ_B32
+define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) {
+entry:
+ %0 = load <2 x float> addrspace(3)* %in
+ store <2 x float> %0, <2 x float> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/load.vec.ll b/test/CodeGen/R600/load.vec.ll
index 8cba0b6..81a6310 100644
--- a/test/CodeGen/R600/load.vec.ll
+++ b/test/CodeGen/R600/load.vec.ll
@@ -1,11 +1,11 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
; load a v2i32 value from the global address space.
; EG-CHECK: @load_v2i32
; EG-CHECK: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0
; SI-CHECK: @load_v2i32
-; SI-CHECK: BUFFER_LOAD_DWORDX2 VGPR{{[0-9]+}}
+; SI-CHECK: BUFFER_LOAD_DWORDX2 v[{{[0-9]+:[0-9]+}}]
define void @load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%a = load <2 x i32> addrspace(1) * %in
store <2 x i32> %a, <2 x i32> addrspace(1)* %out
@@ -16,7 +16,7 @@ define void @load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
; EG-CHECK: @load_v4i32
; EG-CHECK: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0
; SI-CHECK: @load_v4i32
-; SI-CHECK: BUFFER_LOAD_DWORDX4 VGPR{{[0-9]+}}
+; SI-CHECK: BUFFER_LOAD_DWORDX4 v[{{[0-9]+:[0-9]+}}]
define void @load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%a = load <4 x i32> addrspace(1) * %in
store <4 x i32> %a, <4 x i32> addrspace(1)* %out
diff --git a/test/CodeGen/R600/load64.ll b/test/CodeGen/R600/load64.ll
index 3b4a8f8..e351e41 100644
--- a/test/CodeGen/R600/load64.ll
+++ b/test/CodeGen/R600/load64.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -march=r600 -mcpu=tahiti | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s
; load a f64 value from the global address space.
; CHECK: @load_f64
-; CHECK: BUFFER_LOAD_DWORDX2 VGPR{{[0-9]+}}
+; CHECK: BUFFER_LOAD_DWORDX2 v[{{[0-9]+:[0-9]+}}]
define void @load_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
entry:
%0 = load double addrspace(1)* %in
@@ -12,7 +12,7 @@ entry:
; Load a f64 value from the constant address space.
; CHECK: @load_const_addrspace_f64
-; CHECK: S_LOAD_DWORDX2 SGPR{{[0-9]+}}
+; CHECK: S_LOAD_DWORDX2 s[{{[0-9]+:[0-9]+}}]
define void @load_const_addrspace_f64(double addrspace(1)* %out, double addrspace(2)* %in) {
%1 = load double addrspace(2)* %in
store double %1, double addrspace(1)* %out
diff --git a/test/CodeGen/R600/local-memory-two-objects.ll b/test/CodeGen/R600/local-memory-two-objects.ll
index 6d3610e..e2d8406 100644
--- a/test/CodeGen/R600/local-memory-two-objects.ll
+++ b/test/CodeGen/R600/local-memory-two-objects.ll
@@ -1,27 +1,34 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
-
-; TODO: Add RUN and CHECK lines for SI once this test works there
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
@local_memory_two_objects.local_mem0 = internal addrspace(3) unnamed_addr global [4 x i32] zeroinitializer, align 4
@local_memory_two_objects.local_mem1 = internal addrspace(3) unnamed_addr global [4 x i32] zeroinitializer, align 4
-; CHECK: @local_memory_two_objects
+; EG-CHECK: @local_memory_two_objects
; Check that the LDS size is emitted correctly
-; CHECK: .long 166120
-; CHECK-NEXT: .long 8
-
-; Make sure the lds writes are using different addresses.
-; CHECK: LDS_WRITE {{[*]*}} {{PV|T}}[[ADDRW:[0-9]*\.[XYZW]]]
-; CHECK-NOT: LDS_WRITE {{[*]*}} T[[ADDRW]]
+; EG-CHECK: .long 166120
+; EG-CHECK-NEXT: .long 8
+; SI-CHECK: .long 47180
+; SI-CHECK-NEXT: .long 32768
+
+; We would like to check that the LDS writes are using different
+; addresses, but due to variations in the scheduler, we can't do
+; this consistently on evergreen GPUs.
+; EG-CHECK: LDS_WRITE
+; EG-CHECK: LDS_WRITE
+; SI-CHECK: DS_WRITE_B32 0, {{v[0-9]*}}, v[[ADDRW:[0-9]*]]
+; SI-CHECK-NOT: DS_WRITE_B32 0, {{v[0-9]*}}, v[[ADDRW]]
; GROUP_BARRIER must be the last instruction in a clause
-; CHECK: GROUP_BARRIER
-; CHECK-NEXT: ALU clause
+; EG-CHECK: GROUP_BARRIER
+; EG-CHECK-NEXT: ALU clause
; Make sure the lds reads are using different addresses.
-; CHECK: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
-; CHECK-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
+; EG-CHECK: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
+; EG-CHECK-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
+; SI-CHECK: DS_READ_B32 {{v[0-9]+}}, 0, [[ADDRR:v[0-9]+]]
+; SI-CHECK-NOT: DS_READ_B32 {{v[0-9]+}}, 0, [[ADDRR]]
define void @local_memory_two_objects(i32 addrspace(1)* %out) {
entry:
diff --git a/test/CodeGen/R600/local-memory.ll b/test/CodeGen/R600/local-memory.ll
index 5458fb9..2168a3d 100644
--- a/test/CodeGen/R600/local-memory.ll
+++ b/test/CodeGen/R600/local-memory.ll
@@ -1,19 +1,24 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=bonaire -verify-machineinstrs | FileCheck --check-prefix=CI-CHECK %s
-@local_memory.local_mem = internal addrspace(3) unnamed_addr global [16 x i32] zeroinitializer, align 4
+@local_memory.local_mem = internal addrspace(3) unnamed_addr global [128 x i32] zeroinitializer, align 4
-; EG-CHECK: @local_memory
-; SI-CHECK: @local_memory
+; EG-CHECK-LABEL: @local_memory
+; SI-CHECK-LABEL: @local_memory
+; CI-CHECK-LABEL: @local_memory
; Check that the LDS size is emitted correctly
; EG-CHECK: .long 166120
-; EG-CHECK-NEXT: .long 16
+; EG-CHECK-NEXT: .long 128
; SI-CHECK: .long 47180
-; SI-CHECK-NEXT: .long 32768
+; SI-CHECK-NEXT: .long 65536
+; CI-CHECK: .long 47180
+; CI-CHECK-NEXT: .long 32768
; EG-CHECK: LDS_WRITE
-; SI-CHECK: DS_WRITE_B32
+; SI-CHECK-NOT: S_WQM_B64
+; SI-CHECK: DS_WRITE_B32 0
; GROUP_BARRIER must be the last instruction in a clause
; EG-CHECK: GROUP_BARRIER
@@ -21,18 +26,18 @@
; SI-CHECK: S_BARRIER
; EG-CHECK: LDS_READ_RET
-; SI-CHECK: DS_READ_B32
+; SI-CHECK: DS_READ_B32 {{v[0-9]+}}, 0
define void @local_memory(i32 addrspace(1)* %out) {
entry:
%y.i = call i32 @llvm.r600.read.tidig.x() #0
- %arrayidx = getelementptr inbounds [16 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
+ %arrayidx = getelementptr inbounds [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
%add = add nsw i32 %y.i, 1
%cmp = icmp eq i32 %add, 16
%.add = select i1 %cmp, i32 0, i32 %add
call void @llvm.AMDGPU.barrier.local()
- %arrayidx1 = getelementptr inbounds [16 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
+ %arrayidx1 = getelementptr inbounds [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
%0 = load i32 addrspace(3)* %arrayidx1, align 4
%arrayidx2 = getelementptr inbounds i32 addrspace(1)* %out, i32 %y.i
store i32 %0, i32 addrspace(1)* %arrayidx2, align 4
diff --git a/test/CodeGen/R600/lshl.ll b/test/CodeGen/R600/lshl.ll
index 806e681..2162839 100644
--- a/test/CodeGen/R600/lshl.ll
+++ b/test/CodeGen/R600/lshl.ll
@@ -1,6 +1,6 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
-;CHECK: V_LSHL_B32_e64 VGPR{{[0-9]}}, SGPR{{[0-9]}}, 1
+;CHECK: S_LSHL_B32 s{{[0-9]}}, s{{[0-9]}}, 1
define void @test(i32 %p) {
%i = mul i32 %p, 2
diff --git a/test/CodeGen/R600/lshr.ll b/test/CodeGen/R600/lshr.ll
index cfbcc34..886d1c4 100644
--- a/test/CodeGen/R600/lshr.ll
+++ b/test/CodeGen/R600/lshr.ll
@@ -1,6 +1,6 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
-;CHECK: V_LSHR_B32_e64 {{VGPR[0-9]}}, SGPR{{[0-9]}}, 1
+;CHECK: S_LSHR_B32 s{{[0-9]}}, s{{[0-9]}}, 1
define void @test(i32 %p) {
%i = udiv i32 %p, 2
diff --git a/test/CodeGen/R600/mad_int24.ll b/test/CodeGen/R600/mad_int24.ll
index ce42ae7..df063ec 100644
--- a/test/CodeGen/R600/mad_int24.ll
+++ b/test/CodeGen/R600/mad_int24.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
; EG-CHECK: @i32_mad24
; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
diff --git a/test/CodeGen/R600/mad_uint24.ll b/test/CodeGen/R600/mad_uint24.ll
index 00aa64a..66a070e 100644
--- a/test/CodeGen/R600/mad_uint24.ll
+++ b/test/CodeGen/R600/mad_uint24.ll
@@ -1,10 +1,10 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
-; EG-CHECK: @u32_mad24
+; EG-CHECK-LABEL: @u32_mad24
; EG-CHECK: MULADD_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W, KC0[3].X
-; SI-CHECK: @u32_mad24
+; SI-CHECK-LABEL: @u32_mad24
; SI-CHECK: V_MAD_U32_U24
define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
@@ -19,7 +19,7 @@ entry:
ret void
}
-; EG-CHECK: @i16_mad24
+; EG-CHECK-LABEL: @i16_mad24
; EG-CHECK-DAG: VTX_READ_16 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
; EG-CHECK-DAG: VTX_READ_16 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
; EG-CHECK-DAG: VTX_READ_16 [[C:T[0-9]\.X]], T{{[0-9]}}.X, 48
@@ -30,10 +30,10 @@ entry:
; EG-CHECK: 16
; EG-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]], literal.x
; EG-CHECK: 16
-; SI-CHECK: @i16_mad24
-; SI-CHECK: V_MAD_U32_U24 [[MAD:VGPR[0-9]]], {{[SV]GPR[0-9], [SV]GPR[0-9]}}
-; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:VGPR[0-9]]], 16, [[MAD]]
-; SI-CHECK: V_ASHRREV_I32_e32 VGPR{{[0-9]}}, 16, [[LSHL]]
+; SI-CHECK-LABEL: @i16_mad24
+; SI-CHECK: V_MAD_U32_U24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:v[0-9]]], 16, [[MAD]]
+; SI-CHECK: V_ASHRREV_I32_e32 v{{[0-9]}}, 16, [[LSHL]]
define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
entry:
@@ -44,7 +44,7 @@ entry:
ret void
}
-; EG-CHECK: @i8_mad24
+; EG-CHECK-LABEL: @i8_mad24
; EG-CHECK-DAG: VTX_READ_8 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
; EG-CHECK-DAG: VTX_READ_8 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
; EG-CHECK-DAG: VTX_READ_8 [[C:T[0-9]\.X]], T{{[0-9]}}.X, 48
@@ -55,10 +55,10 @@ entry:
; EG-CHECK: 24
; EG-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]], literal.x
; EG-CHECK: 24
-; SI-CHECK: @i8_mad24
-; SI-CHECK: V_MAD_U32_U24 [[MUL:VGPR[0-9]]], {{[SV]GPR[0-9], [SV]GPR[0-9]}}
-; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:VGPR[0-9]]], 24, [[MUL]]
-; SI-CHECK: V_ASHRREV_I32_e32 VGPR{{[0-9]}}, 24, [[LSHL]]
+; SI-CHECK-LABEL: @i8_mad24
+; SI-CHECK: V_MAD_U32_U24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:v[0-9]]], 24, [[MUL]]
+; SI-CHECK: V_ASHRREV_I32_e32 v{{[0-9]}}, 24, [[LSHL]]
define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
entry:
diff --git a/test/CodeGen/R600/max-literals.ll b/test/CodeGen/R600/max-literals.ll
index c31b7c0..65a6d2b 100644
--- a/test/CodeGen/R600/max-literals.ll
+++ b/test/CodeGen/R600/max-literals.ll
@@ -3,13 +3,13 @@
; CHECK: @main
; CHECK: ADD *
-define void @main() #0 {
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
main_body:
- %0 = call float @llvm.R600.load.input(i32 4)
- %1 = call float @llvm.R600.load.input(i32 5)
- %2 = call float @llvm.R600.load.input(i32 6)
- %3 = call float @llvm.R600.load.input(i32 7)
- %4 = call float @llvm.R600.load.input(i32 8)
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 1
+ %2 = extractelement <4 x float> %reg1, i32 2
+ %3 = extractelement <4 x float> %reg1, i32 3
+ %4 = extractelement <4 x float> %reg2, i32 0
%5 = fadd float %0, 2.0
%6 = fadd float %1, 3.0
%7 = fadd float %2, 4.0
@@ -32,13 +32,13 @@ main_body:
; CHECK: @main
; CHECK-NOT: ADD *
-define void @main2() #0 {
+define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
main_body:
- %0 = call float @llvm.R600.load.input(i32 4)
- %1 = call float @llvm.R600.load.input(i32 5)
- %2 = call float @llvm.R600.load.input(i32 6)
- %3 = call float @llvm.R600.load.input(i32 7)
- %4 = call float @llvm.R600.load.input(i32 8)
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 1
+ %2 = extractelement <4 x float> %reg1, i32 2
+ %3 = extractelement <4 x float> %reg1, i32 3
+ %4 = extractelement <4 x float> %reg2, i32 0
%5 = fadd float %0, 2.0
%6 = fadd float %1, 3.0
%7 = fadd float %2, 4.0
@@ -59,7 +59,6 @@ main_body:
}
; Function Attrs: readnone
-declare float @llvm.R600.load.input(i32) #1
declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/R600/mul.ll b/test/CodeGen/R600/mul.ll
index 18a17b6..8c27e28 100644
--- a/test/CodeGen/R600/mul.ll
+++ b/test/CodeGen/R600/mul.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
; mul24 and mad24 are affected
@@ -8,8 +8,8 @@
;EG-CHECK: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;SI-CHECK: @test2
-;SI-CHECK: V_MUL_LO_I32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_MUL_LO_I32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -27,10 +27,10 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
;EG-CHECK: MULLO_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;SI-CHECK: @test4
-;SI-CHECK: V_MUL_LO_I32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_MUL_LO_I32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_MUL_LO_I32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_MUL_LO_I32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_MUL_LO_I32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/R600/mul_int24.ll b/test/CodeGen/R600/mul_int24.ll
index 16ae760..66a1a9e 100644
--- a/test/CodeGen/R600/mul_int24.ll
+++ b/test/CodeGen/R600/mul_int24.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
; EG-CHECK: @i32_mul24
; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
diff --git a/test/CodeGen/R600/mul_uint24.ll b/test/CodeGen/R600/mul_uint24.ll
index b1a7f94..6e6d549 100644
--- a/test/CodeGen/R600/mul_uint24.ll
+++ b/test/CodeGen/R600/mul_uint24.ll
@@ -1,10 +1,10 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
-; EG-CHECK: @u32_mul24
+; EG-CHECK-LABEL: @u32_mul24
; EG-CHECK: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
-; SI-CHECK: @u32_mul24
+; SI-CHECK-LABEL: @u32_mul24
; SI-CHECK: V_MUL_U32_U24
define void @u32_mul24(i32 addrspace(1)* %out, i32 %a, i32 %b) {
@@ -18,7 +18,7 @@ entry:
ret void
}
-; EG-CHECK: @i16_mul24
+; EG-CHECK-LABEL: @i16_mul24
; EG-CHECK-DAG: VTX_READ_16 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
; EG-CHECK-DAG: VTX_READ_16 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
; The order of A and B does not matter.
@@ -28,10 +28,10 @@ entry:
; EG-CHECK: 16
; EG-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]], literal.x
; EG-CHECK: 16
-; SI-CHECK: @i16_mul24
-; SI-CHECK: V_MUL_U32_U24_e{{(32|64)}} [[MUL:VGPR[0-9]]], {{[SV]GPR[0-9], [SV]GPR[0-9]}}
-; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:VGPR[0-9]]], 16, [[MUL]]
-; SI-CHECK: V_ASHRREV_I32_e32 VGPR{{[0-9]}}, 16, [[LSHL]]
+; SI-CHECK-LABEL: @i16_mul24
+; SI-CHECK: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:v[0-9]]], 16, [[MUL]]
+; SI-CHECK: V_ASHRREV_I32_e32 v{{[0-9]}}, 16, [[LSHL]]
define void @i16_mul24(i32 addrspace(1)* %out, i16 %a, i16 %b) {
entry:
@@ -41,7 +41,7 @@ entry:
ret void
}
-; EG-CHECK: @i8_mul24
+; EG-CHECK-LABEL: @i8_mul24
; EG-CHECK-DAG: VTX_READ_8 [[A:T[0-9]\.X]], T{{[0-9]}}.X, 40
; EG-CHECK-DAG: VTX_READ_8 [[B:T[0-9]\.X]], T{{[0-9]}}.X, 44
; The order of A and B does not matter.
@@ -51,10 +51,10 @@ entry:
; EG-CHECK: 24
; EG-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]], literal.x
; EG-CHECK: 24
-; SI-CHECK: @i8_mul24
-; SI-CHECK: V_MUL_U32_U24_e{{(32|64)}} [[MUL:VGPR[0-9]]], {{[SV]GPR[0-9], [SV]GPR[0-9]}}
-; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:VGPR[0-9]]], 24, [[MUL]]
-; SI-CHECK: V_ASHRREV_I32_e32 VGPR{{[0-9]}}, 24, [[LSHL]]
+; SI-CHECK-LABEL: @i8_mul24
+; SI-CHECK: V_MUL_U32_U24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; SI-CHECK: V_LSHLREV_B32_e32 [[LSHL:v[0-9]]], 24, [[MUL]]
+; SI-CHECK: V_ASHRREV_I32_e32 v{{[0-9]}}, 24, [[LSHL]]
define void @i8_mul24(i32 addrspace(1)* %out, i8 %a, i8 %b) {
entry:
diff --git a/test/CodeGen/R600/mulhu.ll b/test/CodeGen/R600/mulhu.ll
index eb379d1..d5fc014 100644
--- a/test/CodeGen/R600/mulhu.ll
+++ b/test/CodeGen/R600/mulhu.ll
@@ -1,8 +1,8 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
-;CHECK: V_MOV_B32_e32 VGPR{{[0-9]+}}, -1431655765
-;CHECK: V_MUL_HI_U32 VGPR0, {{[SV]GPR[0-9]+}}, {{VGPR[0-9]+}}
-;CHECK-NEXT: V_LSHRREV_B32_e32 VGPR0, 1, VGPR0
+;CHECK: V_MOV_B32_e32 v{{[0-9]+}}, -1431655765
+;CHECK: V_MUL_HI_U32 v0, {{[sv][0-9]+}}, {{v[0-9]+}}
+;CHECK-NEXT: V_LSHRREV_B32_e32 v0, 1, v0
define void @test(i32 %p) {
%i = udiv i32 %p, 3
diff --git a/test/CodeGen/R600/or.ll b/test/CodeGen/R600/or.ll
index 4a4e892..35d23b3 100644
--- a/test/CodeGen/R600/or.ll
+++ b/test/CodeGen/R600/or.ll
@@ -1,13 +1,13 @@
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
-; EG-CHECK: @or_v2i32
+; EG-CHECK-LABEL: @or_v2i32
; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI-CHECK: @or_v2i32
-;SI-CHECK: V_OR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_OR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK-LABEL: @or_v2i32
+;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -18,17 +18,17 @@ define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in)
ret void
}
-; EG-CHECK: @or_v4i32
+; EG-CHECK-LABEL: @or_v4i32
; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; EG-CHECK: OR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI-CHECK: @or_v4i32
-;SI-CHECK: V_OR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_OR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_OR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_OR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK-LABEL: @or_v4i32
+;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_OR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -38,3 +38,16 @@ define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in)
store <4 x i32> %result, <4 x i32> addrspace(1)* %out
ret void
}
+
+; EG-CHECK-LABEL: @or_i64
+; EG-CHECK-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
+; EG-CHECK-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
+; SI-CHECK-LABEL: @or_i64
+; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}
+; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}
+define void @or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = or i64 %a, %b
+ store i64 %0, i64 addrspace(1)* %out
+ ret void
+}
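For reference, a hand-expanded form of or_i64 shows why two 32-bit ORs are the expected lowering once the i64 operation is legalized. This is only an illustrative sketch; the @or_i64_expanded name and the explicit split/merge are hypothetical and not part of the test:

  define void @or_i64_expanded(i64 addrspace(1)* %out, i64 %a, i64 %b) {
  entry:
    ; Split each 64-bit operand into 32-bit halves.
    %a.lo = trunc i64 %a to i32
    %b.lo = trunc i64 %b to i32
    %a.hi.64 = lshr i64 %a, 32
    %b.hi.64 = lshr i64 %b, 32
    %a.hi = trunc i64 %a.hi.64 to i32
    %b.hi = trunc i64 %b.hi.64 to i32
    ; One 32-bit OR per half; these correspond to the two OR_INT (EG) and
    ; two V_OR_B32_e32 (SI) instructions the checks above look for.
    %lo = or i32 %a.lo, %b.lo
    %hi = or i32 %a.hi, %b.hi
    ; Merge the disjoint halves back into an i64 and store it.
    %lo.64 = zext i32 %lo to i64
    %hi.64 = zext i32 %hi to i64
    %hi.shifted = shl i64 %hi.64, 32
    %res = or i64 %lo.64, %hi.shifted
    store i64 %res, i64 addrspace(1)* %out
    ret void
  }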
diff --git a/test/CodeGen/R600/predicate-dp4.ll b/test/CodeGen/R600/predicate-dp4.ll
new file mode 100644
index 0000000..e48d6a7
--- /dev/null
+++ b/test/CodeGen/R600/predicate-dp4.ll
@@ -0,0 +1,27 @@
+;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s
+
+; CHECK-LABEL: @main
+; CHECK: PRED_SETE_INT * Pred,
+; CHECK: DOT4 T{{[0-9]+}}.X, T0.X, T0.X, Pred_sel_one
+define void @main(<4 x float> inreg) #0 {
+main_body:
+ %1 = extractelement <4 x float> %0, i32 0
+ %2 = bitcast float %1 to i32
+ %3 = icmp eq i32 %2, 0
+ br i1 %3, label %IF, label %ENDIF
+
+IF: ; preds = %main_body
+ %4 = call float @llvm.AMDGPU.dp4(<4 x float> %0, <4 x float> %0)
+ br label %ENDIF
+
+ENDIF: ; preds = %IF, %main_body
+ %5 = phi float [%4, %IF], [0.000000e+00, %main_body]
+ %6 = insertelement <4 x float> undef, float %5, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %6, i32 0, i32 0)
+ ret void
+}
+
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+attributes #1 = { readnone }
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/predicates.ll b/test/CodeGen/R600/predicates.ll
index 0d3eeef..902508f 100644
--- a/test/CodeGen/R600/predicates.ll
+++ b/test/CodeGen/R600/predicates.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mattr=disable-irstructurizer -mcpu=redwood | FileCheck %s
; These tests make sure the compiler is optimizing branches using predicates
; when it is legal to do so.
diff --git a/test/CodeGen/R600/indirect-addressing.ll b/test/CodeGen/R600/private-memory.ll
index bd72cd9..48a013c 100644
--- a/test/CodeGen/R600/indirect-addressing.ll
+++ b/test/CodeGen/R600/private-memory.ll
@@ -1,16 +1,24 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
; This test checks that uses and defs of the AR register happen in the same
; instruction clause.
-; CHECK: @mova_same_clause
-; CHECK: MOVA_INT
-; CHECK-NOT: ALU clause
-; CHECK: 0 + AR.x
-; CHECK: MOVA_INT
-; CHECK-NOT: ALU clause
-; CHECK: 0 + AR.x
+; R600-CHECK-LABEL: @mova_same_clause
+; R600-CHECK: MOVA_INT
+; R600-CHECK-NOT: ALU clause
+; R600-CHECK: 0 + AR.x
+; R600-CHECK: MOVA_INT
+; R600-CHECK-NOT: ALU clause
+; R600-CHECK: 0 + AR.x
+; SI-CHECK-LABEL: @mova_same_clause
+; SI-CHECK: V_READFIRSTLANE
+; SI-CHECK: V_MOVRELD
+; SI-CHECK: S_CBRANCH
+; SI-CHECK: V_READFIRSTLANE
+; SI-CHECK: V_MOVRELD
+; SI-CHECK: S_CBRANCH
define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
entry:
%stack = alloca [5 x i32], align 4
@@ -38,9 +46,10 @@ entry:
; XXX: This generated code has unnecessary MOVs, we should be able to optimize
; this.
-; CHECK: @multiple_structs
-; CHECK-NOT: MOVA_INT
-
+; R600-CHECK-LABEL: @multiple_structs
+; R600-CHECK-NOT: MOVA_INT
+; SI-CHECK-LABEL: @multiple_structs
+; SI-CHECK-NOT: V_MOVREL
%struct.point = type { i32, i32 }
define void @multiple_structs(i32 addrspace(1)* %out) {
@@ -63,3 +72,44 @@ entry:
store i32 %0, i32 addrspace(1)* %out
ret void
}
+
+; Test direct access of a private array inside a loop. The private array
+; loads and stores should be lowered to copies, so there shouldn't be any
+; MOVA instructions.
+
+; R600-CHECK-LABEL: @direct_loop
+; R600-CHECK-NOT: MOVA_INT
+; SI-CHECK-LABEL: @direct_loop
+; SI-CHECK-NOT: V_MOVREL
+
+define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+ %prv_array_const = alloca [2 x i32]
+ %prv_array = alloca [2 x i32]
+ %a = load i32 addrspace(1)* %in
+ %b_src_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+ %b = load i32 addrspace(1)* %b_src_ptr
+ %a_dst_ptr = getelementptr [2 x i32]* %prv_array_const, i32 0, i32 0
+ store i32 %a, i32* %a_dst_ptr
+ %b_dst_ptr = getelementptr [2 x i32]* %prv_array_const, i32 0, i32 1
+ store i32 %b, i32* %b_dst_ptr
+ br label %for.body
+
+for.body:
+ %inc = phi i32 [0, %entry], [%count, %for.body]
+ %x_ptr = getelementptr [2 x i32]* %prv_array_const, i32 0, i32 0
+ %x = load i32* %x_ptr
+ %y_ptr = getelementptr [2 x i32]* %prv_array, i32 0, i32 0
+ %y = load i32* %y_ptr
+ %xy = add i32 %x, %y
+ store i32 %xy, i32* %y_ptr
+ %count = add i32 %inc, 1
+ %done = icmp eq i32 %count, 4095
+ br i1 %done, label %for.end, label %for.body
+
+for.end:
+ %value_ptr = getelementptr [2 x i32]* %prv_array, i32 0, i32 0
+ %value = load i32* %value_ptr
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
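+
+; A minimal sketch of the distinction the checks above rely on (this function
+; and its %idx argument are hypothetical, not taken from the tests): a variable
+; index into a private (alloca) array needs indirect addressing, while a
+; constant index is resolved at compile time and becomes plain register copies.
+;
+;   define void @private_index_sketch(i32 addrspace(1)* %out, i32 %idx) {
+;     %stack = alloca [5 x i32], align 4
+;     ; variable index: needs indirect addressing (MOVA_INT on R600, V_MOVREL* on SI)
+;     %p0 = getelementptr [5 x i32]* %stack, i32 0, i32 %idx
+;     %v0 = load i32* %p0
+;     ; constant index, as in @direct_loop: no MOVA / V_MOVREL expected
+;     %p1 = getelementptr [5 x i32]* %stack, i32 0, i32 1
+;     %v1 = load i32* %p1
+;     %sum = add i32 %v0, %v1
+;     store i32 %sum, i32 addrspace(1)* %out
+;     ret void
+;   }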
diff --git a/test/CodeGen/R600/pv-packing.ll b/test/CodeGen/R600/pv-packing.ll
index 03fc204..e5615b9 100644
--- a/test/CodeGen/R600/pv-packing.ll
+++ b/test/CodeGen/R600/pv-packing.ll
@@ -3,17 +3,17 @@
;CHECK: DOT4 T{{[0-9]\.X}}
;CHECK: MULADD_IEEE * T{{[0-9]\.W}}
-define void @main() #0 {
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 {
main_body:
- %0 = call float @llvm.R600.load.input(i32 4)
- %1 = call float @llvm.R600.load.input(i32 5)
- %2 = call float @llvm.R600.load.input(i32 6)
- %3 = call float @llvm.R600.load.input(i32 8)
- %4 = call float @llvm.R600.load.input(i32 9)
- %5 = call float @llvm.R600.load.input(i32 10)
- %6 = call float @llvm.R600.load.input(i32 12)
- %7 = call float @llvm.R600.load.input(i32 13)
- %8 = call float @llvm.R600.load.input(i32 14)
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 1
+ %2 = extractelement <4 x float> %reg1, i32 2
+ %3 = extractelement <4 x float> %reg2, i32 0
+ %4 = extractelement <4 x float> %reg2, i32 1
+ %5 = extractelement <4 x float> %reg2, i32 2
+ %6 = extractelement <4 x float> %reg3, i32 0
+ %7 = extractelement <4 x float> %reg3, i32 1
+ %8 = extractelement <4 x float> %reg3, i32 2
%9 = load <4 x float> addrspace(8)* null
%10 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
%11 = call float @llvm.AMDGPU.dp4(<4 x float> %9, <4 x float> %9)
@@ -36,9 +36,6 @@ main_body:
}
; Function Attrs: readnone
-declare float @llvm.R600.load.input(i32) #1
-
-; Function Attrs: readnone
declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
@@ -46,5 +43,3 @@ declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
attributes #0 = { "ShaderType"="1" }
attributes #1 = { readnone }
-attributes #2 = { readonly }
-attributes #3 = { nounwind readonly }
diff --git a/test/CodeGen/R600/pv.ll b/test/CodeGen/R600/pv.ll
index 6e0b744..5a930b2 100644
--- a/test/CodeGen/R600/pv.ll
+++ b/test/CodeGen/R600/pv.ll
@@ -1,38 +1,38 @@
; RUN: llc < %s -march=r600 | FileCheck %s
;CHECK: DOT4 * T{{[0-9]\.W}} (MASKED)
-;CHECK: CNDGE T{{[0-9].[XYZW]}}, PV.X
+;CHECK: MAX T{{[0-9].[XYZW]}}, 0.0, PV.X
-define void @main() #0 {
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) #0 {
main_body:
- %0 = call float @llvm.R600.load.input(i32 4)
- %1 = call float @llvm.R600.load.input(i32 5)
- %2 = call float @llvm.R600.load.input(i32 6)
- %3 = call float @llvm.R600.load.input(i32 7)
- %4 = call float @llvm.R600.load.input(i32 8)
- %5 = call float @llvm.R600.load.input(i32 9)
- %6 = call float @llvm.R600.load.input(i32 10)
- %7 = call float @llvm.R600.load.input(i32 11)
- %8 = call float @llvm.R600.load.input(i32 12)
- %9 = call float @llvm.R600.load.input(i32 13)
- %10 = call float @llvm.R600.load.input(i32 14)
- %11 = call float @llvm.R600.load.input(i32 15)
- %12 = call float @llvm.R600.load.input(i32 16)
- %13 = call float @llvm.R600.load.input(i32 17)
- %14 = call float @llvm.R600.load.input(i32 18)
- %15 = call float @llvm.R600.load.input(i32 19)
- %16 = call float @llvm.R600.load.input(i32 20)
- %17 = call float @llvm.R600.load.input(i32 21)
- %18 = call float @llvm.R600.load.input(i32 22)
- %19 = call float @llvm.R600.load.input(i32 23)
- %20 = call float @llvm.R600.load.input(i32 24)
- %21 = call float @llvm.R600.load.input(i32 25)
- %22 = call float @llvm.R600.load.input(i32 26)
- %23 = call float @llvm.R600.load.input(i32 27)
- %24 = call float @llvm.R600.load.input(i32 28)
- %25 = call float @llvm.R600.load.input(i32 29)
- %26 = call float @llvm.R600.load.input(i32 30)
- %27 = call float @llvm.R600.load.input(i32 31)
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 1
+ %2 = extractelement <4 x float> %reg1, i32 2
+ %3 = extractelement <4 x float> %reg1, i32 3
+ %4 = extractelement <4 x float> %reg2, i32 0
+ %5 = extractelement <4 x float> %reg2, i32 1
+ %6 = extractelement <4 x float> %reg2, i32 2
+ %7 = extractelement <4 x float> %reg2, i32 3
+ %8 = extractelement <4 x float> %reg3, i32 0
+ %9 = extractelement <4 x float> %reg3, i32 1
+ %10 = extractelement <4 x float> %reg3, i32 2
+ %11 = extractelement <4 x float> %reg3, i32 3
+ %12 = extractelement <4 x float> %reg4, i32 0
+ %13 = extractelement <4 x float> %reg4, i32 1
+ %14 = extractelement <4 x float> %reg4, i32 2
+ %15 = extractelement <4 x float> %reg4, i32 3
+ %16 = extractelement <4 x float> %reg5, i32 0
+ %17 = extractelement <4 x float> %reg5, i32 1
+ %18 = extractelement <4 x float> %reg5, i32 2
+ %19 = extractelement <4 x float> %reg5, i32 3
+ %20 = extractelement <4 x float> %reg6, i32 0
+ %21 = extractelement <4 x float> %reg6, i32 1
+ %22 = extractelement <4 x float> %reg6, i32 2
+ %23 = extractelement <4 x float> %reg6, i32 3
+ %24 = extractelement <4 x float> %reg7, i32 0
+ %25 = extractelement <4 x float> %reg7, i32 1
+ %26 = extractelement <4 x float> %reg7, i32 2
+ %27 = extractelement <4 x float> %reg7, i32 3
%28 = load <4 x float> addrspace(8)* null
%29 = extractelement <4 x float> %28, i32 0
%30 = fmul float %0, %29
@@ -219,9 +219,6 @@ main_body:
}
; Function Attrs: readnone
-declare float @llvm.R600.load.input(i32) #1
-
-; Function Attrs: readnone
declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
; Function Attrs: readonly
diff --git a/test/CodeGen/R600/r600-encoding.ll b/test/CodeGen/R600/r600-encoding.ll
index 6ef3c31..b760c88 100644
--- a/test/CodeGen/R600/r600-encoding.ll
+++ b/test/CodeGen/R600/r600-encoding.ll
@@ -10,15 +10,16 @@
; R600-CHECK: @test
; R600-CHECK: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x02,0x[0-9a-f]+,0x[0-9a-f]+}}]
-define void @test() {
+define void @test(<4 x float> inreg %reg0) #0 {
entry:
- %0 = call float @llvm.R600.load.input(i32 0)
- %1 = call float @llvm.R600.load.input(i32 1)
- %2 = fmul float %0, %1
- call void @llvm.AMDGPU.store.output(float %2, i32 0)
+ %r0 = extractelement <4 x float> %reg0, i32 0
+ %r1 = extractelement <4 x float> %reg0, i32 1
+ %r2 = fmul float %r0, %r1
+ %vec = insertelement <4 x float> undef, float %r2, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
-declare float @llvm.R600.load.input(i32) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-declare void @llvm.AMDGPU.store.output(float, i32)
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/r600-export-fix.ll b/test/CodeGen/R600/r600-export-fix.ll
new file mode 100644
index 0000000..73bc063
--- /dev/null
+++ b/test/CodeGen/R600/r600-export-fix.ll
@@ -0,0 +1,142 @@
+; RUN: llc < %s -march=r600 -mcpu=cedar | FileCheck %s
+
+;CHECK: EXPORT T{{[0-9]}}.XYZW
+;CHECK: EXPORT T{{[0-9]}}.0000
+;CHECK: EXPORT T{{[0-9]}}.0000
+;CHECK: EXPORT T{{[0-9]}}.0XZW
+;CHECK: EXPORT T{{[0-9]}}.XYZW
+;CHECK: EXPORT T{{[0-9]}}.YX00
+;CHECK: EXPORT T{{[0-9]}}.0000
+;CHECK: EXPORT T{{[0-9]}}.0000
+
+
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+main_body:
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 1
+ %2 = extractelement <4 x float> %reg1, i32 2
+ %3 = extractelement <4 x float> %reg1, i32 3
+ %4 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+ %5 = extractelement <4 x float> %4, i32 0
+ %6 = fmul float %5, %0
+ %7 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+ %8 = extractelement <4 x float> %7, i32 1
+ %9 = fmul float %8, %0
+ %10 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+ %11 = extractelement <4 x float> %10, i32 2
+ %12 = fmul float %11, %0
+ %13 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+ %14 = extractelement <4 x float> %13, i32 3
+ %15 = fmul float %14, %0
+ %16 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+ %17 = extractelement <4 x float> %16, i32 0
+ %18 = fmul float %17, %1
+ %19 = fadd float %18, %6
+ %20 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+ %21 = extractelement <4 x float> %20, i32 1
+ %22 = fmul float %21, %1
+ %23 = fadd float %22, %9
+ %24 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+ %25 = extractelement <4 x float> %24, i32 2
+ %26 = fmul float %25, %1
+ %27 = fadd float %26, %12
+ %28 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+ %29 = extractelement <4 x float> %28, i32 3
+ %30 = fmul float %29, %1
+ %31 = fadd float %30, %15
+ %32 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+ %33 = extractelement <4 x float> %32, i32 0
+ %34 = fmul float %33, %2
+ %35 = fadd float %34, %19
+ %36 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+ %37 = extractelement <4 x float> %36, i32 1
+ %38 = fmul float %37, %2
+ %39 = fadd float %38, %23
+ %40 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+ %41 = extractelement <4 x float> %40, i32 2
+ %42 = fmul float %41, %2
+ %43 = fadd float %42, %27
+ %44 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 6)
+ %45 = extractelement <4 x float> %44, i32 3
+ %46 = fmul float %45, %2
+ %47 = fadd float %46, %31
+ %48 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+ %49 = extractelement <4 x float> %48, i32 0
+ %50 = fmul float %49, %3
+ %51 = fadd float %50, %35
+ %52 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+ %53 = extractelement <4 x float> %52, i32 1
+ %54 = fmul float %53, %3
+ %55 = fadd float %54, %39
+ %56 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+ %57 = extractelement <4 x float> %56, i32 2
+ %58 = fmul float %57, %3
+ %59 = fadd float %58, %43
+ %60 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+ %61 = extractelement <4 x float> %60, i32 3
+ %62 = fmul float %61, %3
+ %63 = fadd float %62, %47
+ %64 = load <4 x float> addrspace(8)* null
+ %65 = extractelement <4 x float> %64, i32 0
+ %66 = load <4 x float> addrspace(8)* null
+ %67 = extractelement <4 x float> %66, i32 1
+ %68 = load <4 x float> addrspace(8)* null
+ %69 = extractelement <4 x float> %68, i32 2
+ %70 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %71 = extractelement <4 x float> %70, i32 0
+ %72 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %73 = extractelement <4 x float> %72, i32 1
+ %74 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+ %75 = extractelement <4 x float> %74, i32 2
+ %76 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+ %77 = extractelement <4 x float> %76, i32 0
+ %78 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+ %79 = extractelement <4 x float> %78, i32 1
+ %80 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+ %81 = extractelement <4 x float> %80, i32 2
+ %82 = insertelement <4 x float> undef, float %51, i32 0
+ %83 = insertelement <4 x float> %82, float %55, i32 1
+ %84 = insertelement <4 x float> %83, float %59, i32 2
+ %85 = insertelement <4 x float> %84, float %63, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %85, i32 60, i32 1)
+ %86 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
+ %87 = insertelement <4 x float> %86, float 0.000000e+00, i32 1
+ %88 = insertelement <4 x float> %87, float 0.000000e+00, i32 2
+ %89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %89, i32 0, i32 2)
+ %90 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
+ %91 = insertelement <4 x float> %90, float 0.000000e+00, i32 1
+ %92 = insertelement <4 x float> %91, float 0.000000e+00, i32 2
+ %93 = insertelement <4 x float> %92, float 0.000000e+00, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %93, i32 1, i32 2)
+ %94 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
+ %95 = insertelement <4 x float> %94, float %65, i32 1
+ %96 = insertelement <4 x float> %95, float %67, i32 2
+ %97 = insertelement <4 x float> %96, float %69, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %97, i32 2, i32 2)
+ %98 = insertelement <4 x float> undef, float %77, i32 0
+ %99 = insertelement <4 x float> %98, float %79, i32 1
+ %100 = insertelement <4 x float> %99, float %81, i32 2
+ %101 = insertelement <4 x float> %100, float %71, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %101, i32 3, i32 2)
+ %102 = insertelement <4 x float> undef, float %73, i32 0
+ %103 = insertelement <4 x float> %102, float %75, i32 1
+ %104 = insertelement <4 x float> %103, float 0.000000e+00, i32 2
+ %105 = insertelement <4 x float> %104, float 0.000000e+00, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %105, i32 4, i32 2)
+ %106 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
+ %107 = insertelement <4 x float> %106, float 0.000000e+00, i32 1
+ %108 = insertelement <4 x float> %107, float 0.000000e+00, i32 2
+ %109 = insertelement <4 x float> %108, float 0.000000e+00, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %109, i32 5, i32 2)
+ %110 = insertelement <4 x float> undef, float 0.000000e+00, i32 0
+ %111 = insertelement <4 x float> %110, float 0.000000e+00, i32 1
+ %112 = insertelement <4 x float> %111, float 0.000000e+00, i32 2
+ %113 = insertelement <4 x float> %112, float 0.000000e+00, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %113, i32 6, i32 2)
+ ret void
+}
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
diff --git a/test/CodeGen/R600/r600cfg.ll b/test/CodeGen/R600/r600cfg.ll
index 895ad5e..6dee3ef 100644
--- a/test/CodeGen/R600/r600cfg.ll
+++ b/test/CodeGen/R600/r600cfg.ll
@@ -1,12 +1,12 @@
;RUN: llc < %s -march=r600 -mcpu=redwood
;REQUIRES: asserts
-define void @main() #0 {
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
main_body:
- %0 = call float @llvm.R600.load.input(i32 4)
- %1 = call float @llvm.R600.load.input(i32 5)
- %2 = call float @llvm.R600.load.input(i32 6)
- %3 = call float @llvm.R600.load.input(i32 7)
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 1
+ %2 = extractelement <4 x float> %reg1, i32 2
+ %3 = extractelement <4 x float> %reg1, i32 3
%4 = bitcast float %0 to i32
%5 = icmp eq i32 %4, 0
%6 = sext i1 %5 to i32
@@ -113,12 +113,8 @@ ENDIF48: ; preds = %LOOP47
br label %LOOP47
}
-; Function Attrs: readnone
-declare float @llvm.R600.load.input(i32) #1
-
declare void @llvm.R600.store.stream.output(<4 x float>, i32, i32, i32)
declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
attributes #0 = { "ShaderType"="1" }
-attributes #1 = { readnone }
diff --git a/test/CodeGen/R600/reciprocal.ll b/test/CodeGen/R600/reciprocal.ll
index 2783929..b4ac47a 100644
--- a/test/CodeGen/R600/reciprocal.ll
+++ b/test/CodeGen/R600/reciprocal.ll
@@ -2,15 +2,14 @@
;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @test() {
- %r0 = call float @llvm.R600.load.input(i32 0)
+define void @test(<4 x float> inreg %reg0) #0 {
+ %r0 = extractelement <4 x float> %reg0, i32 0
%r1 = fdiv float 1.0, %r0
- call void @llvm.AMDGPU.store.output(float %r1, i32 0)
+ %vec = insertelement <4 x float> undef, float %r1, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0)
ret void
}
-declare float @llvm.R600.load.input(i32) readnone
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-declare void @llvm.AMDGPU.store.output(float, i32)
-
-declare float @llvm.AMDGPU.rcp(float ) readnone
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/rotr.ll b/test/CodeGen/R600/rotr.ll
index 5c4c4e9..edf7aee 100644
--- a/test/CodeGen/R600/rotr.ll
+++ b/test/CodeGen/R600/rotr.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood -o - | FileCheck --check-prefix=R600-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=SI -o - | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
-; R600-CHECK: @rotr
+; R600-CHECK-LABEL: @rotr:
; R600-CHECK: BIT_ALIGN_INT
-; SI-CHECK: @rotr
+; SI-CHECK-LABEL: @rotr:
; SI-CHECK: V_ALIGNBIT_B32
define void @rotr(i32 addrspace(1)* %in, i32 %x, i32 %y) {
entry:
@@ -16,14 +16,16 @@ entry:
ret void
}
-; R600-CHECK: @rotl
+; R600-CHECK-LABEL: @rotl:
; R600-CHECK: SUB_INT {{\** T[0-9]+\.[XYZW]}}, literal.x
; R600-CHECK-NEXT: 32
-; R600-CHECK: BIT_ALIGN_INT {{\** T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].Z, PV.{{[XYZW]}}
+; R600-CHECK: BIT_ALIGN_INT {{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].Z, PV.{{[XYZW]}}
-; SI-CHECK: @rotl
-; SI-CHECK: V_SUB_I32_e64 [[DST:VGPR[0-9]+]], 32, {{[SV]GPR[0-9]+}}
-; SI-CHECK: V_ALIGNBIT_B32 {{VGPR[0-9]+, [SV]GPR[0-9]+, VGPR[0-9]+}}, [[DST]]
+
+; SI-CHECK-LABEL: @rotl:
+; SI-CHECK: S_SUB_I32 [[SDST:s[0-9]+]], 32, {{[s][0-9]+}}
+; SI-CHECK: V_MOV_B32_e32 [[VDST:v[0-9]+]], [[SDST]]
+; SI-CHECK: V_ALIGNBIT_B32 {{v[0-9]+, [s][0-9]+, v[0-9]+}}, [[VDST]]
define void @rotl(i32 addrspace(1)* %in, i32 %x, i32 %y) {
entry:
%0 = shl i32 %x, %y
diff --git a/test/CodeGen/R600/rv7x0_count3.ll b/test/CodeGen/R600/rv7x0_count3.ll
index 474d6ba..c3fd923 100644
--- a/test/CodeGen/R600/rv7x0_count3.ll
+++ b/test/CodeGen/R600/rv7x0_count3.ll
@@ -1,12 +1,12 @@
; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=rv710 | FileCheck %s
-; CHECK: TEX 9 @4 ; encoding: [0x04,0x00,0x00,0x00,0x00,0x04,0x88,0x80]
+; CHECK: TEX 9 @6 ; encoding: [0x06,0x00,0x00,0x00,0x00,0x04,0x88,0x80]
-define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
- %1 = call float @llvm.R600.load.input(i32 4)
- %2 = call float @llvm.R600.load.input(i32 5)
- %3 = call float @llvm.R600.load.input(i32 6)
- %4 = call float @llvm.R600.load.input(i32 7)
+define void @test(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
+ %1 = extractelement <4 x float> %reg1, i32 0
+ %2 = extractelement <4 x float> %reg1, i32 1
+ %3 = extractelement <4 x float> %reg1, i32 2
+ %4 = extractelement <4 x float> %reg1, i32 3
%5 = insertelement <4 x float> undef, float %1, i32 0
%6 = insertelement <4 x float> %5, float %2, i32 1
%7 = insertelement <4 x float> %6, float %3, i32 2
@@ -36,9 +36,6 @@ define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in)
declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone
-; Function Attrs: readnone
-declare float @llvm.R600.load.input(i32) #1
-
-
declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-attributes #1 = { readnone }
+
+attributes #0 = { "ShaderType"="1" }
diff --git a/test/CodeGen/R600/schedule-fs-loop-nested-if.ll b/test/CodeGen/R600/schedule-fs-loop-nested-if.ll
index ba9620c..11e8f51 100644
--- a/test/CodeGen/R600/schedule-fs-loop-nested-if.ll
+++ b/test/CodeGen/R600/schedule-fs-loop-nested-if.ll
@@ -1,12 +1,12 @@
-;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched
+;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
;REQUIRES: asserts
-define void @main() {
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #1 {
main_body:
- %0 = call float @llvm.R600.interp.input(i32 0, i32 0)
- %1 = call float @llvm.R600.interp.input(i32 1, i32 0)
- %2 = call float @llvm.R600.interp.input(i32 2, i32 0)
- %3 = call float @llvm.R600.interp.input(i32 3, i32 0)
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 1
+ %2 = extractelement <4 x float> %reg1, i32 2
+ %3 = extractelement <4 x float> %reg1, i32 3
%4 = fcmp ult float %1, 0.000000e+00
%5 = select i1 %4, float 1.000000e+00, float 0.000000e+00
%6 = fsub float -0.000000e+00, %5
@@ -74,10 +74,9 @@ ELSE17: ; preds = %ELSE
br label %ENDIF
}
-declare float @llvm.R600.interp.input(i32, i32) #0
-
declare float @llvm.AMDIL.clamp.(float, float, float) #0
declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
attributes #0 = { readnone }
+attributes #1 = { "ShaderType"="1" }
diff --git a/test/CodeGen/R600/schedule-fs-loop-nested.ll b/test/CodeGen/R600/schedule-fs-loop-nested.ll
index 5e875c4..b917ec6 100644
--- a/test/CodeGen/R600/schedule-fs-loop-nested.ll
+++ b/test/CodeGen/R600/schedule-fs-loop-nested.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched
+;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
;REQUIRES: asserts
define void @main() {
diff --git a/test/CodeGen/R600/schedule-fs-loop.ll b/test/CodeGen/R600/schedule-fs-loop.ll
index d142cac..d6c194b 100644
--- a/test/CodeGen/R600/schedule-fs-loop.ll
+++ b/test/CodeGen/R600/schedule-fs-loop.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched
+;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
;REQUIRES: asserts
define void @main() {
diff --git a/test/CodeGen/R600/schedule-if-2.ll b/test/CodeGen/R600/schedule-if-2.ll
index 6afd677..38aad18 100644
--- a/test/CodeGen/R600/schedule-if-2.ll
+++ b/test/CodeGen/R600/schedule-if-2.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched
+;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
;REQUIRES: asserts
define void @main() {
diff --git a/test/CodeGen/R600/schedule-if.ll b/test/CodeGen/R600/schedule-if.ll
index 347d92f..f960c93 100644
--- a/test/CodeGen/R600/schedule-if.ll
+++ b/test/CodeGen/R600/schedule-if.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched
+;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
;REQUIRES: asserts
define void @main() {
diff --git a/test/CodeGen/R600/schedule-vs-if-nested-loop.ll b/test/CodeGen/R600/schedule-vs-if-nested-loop.ll
index 44b7c2f..33b20d3 100644
--- a/test/CodeGen/R600/schedule-vs-if-nested-loop.ll
+++ b/test/CodeGen/R600/schedule-vs-if-nested-loop.ll
@@ -1,12 +1,12 @@
;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched
;REQUIRES: asserts
-define void @main() {
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
main_body:
- %0 = call float @llvm.R600.load.input(i32 4)
- %1 = call float @llvm.R600.load.input(i32 5)
- %2 = call float @llvm.R600.load.input(i32 6)
- %3 = call float @llvm.R600.load.input(i32 7)
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 1
+ %2 = extractelement <4 x float> %reg1, i32 2
+ %3 = extractelement <4 x float> %reg1, i32 3
%4 = fcmp ult float %0, 0.000000e+00
%5 = select i1 %4, float 1.000000e+00, float 0.000000e+00
%6 = fsub float -0.000000e+00, %5
@@ -127,8 +127,6 @@ ENDIF19: ; preds = %ENDIF16
br label %LOOP
}
-declare float @llvm.R600.load.input(i32) #0
-
declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
-attributes #0 = { readnone }
+attributes #0 = { "ShaderType"="1" }
diff --git a/test/CodeGen/R600/select.ll b/test/CodeGen/R600/select.ll
new file mode 100644
index 0000000..f940142
--- /dev/null
+++ b/test/CodeGen/R600/select.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; Normally icmp + select is optimized to select_cc; when this happens, the
+; DAGLegalizer never sees the select and doesn't have a chance to legalize it.
+;
+; In order to avoid the select_cc optimization, this test case calculates the
+; condition for the select in a separate basic block.
+
+; CHECK-LABEL: @select
+; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X
+; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.X
+; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
+; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
+; CHECK-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
+define void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out,
+ <2 x i32> addrspace(1)* %v2i32out, <2 x float> addrspace(1)* %v2f32out,
+ <4 x i32> addrspace(1)* %v4i32out, <4 x float> addrspace(1)* %v4f32out,
+ i32 %cond) {
+entry:
+ br label %for
+body:
+ %inc = add i32 %i, 1
+ %br_cmp.i = icmp eq i1 %br_cmp, 0
+ br label %for
+for:
+ %i = phi i32 [ %inc, %body], [ 0, %entry ]
+ %br_cmp = phi i1 [ %br_cmp.i, %body ], [ 0, %entry ]
+ %0 = icmp eq i32 %cond, %i
+ %1 = select i1 %br_cmp, i32 2, i32 3
+ %2 = select i1 %br_cmp, float 2.0 , float 5.0
+ %3 = select i1 %br_cmp, <2 x i32> <i32 2, i32 3>, <2 x i32> <i32 4, i32 5>
+ %4 = select i1 %br_cmp, <2 x float> <float 2.0, float 3.0>, <2 x float> <float 4.0, float 5.0>
+ %5 = select i1 %br_cmp, <4 x i32> <i32 2 , i32 3, i32 4, i32 5>, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+ %6 = select i1 %br_cmp, <4 x float> <float 2.0, float 3.0, float 4.0, float 5.0>, <4 x float> <float 6.0, float 7.0, float 8.0, float 9.0>
+ br i1 %0, label %body, label %done
+
+done:
+ store i32 %1, i32 addrspace(1)* %i32out
+ store float %2, float addrspace(1)* %f32out
+ store <2 x i32> %3, <2 x i32> addrspace(1)* %v2i32out
+ store <2 x float> %4, <2 x float> addrspace(1)* %v2f32out
+ store <4 x i32> %5, <4 x i32> addrspace(1)* %v4i32out
+ store <4 x float> %6, <4 x float> addrspace(1)* %v4f32out
+ ret void
+}
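+
+; The comment at the top of this test refers to the usual DAG combine of a
+; compare feeding a select in the same basic block. A minimal sketch of that
+; pattern (hypothetical code, not taken from the test):
+;
+;   define void @select_cc_sketch(float addrspace(1)* %out, float %a, float %b) {
+;   entry:
+;     %c = fcmp oeq float %a, %b
+;     ; compare and select in one block: normally folded into a single select_cc
+;     %v = select i1 %c, float 2.0, float 5.0
+;     store float %v, float addrspace(1)* %out
+;     ret void
+;   }
+;
+; Computing %br_cmp in a separate block, as @select does above, keeps the
+; compare and the selects apart so the plain select nodes survive to
+; legalization.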
diff --git a/test/CodeGen/R600/selectcc-cnd.ll b/test/CodeGen/R600/selectcc-cnd.ll
index d7287b4..0bfca69 100644
--- a/test/CodeGen/R600/selectcc-cnd.ll
+++ b/test/CodeGen/R600/selectcc-cnd.ll
@@ -1,8 +1,8 @@
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
;CHECK-NOT: SETE
-;CHECK: CNDE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1.0, literal.x,
-;CHECK-NEXT: {{[-0-9]+\(2.0}}
+;CHECK: CNDE {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1.0, literal.x,
+;CHECK: 1073741824
define void @test(float addrspace(1)* %out, float addrspace(1)* %in) {
%1 = load float addrspace(1)* %in
%2 = fcmp oeq float %1, 0.0
diff --git a/test/CodeGen/R600/selectcc-cnde-int.ll b/test/CodeGen/R600/selectcc-cnde-int.ll
index 768dc7d..d568888 100644
--- a/test/CodeGen/R600/selectcc-cnde-int.ll
+++ b/test/CodeGen/R600/selectcc-cnde-int.ll
@@ -1,7 +1,7 @@
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
;CHECK-NOT: SETE_INT
-;CHECK: CNDE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, literal.x,
+;CHECK: CNDE_INT {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, literal.x,
;CHECK-NEXT: 2
define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%1 = load i32 addrspace(1)* %in
diff --git a/test/CodeGen/R600/selectcc-opt.ll b/test/CodeGen/R600/selectcc-opt.ll
index 7e2d559..834c030 100644
--- a/test/CodeGen/R600/selectcc-opt.ll
+++ b/test/CodeGen/R600/selectcc-opt.ll
@@ -6,7 +6,7 @@
define void @test_a(i32 addrspace(1)* %out, float %in) {
entry:
- %0 = fcmp ult float %in, 0.000000e+00
+ %0 = fcmp olt float %in, 0.000000e+00
%1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
%2 = fsub float -0.000000e+00, %1
%3 = fptosi float %2 to i32
@@ -34,7 +34,7 @@ ENDIF:
; CHECK-NEXT: ALU clause starting
define void @test_b(i32 addrspace(1)* %out, float %in) {
entry:
- %0 = fcmp ult float %in, 0.0
+ %0 = fcmp olt float %in, 0.0
%1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
%2 = fsub float -0.000000e+00, %1
%3 = fptosi float %2 to i32
diff --git a/test/CodeGen/R600/set-dx10.ll b/test/CodeGen/R600/set-dx10.ll
index 291a7bd..5c7d499 100644
--- a/test/CodeGen/R600/set-dx10.ll
+++ b/test/CodeGen/R600/set-dx10.ll
@@ -5,7 +5,8 @@
; SET*DX10 instructions.
; CHECK: @fcmp_une_select_fptosi
-; CHECK: SETNE_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
; CHECK-NEXT: 1084227584(5.000000e+00)
define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) {
entry:
@@ -18,7 +19,8 @@ entry:
}
; CHECK: @fcmp_une_select_i32
-; CHECK: SETNE_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK: SETNE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
; CHECK-NEXT: 1084227584(5.000000e+00)
define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) {
entry:
@@ -28,12 +30,13 @@ entry:
ret void
}
-; CHECK: @fcmp_ueq_select_fptosi
-; CHECK: SETE_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK: @fcmp_oeq_select_fptosi
+; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ueq_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) {
entry:
- %0 = fcmp ueq float %in, 5.0
+ %0 = fcmp oeq float %in, 5.0
%1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
%2 = fsub float -0.000000e+00, %1
%3 = fptosi float %2 to i32
@@ -41,23 +44,25 @@ entry:
ret void
}
-; CHECK: @fcmp_ueq_select_i32
-; CHECK: SETE_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK: @fcmp_oeq_select_i32
+; CHECK: SETE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ueq_select_i32(i32 addrspace(1)* %out, float %in) {
+define void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) {
entry:
- %0 = fcmp ueq float %in, 5.0
+ %0 = fcmp oeq float %in, 5.0
%1 = select i1 %0, i32 -1, i32 0
store i32 %1, i32 addrspace(1)* %out
ret void
}
-; CHECK: @fcmp_ugt_select_fptosi
-; CHECK: SETGT_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK: @fcmp_ogt_select_fptosi
+; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ugt_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) {
entry:
- %0 = fcmp ugt float %in, 5.0
+ %0 = fcmp ogt float %in, 5.0
%1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
%2 = fsub float -0.000000e+00, %1
%3 = fptosi float %2 to i32
@@ -65,23 +70,25 @@ entry:
ret void
}
-; CHECK: @fcmp_ugt_select_i32
-; CHECK: SETGT_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK: @fcmp_ogt_select_i32
+; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ugt_select_i32(i32 addrspace(1)* %out, float %in) {
+define void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) {
entry:
- %0 = fcmp ugt float %in, 5.0
+ %0 = fcmp ogt float %in, 5.0
%1 = select i1 %0, i32 -1, i32 0
store i32 %1, i32 addrspace(1)* %out
ret void
}
-; CHECK: @fcmp_uge_select_fptosi
-; CHECK: SETGE_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK: @fcmp_oge_select_fptosi
+; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_uge_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) {
entry:
- %0 = fcmp uge float %in, 5.0
+ %0 = fcmp oge float %in, 5.0
%1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
%2 = fsub float -0.000000e+00, %1
%3 = fptosi float %2 to i32
@@ -89,23 +96,25 @@ entry:
ret void
}
-; CHECK: @fcmp_uge_select_i32
-; CHECK: SETGE_DX10 * T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK: @fcmp_oge_select_i32
+; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.x,
+; CHECK-NEXT: LSHR
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_uge_select_i32(i32 addrspace(1)* %out, float %in) {
+define void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) {
entry:
- %0 = fcmp uge float %in, 5.0
+ %0 = fcmp oge float %in, 5.0
%1 = select i1 %0, i32 -1, i32 0
store i32 %1, i32 addrspace(1)* %out
ret void
}
-; CHECK: @fcmp_ule_select_fptosi
-; CHECK: SETGE_DX10 * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK: @fcmp_ole_select_fptosi
+; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK-NEXT: LSHR
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ule_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) {
entry:
- %0 = fcmp ule float %in, 5.0
+ %0 = fcmp ole float %in, 5.0
%1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
%2 = fsub float -0.000000e+00, %1
%3 = fptosi float %2 to i32
@@ -113,23 +122,25 @@ entry:
ret void
}
-; CHECK: @fcmp_ule_select_i32
-; CHECK: SETGE_DX10 * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK: @fcmp_ole_select_i32
+; CHECK: SETGE_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK-NEXT: LSHR
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ule_select_i32(i32 addrspace(1)* %out, float %in) {
+define void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) {
entry:
- %0 = fcmp ule float %in, 5.0
+ %0 = fcmp ole float %in, 5.0
%1 = select i1 %0, i32 -1, i32 0
store i32 %1, i32 addrspace(1)* %out
ret void
}
-; CHECK: @fcmp_ult_select_fptosi
-; CHECK: SETGT_DX10 * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK: @fcmp_olt_select_fptosi
+; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK-NEXT: LSHR
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ult_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) {
entry:
- %0 = fcmp ult float %in, 5.0
+ %0 = fcmp olt float %in, 5.0
%1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
%2 = fsub float -0.000000e+00, %1
%3 = fptosi float %2 to i32
@@ -137,12 +148,13 @@ entry:
ret void
}
-; CHECK: @fcmp_ult_select_i32
-; CHECK: SETGT_DX10 * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK: @fcmp_olt_select_i32
+; CHECK: SETGT_DX10 {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z,
+; CHECK-NEXT: LSHR
; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ult_select_i32(i32 addrspace(1)* %out, float %in) {
+define void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) {
entry:
- %0 = fcmp ult float %in, 5.0
+ %0 = fcmp olt float %in, 5.0
%1 = select i1 %0, i32 -1, i32 0
store i32 %1, i32 addrspace(1)* %out
ret void
diff --git a/test/CodeGen/R600/setcc.ll b/test/CodeGen/R600/setcc.ll
index 992de70..8d34c4a 100644
--- a/test/CodeGen/R600/setcc.ll
+++ b/test/CodeGen/R600/setcc.ll
@@ -1,8 +1,9 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 --check-prefix=FUNC %s
+;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-; CHECK: @setcc_v2i32
-; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z
-; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y
+; FUNC-LABEL: @setcc_v2i32
+; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[3].X, KC0[3].Z
+; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW]}}, KC0[2].W, KC0[3].Y
define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
%result = icmp eq <2 x i32> %a, %b
@@ -11,11 +12,11 @@ define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %
ret void
}
-; CHECK: @setcc_v4i32
-; EG-CHECK-DAG: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; EG-CHECK-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; FUNC-LABEL: @setcc_v4i32
+; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; R600-DAG: SETE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -26,3 +27,307 @@ define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %
store <4 x i32> %sext, <4 x i32> addrspace(1)* %out
ret void
}
+
+;;;==========================================================================;;;
+;; Float comparisons
+;;;==========================================================================;;;
+
+; FUNC-LABEL: @f32_oeq
+; R600: SETE_DX10
+; SI: V_CMP_EQ_F32
+define void @f32_oeq(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp oeq float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f32_ogt
+; R600: SETGT_DX10
+; SI: V_CMP_GT_F32
+define void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp ogt float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f32_oge
+; R600: SETGE_DX10
+; SI: V_CMP_GE_F32
+define void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp oge float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f32_olt
+; R600: SETGT_DX10
+; SI: V_CMP_LT_F32
+define void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp olt float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f32_ole
+; R600: SETGE_DX10
+; SI: V_CMP_LE_F32
+define void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp ole float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f32_one
+; R600-DAG: SETE_DX10
+; R600-DAG: SETE_DX10
+; R600-DAG: AND_INT
+; R600-DAG: SETNE_DX10
+; R600-DAG: AND_INT
+; R600-DAG: SETNE_INT
+; SI: V_CMP_O_F32
+; SI: V_CMP_NEQ_F32
+; SI: S_AND_B64
+define void @f32_one(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp one float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f32_ord
+; R600-DAG: SETE_DX10
+; R600-DAG: SETE_DX10
+; R600-DAG: AND_INT
+; R600-DAG: SETNE_INT
+; SI: V_CMP_O_F32
+define void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp ord float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f32_ueq
+; R600-DAG: SETNE_DX10
+; R600-DAG: SETNE_DX10
+; R600-DAG: OR_INT
+; R600-DAG: SETE_DX10
+; R600-DAG: OR_INT
+; R600-DAG: SETNE_INT
+; SI: V_CMP_U_F32
+; SI: V_CMP_EQ_F32
+; SI: S_OR_B64
+define void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp ueq float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f32_ugt
+; R600: SETGE
+; R600: SETE_DX10
+; SI: V_CMP_U_F32
+; SI: V_CMP_GT_F32
+; SI: S_OR_B64
+define void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp ugt float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f32_uge
+; R600: SETGT
+; R600: SETE_DX10
+; SI: V_CMP_U_F32
+; SI: V_CMP_GE_F32
+; SI: S_OR_B64
+define void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp uge float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f32_ult
+; R600: SETGE
+; R600: SETE_DX10
+; SI: V_CMP_U_F32
+; SI: V_CMP_LT_F32
+; SI: S_OR_B64
+define void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp ult float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f32_ule
+; R600: SETGT
+; R600: SETE_DX10
+; SI: V_CMP_U_F32
+; SI: V_CMP_LE_F32
+; SI: S_OR_B64
+define void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp ule float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f32_une
+; R600: SETNE_DX10
+; SI: V_CMP_NEQ_F32
+define void @f32_une(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp une float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f32_uno
+; R600: SETNE_DX10
+; R600: SETNE_DX10
+; R600: OR_INT
+; R600: SETNE_INT
+; SI: V_CMP_U_F32
+define void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) {
+entry:
+ %0 = fcmp uno float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+;;;==========================================================================;;;
+;; 32-bit integer comparisons
+;;;==========================================================================;;;
+
+; FUNC-LABEL: @i32_eq
+; R600: SETE_INT
+; SI: V_CMP_EQ_I32
+define void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = icmp eq i32 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @i32_ne
+; R600: SETNE_INT
+; SI: V_CMP_NE_I32
+define void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = icmp ne i32 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @i32_ugt
+; R600: SETGT_UINT
+; SI: V_CMP_GT_U32
+define void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = icmp ugt i32 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @i32_uge
+; R600: SETGE_UINT
+; SI: V_CMP_GE_U32
+define void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = icmp uge i32 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @i32_ult
+; R600: SETGT_UINT
+; SI: V_CMP_LT_U32
+define void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = icmp ult i32 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @i32_ule
+; R600: SETGE_UINT
+; SI: V_CMP_LE_U32
+define void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = icmp ule i32 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @i32_sgt
+; R600: SETGT_INT
+; SI: V_CMP_GT_I32
+define void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = icmp sgt i32 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @i32_sge
+; R600: SETGE_INT
+; SI: V_CMP_GE_I32
+define void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = icmp sge i32 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @i32_slt
+; R600: SETGT_INT
+; SI: V_CMP_LT_I32
+define void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = icmp slt i32 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @i32_sle
+; R600: SETGE_INT
+; SI: V_CMP_LE_I32
+define void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+entry:
+ %0 = icmp sle i32 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/setcc64.ll b/test/CodeGen/R600/setcc64.ll
new file mode 100644
index 0000000..9202fc0
--- /dev/null
+++ b/test/CodeGen/R600/setcc64.ll
@@ -0,0 +1,263 @@
+;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+
+; XXX: Merge this into setcc, once R600 supports 64-bit operations
+
+;;;==========================================================================;;;
+;; Double comparisons
+;;;==========================================================================;;;
+
+; FUNC-LABEL: @f64_oeq
+; SI: V_CMP_EQ_F64
+define void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp oeq double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f64_ogt
+; SI: V_CMP_GT_F64
+define void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp ogt double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f64_oge
+; SI: V_CMP_GE_F64
+define void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp oge double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f64_olt
+; SI: V_CMP_LT_F64
+define void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp olt double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f64_ole
+; SI: V_CMP_LE_F64
+define void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp ole double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f64_one
+; SI: V_CMP_O_F64
+; SI: V_CMP_NEQ_F64
+; SI: S_AND_B64
+define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp one double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f64_ord
+; SI: V_CMP_O_F64
+define void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp ord double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f64_ueq
+; SI: V_CMP_U_F64
+; SI: V_CMP_EQ_F64
+; SI: S_OR_B64
+define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp ueq double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f64_ugt
+; SI: V_CMP_U_F64
+; SI: V_CMP_GT_F64
+; SI: S_OR_B64
+define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp ugt double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f64_uge
+; SI: V_CMP_U_F64
+; SI: V_CMP_GE_F64
+; SI: S_OR_B64
+define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp uge double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f64_ult
+; SI: V_CMP_U_F64
+; SI: V_CMP_LT_F64
+; SI: S_OR_B64
+define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp ult double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f64_ule
+; SI: V_CMP_U_F64
+; SI: V_CMP_LE_F64
+; SI: S_OR_B64
+define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp ule double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f64_une
+; SI: V_CMP_NEQ_F64
+define void @f64_une(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp une double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @f64_uno
+; SI: V_CMP_U_F64
+define void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) {
+entry:
+ %0 = fcmp uno double %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+;;;==========================================================================;;;
+;; 64-bit integer comparisons
+;;;==========================================================================;;;
+
+; FUNC-LABEL: @i64_eq
+; SI: V_CMP_EQ_I64
+define void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = icmp eq i64 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @i64_ne
+; SI: V_CMP_NE_I64
+define void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = icmp ne i64 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @i64_ugt
+; SI: V_CMP_GT_U64
+define void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = icmp ugt i64 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @i64_uge
+; SI: V_CMP_GE_U64
+define void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = icmp uge i64 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @i64_ult
+; SI: V_CMP_LT_U64
+define void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = icmp ult i64 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @i64_ule
+; SI: V_CMP_LE_U64
+define void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = icmp ule i64 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @i64_sgt
+; SI: V_CMP_GT_I64
+define void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = icmp sgt i64 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @i64_sge
+; SI: V_CMP_GE_I64
+define void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = icmp sge i64 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @i64_slt
+; SI: V_CMP_LT_I64
+define void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = icmp slt i64 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: @i64_sle
+; SI: V_CMP_LE_I64
+define void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+entry:
+ %0 = icmp sle i64 %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/seto.ll b/test/CodeGen/R600/seto.ll
index 19716f8..8633a4b 100644
--- a/test/CodeGen/R600/seto.ll
+++ b/test/CodeGen/R600/seto.ll
@@ -1,6 +1,6 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
-;CHECK: V_CMP_O_F32_e64 SGPR0_SGPR1, {{[SV]GPR[0-9]+, [SV]GPR[0-9]+}}, 0, 0, 0, 0
+;CHECK: V_CMP_O_F32_e64 s[0:1], {{[sv][0-9]+, [sv][0-9]+}}, 0, 0, 0, 0
define void @main(float %p) {
main_body:
diff --git a/test/CodeGen/R600/setuo.ll b/test/CodeGen/R600/setuo.ll
index 929dbb1..c77a37e 100644
--- a/test/CodeGen/R600/setuo.ll
+++ b/test/CodeGen/R600/setuo.ll
@@ -1,6 +1,6 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
-;CHECK: V_CMP_U_F32_e64 SGPR0_SGPR1, {{[SV]GPR[0-9]+, [SV]GPR[0-9]+}}, 0, 0, 0, 0
+;CHECK: V_CMP_U_F32_e64 s[0:1], {{[sv][0-9]+, [sv][0-9]+}}, 0, 0, 0, 0
define void @main(float %p) {
main_body:
diff --git a/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll b/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll
new file mode 100644
index 0000000..d74161b
--- /dev/null
+++ b/test/CodeGen/R600/sgpr-copy-duplicate-operand.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+
+; Copy VGPR -> SGPR used twice as an instruction operand, which is then
+; used in a REG_SEQUENCE that also needs to be handled.
+
+; SI-LABEL: @test_dup_operands:
+; SI: V_ADD_I32_e32
+define void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) {
+ %a = load <2 x i32> addrspace(1)* %in
+ %lo = extractelement <2 x i32> %a, i32 0
+ %hi = extractelement <2 x i32> %a, i32 1
+ %add = add i32 %lo, %lo
+ %vec0 = insertelement <2 x i32> undef, i32 %add, i32 0
+ %vec1 = insertelement <2 x i32> %vec0, i32 %hi, i32 1
+ store <2 x i32> %vec1, <2 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
diff --git a/test/CodeGen/R600/sgpr-copy.ll b/test/CodeGen/R600/sgpr-copy.ll
index b0d4549..5472c1b 100644
--- a/test/CodeGen/R600/sgpr-copy.ll
+++ b/test/CodeGen/R600/sgpr-copy.ll
@@ -1,15 +1,15 @@
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
; This test checks that no VGPR to SGPR copies are created by the register
; allocator.
-; CHECK: @main
-; CHECK: S_BUFFER_LOAD_DWORD [[DST:SGPR[0-9]]], {{[SGPR_[0-9]+}}, 0
-; CHECK: V_MOV_B32_e32 VGPR{{[0-9]}}, [[DST]]
+; CHECK-LABEL: @phi1
+; CHECK: S_BUFFER_LOAD_DWORD [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0
+; CHECK: V_MOV_B32_e32 v{{[0-9]}}, [[DST]]
-define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+define void @phi1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
main_body:
%20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
- %21 = load <16 x i8> addrspace(2)* %20, !tbaa !0
+ %21 = load <16 x i8> addrspace(2)* %20, !tbaa !1
%22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0)
%23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
%24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32)
@@ -28,13 +28,133 @@ ENDIF: ; preds = %main_body, %ELSE
ret void
}
+; Make sure this program doesn't crash
+; CHECK-LABEL: @phi2
+define void @phi2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+ %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
+ %21 = load <16 x i8> addrspace(2)* %20, !tbaa !1
+ %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
+ %23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 32)
+ %24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 36)
+ %25 = call float @llvm.SI.load.const(<16 x i8> %21, i32 40)
+ %26 = call float @llvm.SI.load.const(<16 x i8> %21, i32 48)
+ %27 = call float @llvm.SI.load.const(<16 x i8> %21, i32 52)
+ %28 = call float @llvm.SI.load.const(<16 x i8> %21, i32 56)
+ %29 = call float @llvm.SI.load.const(<16 x i8> %21, i32 64)
+ %30 = call float @llvm.SI.load.const(<16 x i8> %21, i32 68)
+ %31 = call float @llvm.SI.load.const(<16 x i8> %21, i32 72)
+ %32 = call float @llvm.SI.load.const(<16 x i8> %21, i32 76)
+ %33 = call float @llvm.SI.load.const(<16 x i8> %21, i32 80)
+ %34 = call float @llvm.SI.load.const(<16 x i8> %21, i32 84)
+ %35 = call float @llvm.SI.load.const(<16 x i8> %21, i32 88)
+ %36 = call float @llvm.SI.load.const(<16 x i8> %21, i32 92)
+ %37 = getelementptr <32 x i8> addrspace(2)* %2, i32 0
+ %38 = load <32 x i8> addrspace(2)* %37, !tbaa !1
+ %39 = getelementptr <16 x i8> addrspace(2)* %1, i32 0
+ %40 = load <16 x i8> addrspace(2)* %39, !tbaa !1
+ %41 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5)
+ %42 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5)
+ %43 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %3, <2 x i32> %5)
+ %44 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %3, <2 x i32> %5)
+ %45 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %3, <2 x i32> %5)
+ %46 = bitcast float %41 to i32
+ %47 = bitcast float %42 to i32
+ %48 = insertelement <2 x i32> undef, i32 %46, i32 0
+ %49 = insertelement <2 x i32> %48, i32 %47, i32 1
+ %50 = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> %49, <32 x i8> %38, <16 x i8> %40, i32 2)
+ %51 = extractelement <4 x float> %50, i32 2
+ %52 = call float @fabs(float %51)
+ %53 = fmul float %43, %43
+ %54 = fmul float %44, %44
+ %55 = fadd float %54, %53
+ %56 = fmul float %45, %45
+ %57 = fadd float %55, %56
+ %58 = call float @llvm.AMDGPU.rsq(float %57)
+ %59 = fmul float %43, %58
+ %60 = fmul float %44, %58
+ %61 = fmul float %45, %58
+ %62 = fmul float %59, %23
+ %63 = fmul float %60, %24
+ %64 = fadd float %63, %62
+ %65 = fmul float %61, %25
+ %66 = fadd float %64, %65
+ %67 = fsub float -0.000000e+00, %26
+ %68 = fmul float %66, %52
+ %69 = fadd float %68, %67
+ %70 = fmul float %27, %69
+ %71 = fmul float %28, %69
+ %72 = call float @fabs(float %70)
+ %73 = fcmp olt float 0x3EE4F8B580000000, %72
+ %74 = sext i1 %73 to i32
+ %75 = bitcast i32 %74 to float
+ %76 = bitcast float %75 to i32
+ %77 = icmp ne i32 %76, 0
+ br i1 %77, label %IF, label %ENDIF
+
+IF: ; preds = %main_body
+ %78 = fsub float -0.000000e+00, %70
+ %79 = call float @llvm.AMDIL.exp.(float %78)
+ %80 = fsub float -0.000000e+00, %79
+ %81 = fadd float 1.000000e+00, %80
+ %82 = fdiv float 1.000000e+00, %70
+ %83 = fmul float %81, %82
+ %84 = fmul float %32, %83
+ br label %ENDIF
+
+ENDIF: ; preds = %main_body, %IF
+ %temp4.0 = phi float [ %84, %IF ], [ %32, %main_body ]
+ %85 = call float @fabs(float %71)
+ %86 = fcmp olt float 0x3EE4F8B580000000, %85
+ %87 = sext i1 %86 to i32
+ %88 = bitcast i32 %87 to float
+ %89 = bitcast float %88 to i32
+ %90 = icmp ne i32 %89, 0
+ br i1 %90, label %IF25, label %ENDIF24
+
+IF25: ; preds = %ENDIF
+ %91 = fsub float -0.000000e+00, %71
+ %92 = call float @llvm.AMDIL.exp.(float %91)
+ %93 = fsub float -0.000000e+00, %92
+ %94 = fadd float 1.000000e+00, %93
+ %95 = fdiv float 1.000000e+00, %71
+ %96 = fmul float %94, %95
+ %97 = fmul float %36, %96
+ br label %ENDIF24
+
+ENDIF24: ; preds = %ENDIF, %IF25
+ %temp8.0 = phi float [ %97, %IF25 ], [ %36, %ENDIF ]
+ %98 = fmul float %29, %temp4.0
+ %99 = fmul float %30, %temp4.0
+ %100 = fmul float %31, %temp4.0
+ %101 = fmul float %33, %temp8.0
+ %102 = fadd float %101, %98
+ %103 = fmul float %34, %temp8.0
+ %104 = fadd float %103, %99
+ %105 = fmul float %35, %temp8.0
+ %106 = fadd float %105, %100
+ %107 = call float @llvm.pow.f32(float %52, float %22)
+ %108 = fsub float -0.000000e+00, %102
+ %109 = fmul float %108, %107
+ %110 = fsub float -0.000000e+00, %104
+ %111 = fmul float %110, %107
+ %112 = fsub float -0.000000e+00, %106
+ %113 = fmul float %112, %107
+ %114 = call i32 @llvm.SI.packf16(float %109, float %111)
+ %115 = bitcast i32 %114 to float
+ %116 = call i32 @llvm.SI.packf16(float %113, float 1.000000e+00)
+ %117 = bitcast i32 %116 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %115, float %117, float %115, float %117)
+ ret void
+}
+
; We just want to make sure the program doesn't crash
-; CHECK: @loop
+; CHECK-LABEL: @loop
define void @loop(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
main_body:
%20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
- %21 = load <16 x i8> addrspace(2)* %20, !tbaa !0
+ %21 = load <16 x i8> addrspace(2)* %20, !tbaa !1
%22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 0)
%23 = call float @llvm.SI.load.const(<16 x i8> %21, i32 4)
%24 = call float @llvm.SI.load.const(<16 x i8> %21, i32 8)
@@ -79,6 +199,129 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float
attributes #0 = { "ShaderType"="0" }
attributes #1 = { nounwind readnone }
attributes #2 = { readonly }
+attributes #3 = { readnone }
+attributes #4 = { nounwind readonly }
+
+!0 = metadata !{metadata !"const", null}
+!1 = metadata !{metadata !0, metadata !0, i64 0, i32 1}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.rsq(float) #3
+
+; Function Attrs: readnone
+declare float @llvm.AMDIL.exp.(float) #3
+
+; Function Attrs: nounwind readonly
+declare float @llvm.pow.f32(float, float) #4
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.SI.packf16(float, float) #1
+
+; This checks for a bug in the FixSGPRCopies pass where VReg96
+; registers were being identified as an SGPR regclass, which caused
+; an assertion failure.
+
+; CHECK-LABEL: @sample_v3
+; CHECK: IMAGE_SAMPLE
+; CHECK: IMAGE_SAMPLE
+; CHECK: EXP
+; CHECK: S_ENDPGM
+define void @sample_v3([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
-!0 = metadata !{metadata !"const", null, i32 1}
+entry:
+ %21 = getelementptr [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0
+ %22 = load <16 x i8> addrspace(2)* %21, !tbaa !2
+ %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 16)
+ %24 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0
+ %25 = load <32 x i8> addrspace(2)* %24, !tbaa !2
+ %26 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0
+ %27 = load <16 x i8> addrspace(2)* %26, !tbaa !2
+ %28 = fcmp oeq float %23, 0.0
+ br i1 %28, label %if, label %else
+
+if:
+ %val.if = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> <i32 0, i32 0>, <32 x i8> %25, <16 x i8> %27, i32 2)
+ %val.if.0 = extractelement <4 x float> %val.if, i32 0
+ %val.if.1 = extractelement <4 x float> %val.if, i32 1
+ %val.if.2 = extractelement <4 x float> %val.if, i32 2
+ br label %endif
+
+else:
+ %val.else = call <4 x float> @llvm.SI.sample.v2i32(<2 x i32> <i32 1, i32 0>, <32 x i8> %25, <16 x i8> %27, i32 2)
+ %val.else.0 = extractelement <4 x float> %val.else, i32 0
+ %val.else.1 = extractelement <4 x float> %val.else, i32 1
+ %val.else.2 = extractelement <4 x float> %val.else, i32 2
+ br label %endif
+
+endif:
+ %val.0 = phi float [%val.if.0, %if], [%val.else.0, %else]
+ %val.1 = phi float [%val.if.1, %if], [%val.else.1, %else]
+ %val.2 = phi float [%val.if.2, %if], [%val.else.2, %else]
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %val.0, float %val.1, float %val.2, float 0.0)
+ ret void
+}
+
+!2 = metadata !{metadata !"const", null, i32 1}
+
+; CHECK-LABEL: @copy1
+; CHECK: BUFFER_LOAD_DWORD
+; CHECK: V_ADD
+; CHECK: S_ENDPGM
+define void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) {
+entry:
+ %0 = load float addrspace(1)* %in0
+ %1 = fcmp oeq float %0, 0.0
+ br i1 %1, label %if0, label %endif
+
+if0:
+ %2 = bitcast float %0 to i32
+ %3 = fcmp olt float %0, 0.0
+ br i1 %3, label %if1, label %endif
+
+if1:
+ %4 = add i32 %2, 1
+ br label %endif
+
+endif:
+ %5 = phi i32 [ 0, %entry ], [ %2, %if0 ], [ %4, %if1 ]
+ %6 = bitcast i32 %5 to float
+ store float %6, float addrspace(1)* %out
+ ret void
+}
+
+; This test is just checking that we don't crash or hit an assertion failure.
+; CHECK-LABEL: @copy2
+; CHECK: S_ENDPGM
+
+define void @copy2([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+entry:
+ br label %LOOP68
+
+LOOP68:
+ %temp4.7 = phi float [ 0.000000e+00, %entry ], [ %v, %ENDIF69 ]
+ %t = phi i32 [ 20, %entry ], [ %x, %ENDIF69 ]
+ %g = icmp eq i32 0, %t
+ %l = bitcast float %temp4.7 to i32
+ br i1 %g, label %IF70, label %ENDIF69
+
+IF70:
+ %q = icmp ne i32 %l, 13
+ %temp.8 = select i1 %q, float 1.000000e+00, float 0.000000e+00
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp.8, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
+ ret void
+
+ENDIF69:
+ %u = add i32 %l, %t
+ %v = bitcast i32 %u to float
+ %x = add i32 %t, -1
+ br label %LOOP68
+}
+
+attributes #0 = { "ShaderType"="0" }
diff --git a/test/CodeGen/R600/shared-op-cycle.ll b/test/CodeGen/R600/shared-op-cycle.ll
new file mode 100644
index 0000000..0484fc9
--- /dev/null
+++ b/test/CodeGen/R600/shared-op-cycle.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @main
+; CHECK: MULADD_IEEE *
+; CHECK-NOT: MULADD_IEEE *
+
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 {
+ %w0 = extractelement <4 x float> %reg0, i32 3
+ %w1 = extractelement <4 x float> %reg1, i32 3
+ %w2 = extractelement <4 x float> %reg2, i32 3
+ %sq0 = fmul float %w0, %w0
+ %r0 = fadd float %sq0, 2.0
+ %sq1 = fmul float %w1, %w1
+ %r1 = fadd float %sq1, 2.0
+ %sq2 = fmul float %w2, %w2
+ %r2 = fadd float %sq2, 2.0
+ %v0 = insertelement <4 x float> undef, float %r0, i32 0
+ %v1 = insertelement <4 x float> %v0, float %r1, i32 1
+ %v2 = insertelement <4 x float> %v1, float %r2, i32 2
+ %res = call float @llvm.AMDGPU.dp4(<4 x float> %v2, <4 x float> %v2)
+ %vecres = insertelement <4 x float> undef, float %res, i32 0
+ call void @llvm.R600.store.swizzle(<4 x float> %vecres, i32 0, i32 2)
+ ret void
+}
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" }
+attributes #1 = { readnone }
\ No newline at end of file
diff --git a/test/CodeGen/R600/shl.ll b/test/CodeGen/R600/shl.ll
index d99e325..4a6aab4 100644
--- a/test/CodeGen/R600/shl.ll
+++ b/test/CodeGen/R600/shl.ll
@@ -1,13 +1,13 @@
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
;EG-CHECK: @shl_v2i32
;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;SI-CHECK: @shl_v2i32
-;SI-CHECK: V_LSHL_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_LSHL_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -25,10 +25,10 @@ define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in
;EG-CHECK: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;SI-CHECK: @shl_v4i32
-;SI-CHECK: V_LSHL_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_LSHL_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_LSHL_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_LSHL_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_LSHL_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/R600/short-args.ll b/test/CodeGen/R600/short-args.ll
deleted file mode 100644
index 20d0ae4..0000000
--- a/test/CodeGen/R600/short-args.ll
+++ /dev/null
@@ -1,69 +0,0 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG-CHECK
-; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
-
-; EG-CHECK: @i8_arg
-; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK: BUFFER_LOAD_UBYTE
-
-define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
-entry:
- %0 = zext i8 %in to i32
- store i32 %0, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; EG-CHECK: @i8_zext_arg
-; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK: S_LOAD_DWORD SGPR{{[0-9]}}, SGPR0_SGPR1, 11
-
-define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
-entry:
- %0 = zext i8 %in to i32
- store i32 %0, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; EG-CHECK: @i8_sext_arg
-; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK: S_LOAD_DWORD SGPR{{[0-9]}}, SGPR0_SGPR1, 11
-
-define void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
-entry:
- %0 = sext i8 %in to i32
- store i32 %0, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; EG-CHECK: @i16_arg
-; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK: BUFFER_LOAD_USHORT
-
-define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
-entry:
- %0 = zext i16 %in to i32
- store i32 %0, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; EG-CHECK: @i16_zext_arg
-; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK: S_LOAD_DWORD SGPR{{[0-9]}}, SGPR0_SGPR1, 11
-
-define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
-entry:
- %0 = zext i16 %in to i32
- store i32 %0, i32 addrspace(1)* %out, align 4
- ret void
-}
-
-; EG-CHECK: @i16_sext_arg
-; EG-CHECK: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-; SI-CHECK: S_LOAD_DWORD SGPR{{[0-9]}}, SGPR0_SGPR1, 11
-
-define void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
-entry:
- %0 = sext i16 %in to i32
- store i32 %0, i32 addrspace(1)* %out, align 4
- ret void
-}
diff --git a/test/CodeGen/R600/si-annotate-cf-assertion.ll b/test/CodeGen/R600/si-annotate-cf-assertion.ll
new file mode 100644
index 0000000..9886fe9
--- /dev/null
+++ b/test/CodeGen/R600/si-annotate-cf-assertion.ll
@@ -0,0 +1,23 @@
+; XFAIL: *
+; RUN: llc -march=r600 -mcpu=SI -asm-verbose=false < %s | FileCheck %s
+
+
+define void @test(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
+; CHECK-LABEL: @test:
+
+entry:
+ switch i32 %x, label %sw.default [
+ i32 0, label %sw.bb
+ i32 60, label %sw.bb
+ ]
+
+sw.bb:
+ unreachable
+
+sw.default:
+ unreachable
+
+sw.epilog:
+ ret void
+}
+
diff --git a/test/CodeGen/R600/si-lod-bias.ll b/test/CodeGen/R600/si-lod-bias.ll
new file mode 100644
index 0000000..8d7a79c
--- /dev/null
+++ b/test/CodeGen/R600/si-lod-bias.ll
@@ -0,0 +1,51 @@
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+
+; This shader has the potential to generate illegal VGPR to SGPR copies if
+; the wrong register class is used for the REG_SEQUENCE instructions.
+
+; CHECK: @main
+; CHECK: IMAGE_SAMPLE_B v{{\[[0-9]:[0-9]\]}}, 15, 0, 0, 0, 0, 0, 0, 0, v{{\[[0-9]:[0-9]\]}}
+
+define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+ %20 = getelementptr <16 x i8> addrspace(2)* %0, i32 0
+ %21 = load <16 x i8> addrspace(2)* %20, !tbaa !1
+ %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
+ %23 = getelementptr <32 x i8> addrspace(2)* %2, i32 0
+ %24 = load <32 x i8> addrspace(2)* %23, !tbaa !1
+ %25 = getelementptr <16 x i8> addrspace(2)* %1, i32 0
+ %26 = load <16 x i8> addrspace(2)* %25, !tbaa !1
+ %27 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %5)
+ %28 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %5)
+ %29 = bitcast float %22 to i32
+ %30 = bitcast float %27 to i32
+ %31 = bitcast float %28 to i32
+ %32 = insertelement <4 x i32> undef, i32 %29, i32 0
+ %33 = insertelement <4 x i32> %32, i32 %30, i32 1
+ %34 = insertelement <4 x i32> %33, i32 %31, i32 2
+ %35 = insertelement <4 x i32> %34, i32 undef, i32 3
+ %36 = call <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32> %35, <32 x i8> %24, <16 x i8> %26, i32 2)
+ %37 = extractelement <4 x float> %36, i32 0
+ %38 = extractelement <4 x float> %36, i32 1
+ %39 = extractelement <4 x float> %36, i32 2
+ %40 = extractelement <4 x float> %36, i32 3
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %37, float %38, float %39, float %40)
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sampleb.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
+
+!0 = metadata !{metadata !"const", null}
+!1 = metadata !{metadata !0, metadata !0, i64 0, i32 1}
diff --git a/test/CodeGen/R600/si-sgpr-spill.ll b/test/CodeGen/R600/si-sgpr-spill.ll
new file mode 100644
index 0000000..05c5e31
--- /dev/null
+++ b/test/CodeGen/R600/si-sgpr-spill.ll
@@ -0,0 +1,692 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck %s
+
+; XXX: Enable when spilling is supported
+; XFAIL: *
+
+; These tests check that the compiler won't crash when it needs to spill
+; SGPRs.
+
+; CHECK-LABEL: @main
+; Writing to M0 from an SMRD instruction will hang the GPU.
+; CHECK-NOT: S_BUFFER_LOAD_DWORD m0
+; CHECK: S_ENDPGM
+@ddxy_lds = external addrspace(3) global [64 x i32]
+
+define void @main([17 x <16 x i8>] addrspace(2)* byval, [32 x <16 x i8>] addrspace(2)* byval, [16 x <32 x i8>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
+main_body:
+ %21 = getelementptr [17 x <16 x i8>] addrspace(2)* %0, i64 0, i32 0
+ %22 = load <16 x i8> addrspace(2)* %21, !tbaa !0
+ %23 = call float @llvm.SI.load.const(<16 x i8> %22, i32 96)
+ %24 = call float @llvm.SI.load.const(<16 x i8> %22, i32 100)
+ %25 = call float @llvm.SI.load.const(<16 x i8> %22, i32 104)
+ %26 = call float @llvm.SI.load.const(<16 x i8> %22, i32 112)
+ %27 = call float @llvm.SI.load.const(<16 x i8> %22, i32 116)
+ %28 = call float @llvm.SI.load.const(<16 x i8> %22, i32 120)
+ %29 = call float @llvm.SI.load.const(<16 x i8> %22, i32 128)
+ %30 = call float @llvm.SI.load.const(<16 x i8> %22, i32 132)
+ %31 = call float @llvm.SI.load.const(<16 x i8> %22, i32 140)
+ %32 = call float @llvm.SI.load.const(<16 x i8> %22, i32 144)
+ %33 = call float @llvm.SI.load.const(<16 x i8> %22, i32 160)
+ %34 = call float @llvm.SI.load.const(<16 x i8> %22, i32 176)
+ %35 = call float @llvm.SI.load.const(<16 x i8> %22, i32 180)
+ %36 = call float @llvm.SI.load.const(<16 x i8> %22, i32 184)
+ %37 = call float @llvm.SI.load.const(<16 x i8> %22, i32 192)
+ %38 = call float @llvm.SI.load.const(<16 x i8> %22, i32 196)
+ %39 = call float @llvm.SI.load.const(<16 x i8> %22, i32 200)
+ %40 = call float @llvm.SI.load.const(<16 x i8> %22, i32 208)
+ %41 = call float @llvm.SI.load.const(<16 x i8> %22, i32 212)
+ %42 = call float @llvm.SI.load.const(<16 x i8> %22, i32 216)
+ %43 = call float @llvm.SI.load.const(<16 x i8> %22, i32 224)
+ %44 = call float @llvm.SI.load.const(<16 x i8> %22, i32 240)
+ %45 = call float @llvm.SI.load.const(<16 x i8> %22, i32 244)
+ %46 = call float @llvm.SI.load.const(<16 x i8> %22, i32 248)
+ %47 = call float @llvm.SI.load.const(<16 x i8> %22, i32 256)
+ %48 = call float @llvm.SI.load.const(<16 x i8> %22, i32 272)
+ %49 = call float @llvm.SI.load.const(<16 x i8> %22, i32 276)
+ %50 = call float @llvm.SI.load.const(<16 x i8> %22, i32 280)
+ %51 = call float @llvm.SI.load.const(<16 x i8> %22, i32 288)
+ %52 = call float @llvm.SI.load.const(<16 x i8> %22, i32 292)
+ %53 = call float @llvm.SI.load.const(<16 x i8> %22, i32 296)
+ %54 = call float @llvm.SI.load.const(<16 x i8> %22, i32 304)
+ %55 = call float @llvm.SI.load.const(<16 x i8> %22, i32 308)
+ %56 = call float @llvm.SI.load.const(<16 x i8> %22, i32 312)
+ %57 = call float @llvm.SI.load.const(<16 x i8> %22, i32 368)
+ %58 = call float @llvm.SI.load.const(<16 x i8> %22, i32 372)
+ %59 = call float @llvm.SI.load.const(<16 x i8> %22, i32 376)
+ %60 = call float @llvm.SI.load.const(<16 x i8> %22, i32 384)
+ %61 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 0
+ %62 = load <32 x i8> addrspace(2)* %61, !tbaa !0
+ %63 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 0
+ %64 = load <16 x i8> addrspace(2)* %63, !tbaa !0
+ %65 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 1
+ %66 = load <32 x i8> addrspace(2)* %65, !tbaa !0
+ %67 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 1
+ %68 = load <16 x i8> addrspace(2)* %67, !tbaa !0
+ %69 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 2
+ %70 = load <32 x i8> addrspace(2)* %69, !tbaa !0
+ %71 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 2
+ %72 = load <16 x i8> addrspace(2)* %71, !tbaa !0
+ %73 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 3
+ %74 = load <32 x i8> addrspace(2)* %73, !tbaa !0
+ %75 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 3
+ %76 = load <16 x i8> addrspace(2)* %75, !tbaa !0
+ %77 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 4
+ %78 = load <32 x i8> addrspace(2)* %77, !tbaa !0
+ %79 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 4
+ %80 = load <16 x i8> addrspace(2)* %79, !tbaa !0
+ %81 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 5
+ %82 = load <32 x i8> addrspace(2)* %81, !tbaa !0
+ %83 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 5
+ %84 = load <16 x i8> addrspace(2)* %83, !tbaa !0
+ %85 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 6
+ %86 = load <32 x i8> addrspace(2)* %85, !tbaa !0
+ %87 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 6
+ %88 = load <16 x i8> addrspace(2)* %87, !tbaa !0
+ %89 = getelementptr [16 x <32 x i8>] addrspace(2)* %2, i64 0, i32 7
+ %90 = load <32 x i8> addrspace(2)* %89, !tbaa !0
+ %91 = getelementptr [32 x <16 x i8>] addrspace(2)* %1, i64 0, i32 7
+ %92 = load <16 x i8> addrspace(2)* %91, !tbaa !0
+ %93 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %4, <2 x i32> %6)
+ %94 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %4, <2 x i32> %6)
+ %95 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %4, <2 x i32> %6)
+ %96 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %4, <2 x i32> %6)
+ %97 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %4, <2 x i32> %6)
+ %98 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %4, <2 x i32> %6)
+ %99 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %4, <2 x i32> %6)
+ %100 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %4, <2 x i32> %6)
+ %101 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %4, <2 x i32> %6)
+ %102 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %4, <2 x i32> %6)
+ %103 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %4, <2 x i32> %6)
+ %104 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %4, <2 x i32> %6)
+ %105 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %4, <2 x i32> %6)
+ %106 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %4, <2 x i32> %6)
+ %107 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %4, <2 x i32> %6)
+ %108 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %4, <2 x i32> %6)
+ %109 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %4, <2 x i32> %6)
+ %110 = call i32 @llvm.SI.tid()
+ %111 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %110
+ %112 = bitcast float %93 to i32
+ store i32 %112, i32 addrspace(3)* %111
+ %113 = bitcast float %94 to i32
+ store i32 %113, i32 addrspace(3)* %111
+ %114 = call i32 @llvm.SI.tid()
+ %115 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %114
+ %116 = and i32 %114, -4
+ %117 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %116
+ %118 = add i32 %116, 1
+ %119 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %118
+ %120 = bitcast float %93 to i32
+ store i32 %120, i32 addrspace(3)* %115
+ %121 = load i32 addrspace(3)* %117
+ %122 = bitcast i32 %121 to float
+ %123 = load i32 addrspace(3)* %119
+ %124 = bitcast i32 %123 to float
+ %125 = fsub float %124, %122
+ %126 = bitcast float %94 to i32
+ store i32 %126, i32 addrspace(3)* %115
+ %127 = load i32 addrspace(3)* %117
+ %128 = bitcast i32 %127 to float
+ %129 = load i32 addrspace(3)* %119
+ %130 = bitcast i32 %129 to float
+ %131 = fsub float %130, %128
+ %132 = insertelement <4 x float> undef, float %125, i32 0
+ %133 = insertelement <4 x float> %132, float %131, i32 1
+ %134 = insertelement <4 x float> %133, float %131, i32 2
+ %135 = insertelement <4 x float> %134, float %131, i32 3
+ %136 = extractelement <4 x float> %135, i32 0
+ %137 = extractelement <4 x float> %135, i32 1
+ %138 = fmul float %60, %93
+ %139 = fmul float %60, %94
+ %140 = fmul float %60, %94
+ %141 = fmul float %60, %94
+ %142 = call i32 @llvm.SI.tid()
+ %143 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %142
+ %144 = bitcast float %138 to i32
+ store i32 %144, i32 addrspace(3)* %143
+ %145 = bitcast float %139 to i32
+ store i32 %145, i32 addrspace(3)* %143
+ %146 = bitcast float %140 to i32
+ store i32 %146, i32 addrspace(3)* %143
+ %147 = bitcast float %141 to i32
+ store i32 %147, i32 addrspace(3)* %143
+ %148 = call i32 @llvm.SI.tid()
+ %149 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %148
+ %150 = and i32 %148, -4
+ %151 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %150
+ %152 = add i32 %150, 2
+ %153 = getelementptr [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %152
+ %154 = bitcast float %138 to i32
+ store i32 %154, i32 addrspace(3)* %149
+ %155 = load i32 addrspace(3)* %151
+ %156 = bitcast i32 %155 to float
+ %157 = load i32 addrspace(3)* %153
+ %158 = bitcast i32 %157 to float
+ %159 = fsub float %158, %156
+ %160 = bitcast float %139 to i32
+ store i32 %160, i32 addrspace(3)* %149
+ %161 = load i32 addrspace(3)* %151
+ %162 = bitcast i32 %161 to float
+ %163 = load i32 addrspace(3)* %153
+ %164 = bitcast i32 %163 to float
+ %165 = fsub float %164, %162
+ %166 = bitcast float %140 to i32
+ store i32 %166, i32 addrspace(3)* %149
+ %167 = load i32 addrspace(3)* %151
+ %168 = bitcast i32 %167 to float
+ %169 = load i32 addrspace(3)* %153
+ %170 = bitcast i32 %169 to float
+ %171 = fsub float %170, %168
+ %172 = bitcast float %141 to i32
+ store i32 %172, i32 addrspace(3)* %149
+ %173 = load i32 addrspace(3)* %151
+ %174 = bitcast i32 %173 to float
+ %175 = load i32 addrspace(3)* %153
+ %176 = bitcast i32 %175 to float
+ %177 = fsub float %176, %174
+ %178 = insertelement <4 x float> undef, float %159, i32 0
+ %179 = insertelement <4 x float> %178, float %165, i32 1
+ %180 = insertelement <4 x float> %179, float %171, i32 2
+ %181 = insertelement <4 x float> %180, float %177, i32 3
+ %182 = extractelement <4 x float> %181, i32 0
+ %183 = extractelement <4 x float> %181, i32 1
+ %184 = fdiv float 1.000000e+00, %97
+ %185 = fmul float %33, %184
+ %186 = fcmp uge float 1.000000e+00, %185
+ %187 = select i1 %186, float %185, float 1.000000e+00
+ %188 = fmul float %187, %30
+ %189 = call float @ceil(float %188)
+ %190 = fcmp uge float 3.000000e+00, %189
+ %191 = select i1 %190, float 3.000000e+00, float %189
+ %192 = fdiv float 1.000000e+00, %191
+ %193 = fdiv float 1.000000e+00, %30
+ %194 = fmul float %191, %193
+ %195 = fmul float %31, %194
+ %196 = fmul float %95, %95
+ %197 = fmul float %96, %96
+ %198 = fadd float %197, %196
+ %199 = fmul float %97, %97
+ %200 = fadd float %198, %199
+ %201 = call float @llvm.AMDGPU.rsq(float %200)
+ %202 = fmul float %95, %201
+ %203 = fmul float %96, %201
+ %204 = fmul float %202, %29
+ %205 = fmul float %203, %29
+ %206 = fmul float %204, -1.000000e+00
+ %207 = fmul float %205, 1.000000e+00
+ %208 = fmul float %206, %32
+ %209 = fmul float %207, %32
+ %210 = fsub float -0.000000e+00, %208
+ %211 = fadd float %93, %210
+ %212 = fsub float -0.000000e+00, %209
+ %213 = fadd float %94, %212
+ %214 = fmul float %206, %192
+ %215 = fmul float %207, %192
+ %216 = fmul float -1.000000e+00, %192
+ %217 = bitcast float %136 to i32
+ %218 = bitcast float %182 to i32
+ %219 = bitcast float %137 to i32
+ %220 = bitcast float %183 to i32
+ %221 = insertelement <8 x i32> undef, i32 %217, i32 0
+ %222 = insertelement <8 x i32> %221, i32 %218, i32 1
+ %223 = insertelement <8 x i32> %222, i32 %219, i32 2
+ %224 = insertelement <8 x i32> %223, i32 %220, i32 3
+ br label %LOOP
+
+LOOP: ; preds = %ENDIF, %main_body
+ %temp24.0 = phi float [ 1.000000e+00, %main_body ], [ %258, %ENDIF ]
+ %temp28.0 = phi float [ %211, %main_body ], [ %253, %ENDIF ]
+ %temp29.0 = phi float [ %213, %main_body ], [ %255, %ENDIF ]
+ %temp30.0 = phi float [ 1.000000e+00, %main_body ], [ %257, %ENDIF ]
+ %225 = fcmp oge float %temp24.0, %191
+ %226 = sext i1 %225 to i32
+ %227 = bitcast i32 %226 to float
+ %228 = bitcast float %227 to i32
+ %229 = icmp ne i32 %228, 0
+ br i1 %229, label %IF, label %ENDIF
+
+IF: ; preds = %LOOP
+ %230 = bitcast float %136 to i32
+ %231 = bitcast float %182 to i32
+ %232 = bitcast float %137 to i32
+ %233 = bitcast float %183 to i32
+ %234 = insertelement <8 x i32> undef, i32 %230, i32 0
+ %235 = insertelement <8 x i32> %234, i32 %231, i32 1
+ %236 = insertelement <8 x i32> %235, i32 %232, i32 2
+ %237 = insertelement <8 x i32> %236, i32 %233, i32 3
+ br label %LOOP65
+
+ENDIF: ; preds = %LOOP
+ %238 = bitcast float %temp28.0 to i32
+ %239 = bitcast float %temp29.0 to i32
+ %240 = insertelement <8 x i32> %224, i32 %238, i32 4
+ %241 = insertelement <8 x i32> %240, i32 %239, i32 5
+ %242 = insertelement <8 x i32> %241, i32 undef, i32 6
+ %243 = insertelement <8 x i32> %242, i32 undef, i32 7
+ %244 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %243, <32 x i8> %62, <16 x i8> %64, i32 2)
+ %245 = extractelement <4 x float> %244, i32 3
+ %246 = fcmp oge float %temp30.0, %245
+ %247 = sext i1 %246 to i32
+ %248 = bitcast i32 %247 to float
+ %249 = bitcast float %248 to i32
+ %250 = and i32 %249, 1065353216
+ %251 = bitcast i32 %250 to float
+ %252 = fmul float %214, %251
+ %253 = fadd float %252, %temp28.0
+ %254 = fmul float %215, %251
+ %255 = fadd float %254, %temp29.0
+ %256 = fmul float %216, %251
+ %257 = fadd float %256, %temp30.0
+ %258 = fadd float %temp24.0, 1.000000e+00
+ br label %LOOP
+
+LOOP65: ; preds = %ENDIF66, %IF
+ %temp24.1 = phi float [ 0.000000e+00, %IF ], [ %610, %ENDIF66 ]
+ %temp28.1 = phi float [ %temp28.0, %IF ], [ %605, %ENDIF66 ]
+ %temp29.1 = phi float [ %temp29.0, %IF ], [ %607, %ENDIF66 ]
+ %temp30.1 = phi float [ %temp30.0, %IF ], [ %609, %ENDIF66 ]
+ %temp32.0 = phi float [ 1.000000e+00, %IF ], [ %611, %ENDIF66 ]
+ %259 = fcmp oge float %temp24.1, %195
+ %260 = sext i1 %259 to i32
+ %261 = bitcast i32 %260 to float
+ %262 = bitcast float %261 to i32
+ %263 = icmp ne i32 %262, 0
+ br i1 %263, label %IF67, label %ENDIF66
+
+IF67: ; preds = %LOOP65
+ %264 = bitcast float %136 to i32
+ %265 = bitcast float %182 to i32
+ %266 = bitcast float %137 to i32
+ %267 = bitcast float %183 to i32
+ %268 = bitcast float %temp28.1 to i32
+ %269 = bitcast float %temp29.1 to i32
+ %270 = insertelement <8 x i32> undef, i32 %264, i32 0
+ %271 = insertelement <8 x i32> %270, i32 %265, i32 1
+ %272 = insertelement <8 x i32> %271, i32 %266, i32 2
+ %273 = insertelement <8 x i32> %272, i32 %267, i32 3
+ %274 = insertelement <8 x i32> %273, i32 %268, i32 4
+ %275 = insertelement <8 x i32> %274, i32 %269, i32 5
+ %276 = insertelement <8 x i32> %275, i32 undef, i32 6
+ %277 = insertelement <8 x i32> %276, i32 undef, i32 7
+ %278 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %277, <32 x i8> %66, <16 x i8> %68, i32 2)
+ %279 = extractelement <4 x float> %278, i32 0
+ %280 = extractelement <4 x float> %278, i32 1
+ %281 = extractelement <4 x float> %278, i32 2
+ %282 = extractelement <4 x float> %278, i32 3
+ %283 = fmul float %282, %47
+ %284 = bitcast float %136 to i32
+ %285 = bitcast float %182 to i32
+ %286 = bitcast float %137 to i32
+ %287 = bitcast float %183 to i32
+ %288 = bitcast float %temp28.1 to i32
+ %289 = bitcast float %temp29.1 to i32
+ %290 = insertelement <8 x i32> undef, i32 %284, i32 0
+ %291 = insertelement <8 x i32> %290, i32 %285, i32 1
+ %292 = insertelement <8 x i32> %291, i32 %286, i32 2
+ %293 = insertelement <8 x i32> %292, i32 %287, i32 3
+ %294 = insertelement <8 x i32> %293, i32 %288, i32 4
+ %295 = insertelement <8 x i32> %294, i32 %289, i32 5
+ %296 = insertelement <8 x i32> %295, i32 undef, i32 6
+ %297 = insertelement <8 x i32> %296, i32 undef, i32 7
+ %298 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %297, <32 x i8> %82, <16 x i8> %84, i32 2)
+ %299 = extractelement <4 x float> %298, i32 0
+ %300 = extractelement <4 x float> %298, i32 1
+ %301 = extractelement <4 x float> %298, i32 2
+ %302 = bitcast float %136 to i32
+ %303 = bitcast float %182 to i32
+ %304 = bitcast float %137 to i32
+ %305 = bitcast float %183 to i32
+ %306 = bitcast float %temp28.1 to i32
+ %307 = bitcast float %temp29.1 to i32
+ %308 = insertelement <8 x i32> undef, i32 %302, i32 0
+ %309 = insertelement <8 x i32> %308, i32 %303, i32 1
+ %310 = insertelement <8 x i32> %309, i32 %304, i32 2
+ %311 = insertelement <8 x i32> %310, i32 %305, i32 3
+ %312 = insertelement <8 x i32> %311, i32 %306, i32 4
+ %313 = insertelement <8 x i32> %312, i32 %307, i32 5
+ %314 = insertelement <8 x i32> %313, i32 undef, i32 6
+ %315 = insertelement <8 x i32> %314, i32 undef, i32 7
+ %316 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %315, <32 x i8> %78, <16 x i8> %80, i32 2)
+ %317 = extractelement <4 x float> %316, i32 0
+ %318 = extractelement <4 x float> %316, i32 1
+ %319 = extractelement <4 x float> %316, i32 2
+ %320 = fmul float %317, %23
+ %321 = fmul float %318, %24
+ %322 = fmul float %319, %25
+ %323 = fmul float %299, %26
+ %324 = fadd float %323, %320
+ %325 = fmul float %300, %27
+ %326 = fadd float %325, %321
+ %327 = fmul float %301, %28
+ %328 = fadd float %327, %322
+ %329 = fadd float %279, %324
+ %330 = fadd float %280, %326
+ %331 = fadd float %281, %328
+ %332 = bitcast float %136 to i32
+ %333 = bitcast float %182 to i32
+ %334 = bitcast float %137 to i32
+ %335 = bitcast float %183 to i32
+ %336 = bitcast float %temp28.1 to i32
+ %337 = bitcast float %temp29.1 to i32
+ %338 = insertelement <8 x i32> undef, i32 %332, i32 0
+ %339 = insertelement <8 x i32> %338, i32 %333, i32 1
+ %340 = insertelement <8 x i32> %339, i32 %334, i32 2
+ %341 = insertelement <8 x i32> %340, i32 %335, i32 3
+ %342 = insertelement <8 x i32> %341, i32 %336, i32 4
+ %343 = insertelement <8 x i32> %342, i32 %337, i32 5
+ %344 = insertelement <8 x i32> %343, i32 undef, i32 6
+ %345 = insertelement <8 x i32> %344, i32 undef, i32 7
+ %346 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %345, <32 x i8> %62, <16 x i8> %64, i32 2)
+ %347 = extractelement <4 x float> %346, i32 0
+ %348 = extractelement <4 x float> %346, i32 1
+ %349 = extractelement <4 x float> %346, i32 2
+ %350 = fadd float %347, -5.000000e-01
+ %351 = fadd float %348, -5.000000e-01
+ %352 = fadd float %349, -5.000000e-01
+ %353 = fmul float %350, %350
+ %354 = fmul float %351, %351
+ %355 = fadd float %354, %353
+ %356 = fmul float %352, %352
+ %357 = fadd float %355, %356
+ %358 = call float @llvm.AMDGPU.rsq(float %357)
+ %359 = fmul float %350, %358
+ %360 = fmul float %351, %358
+ %361 = fmul float %352, %358
+ %362 = bitcast float %136 to i32
+ %363 = bitcast float %182 to i32
+ %364 = bitcast float %137 to i32
+ %365 = bitcast float %183 to i32
+ %366 = bitcast float %temp28.1 to i32
+ %367 = bitcast float %temp29.1 to i32
+ %368 = insertelement <8 x i32> undef, i32 %362, i32 0
+ %369 = insertelement <8 x i32> %368, i32 %363, i32 1
+ %370 = insertelement <8 x i32> %369, i32 %364, i32 2
+ %371 = insertelement <8 x i32> %370, i32 %365, i32 3
+ %372 = insertelement <8 x i32> %371, i32 %366, i32 4
+ %373 = insertelement <8 x i32> %372, i32 %367, i32 5
+ %374 = insertelement <8 x i32> %373, i32 undef, i32 6
+ %375 = insertelement <8 x i32> %374, i32 undef, i32 7
+ %376 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %375, <32 x i8> %70, <16 x i8> %72, i32 2)
+ %377 = extractelement <4 x float> %376, i32 0
+ %378 = extractelement <4 x float> %376, i32 1
+ %379 = extractelement <4 x float> %376, i32 2
+ %380 = extractelement <4 x float> %376, i32 3
+ %381 = fsub float -0.000000e+00, %95
+ %382 = fsub float -0.000000e+00, %96
+ %383 = fsub float -0.000000e+00, %97
+ %384 = fmul float %359, %381
+ %385 = fmul float %360, %382
+ %386 = fadd float %385, %384
+ %387 = fmul float %361, %383
+ %388 = fadd float %386, %387
+ %389 = fmul float %388, %359
+ %390 = fmul float %388, %360
+ %391 = fmul float %388, %361
+ %392 = fmul float 2.000000e+00, %389
+ %393 = fmul float 2.000000e+00, %390
+ %394 = fmul float 2.000000e+00, %391
+ %395 = fsub float -0.000000e+00, %392
+ %396 = fadd float %381, %395
+ %397 = fsub float -0.000000e+00, %393
+ %398 = fadd float %382, %397
+ %399 = fsub float -0.000000e+00, %394
+ %400 = fadd float %383, %399
+ %401 = fmul float %396, %98
+ %402 = fmul float %396, %99
+ %403 = fmul float %396, %100
+ %404 = fmul float %398, %101
+ %405 = fadd float %404, %401
+ %406 = fmul float %398, %102
+ %407 = fadd float %406, %402
+ %408 = fmul float %398, %103
+ %409 = fadd float %408, %403
+ %410 = fmul float %400, %104
+ %411 = fadd float %410, %405
+ %412 = fmul float %400, %105
+ %413 = fadd float %412, %407
+ %414 = fmul float %400, %106
+ %415 = fadd float %414, %409
+ %416 = bitcast float %136 to i32
+ %417 = bitcast float %182 to i32
+ %418 = bitcast float %137 to i32
+ %419 = bitcast float %183 to i32
+ %420 = bitcast float %temp28.1 to i32
+ %421 = bitcast float %temp29.1 to i32
+ %422 = insertelement <8 x i32> undef, i32 %416, i32 0
+ %423 = insertelement <8 x i32> %422, i32 %417, i32 1
+ %424 = insertelement <8 x i32> %423, i32 %418, i32 2
+ %425 = insertelement <8 x i32> %424, i32 %419, i32 3
+ %426 = insertelement <8 x i32> %425, i32 %420, i32 4
+ %427 = insertelement <8 x i32> %426, i32 %421, i32 5
+ %428 = insertelement <8 x i32> %427, i32 undef, i32 6
+ %429 = insertelement <8 x i32> %428, i32 undef, i32 7
+ %430 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %429, <32 x i8> %86, <16 x i8> %88, i32 2)
+ %431 = extractelement <4 x float> %430, i32 0
+ %432 = extractelement <4 x float> %430, i32 1
+ %433 = extractelement <4 x float> %430, i32 2
+ %434 = fmul float %48, %411
+ %435 = fmul float %49, %411
+ %436 = fmul float %50, %411
+ %437 = fmul float %51, %413
+ %438 = fadd float %437, %434
+ %439 = fmul float %52, %413
+ %440 = fadd float %439, %435
+ %441 = fmul float %53, %413
+ %442 = fadd float %441, %436
+ %443 = fmul float %54, %415
+ %444 = fadd float %443, %438
+ %445 = fmul float %55, %415
+ %446 = fadd float %445, %440
+ %447 = fmul float %56, %415
+ %448 = fadd float %447, %442
+ %449 = insertelement <4 x float> undef, float %444, i32 0
+ %450 = insertelement <4 x float> %449, float %446, i32 1
+ %451 = insertelement <4 x float> %450, float %448, i32 2
+ %452 = insertelement <4 x float> %451, float %195, i32 3
+ %453 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %452)
+ %454 = extractelement <4 x float> %453, i32 0
+ %455 = extractelement <4 x float> %453, i32 1
+ %456 = extractelement <4 x float> %453, i32 2
+ %457 = extractelement <4 x float> %453, i32 3
+ %458 = call float @fabs(float %456)
+ %459 = fdiv float 1.000000e+00, %458
+ %460 = fmul float %454, %459
+ %461 = fadd float %460, 1.500000e+00
+ %462 = fmul float %455, %459
+ %463 = fadd float %462, 1.500000e+00
+ %464 = bitcast float %463 to i32
+ %465 = bitcast float %461 to i32
+ %466 = bitcast float %457 to i32
+ %467 = insertelement <4 x i32> undef, i32 %464, i32 0
+ %468 = insertelement <4 x i32> %467, i32 %465, i32 1
+ %469 = insertelement <4 x i32> %468, i32 %466, i32 2
+ %470 = insertelement <4 x i32> %469, i32 undef, i32 3
+ %471 = call <4 x float> @llvm.SI.sample.v4i32(<4 x i32> %470, <32 x i8> %90, <16 x i8> %92, i32 4)
+ %472 = extractelement <4 x float> %471, i32 0
+ %473 = extractelement <4 x float> %471, i32 1
+ %474 = extractelement <4 x float> %471, i32 2
+ %475 = fmul float %431, %472
+ %476 = fadd float %475, %329
+ %477 = fmul float %432, %473
+ %478 = fadd float %477, %330
+ %479 = fmul float %433, %474
+ %480 = fadd float %479, %331
+ %481 = fmul float %107, %107
+ %482 = fmul float %108, %108
+ %483 = fadd float %482, %481
+ %484 = fmul float %109, %109
+ %485 = fadd float %483, %484
+ %486 = call float @llvm.AMDGPU.rsq(float %485)
+ %487 = fmul float %107, %486
+ %488 = fmul float %108, %486
+ %489 = fmul float %109, %486
+ %490 = fmul float %377, %40
+ %491 = fmul float %378, %41
+ %492 = fmul float %379, %42
+ %493 = fmul float %359, %487
+ %494 = fmul float %360, %488
+ %495 = fadd float %494, %493
+ %496 = fmul float %361, %489
+ %497 = fadd float %495, %496
+ %498 = fmul float %497, %359
+ %499 = fmul float %497, %360
+ %500 = fmul float %497, %361
+ %501 = fmul float 2.000000e+00, %498
+ %502 = fmul float 2.000000e+00, %499
+ %503 = fmul float 2.000000e+00, %500
+ %504 = fsub float -0.000000e+00, %501
+ %505 = fadd float %487, %504
+ %506 = fsub float -0.000000e+00, %502
+ %507 = fadd float %488, %506
+ %508 = fsub float -0.000000e+00, %503
+ %509 = fadd float %489, %508
+ %510 = fmul float %95, %95
+ %511 = fmul float %96, %96
+ %512 = fadd float %511, %510
+ %513 = fmul float %97, %97
+ %514 = fadd float %512, %513
+ %515 = call float @llvm.AMDGPU.rsq(float %514)
+ %516 = fmul float %95, %515
+ %517 = fmul float %96, %515
+ %518 = fmul float %97, %515
+ %519 = fmul float %505, %516
+ %520 = fmul float %507, %517
+ %521 = fadd float %520, %519
+ %522 = fmul float %509, %518
+ %523 = fadd float %521, %522
+ %524 = fsub float -0.000000e+00, %523
+ %525 = fcmp uge float %524, 0.000000e+00
+ %526 = select i1 %525, float %524, float 0.000000e+00
+ %527 = fmul float %43, %380
+ %528 = fadd float %527, 1.000000e+00
+ %529 = call float @llvm.pow.f32(float %526, float %528)
+ %530 = fmul float %476, %37
+ %531 = fmul float %478, %38
+ %532 = fmul float %480, %39
+ %533 = fmul float %359, %487
+ %534 = fmul float %360, %488
+ %535 = fadd float %534, %533
+ %536 = fmul float %361, %489
+ %537 = fadd float %535, %536
+ %538 = fcmp uge float %537, 0.000000e+00
+ %539 = select i1 %538, float %537, float 0.000000e+00
+ %540 = fmul float %530, %539
+ %541 = fmul float %531, %539
+ %542 = fmul float %532, %539
+ %543 = fmul float %490, %529
+ %544 = fadd float %543, %540
+ %545 = fmul float %491, %529
+ %546 = fadd float %545, %541
+ %547 = fmul float %492, %529
+ %548 = fadd float %547, %542
+ %549 = fmul float %476, %34
+ %550 = fmul float %478, %35
+ %551 = fmul float %480, %36
+ %552 = fmul float %544, %57
+ %553 = fadd float %552, %549
+ %554 = fmul float %546, %58
+ %555 = fadd float %554, %550
+ %556 = fmul float %548, %59
+ %557 = fadd float %556, %551
+ %558 = bitcast float %136 to i32
+ %559 = bitcast float %182 to i32
+ %560 = bitcast float %137 to i32
+ %561 = bitcast float %183 to i32
+ %562 = bitcast float %temp28.1 to i32
+ %563 = bitcast float %temp29.1 to i32
+ %564 = insertelement <8 x i32> undef, i32 %558, i32 0
+ %565 = insertelement <8 x i32> %564, i32 %559, i32 1
+ %566 = insertelement <8 x i32> %565, i32 %560, i32 2
+ %567 = insertelement <8 x i32> %566, i32 %561, i32 3
+ %568 = insertelement <8 x i32> %567, i32 %562, i32 4
+ %569 = insertelement <8 x i32> %568, i32 %563, i32 5
+ %570 = insertelement <8 x i32> %569, i32 undef, i32 6
+ %571 = insertelement <8 x i32> %570, i32 undef, i32 7
+ %572 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %571, <32 x i8> %74, <16 x i8> %76, i32 2)
+ %573 = extractelement <4 x float> %572, i32 0
+ %574 = extractelement <4 x float> %572, i32 1
+ %575 = extractelement <4 x float> %572, i32 2
+ %576 = fmul float %573, %44
+ %577 = fadd float %576, %553
+ %578 = fmul float %574, %45
+ %579 = fadd float %578, %555
+ %580 = fmul float %575, %46
+ %581 = fadd float %580, %557
+ %582 = call i32 @llvm.SI.packf16(float %577, float %579)
+ %583 = bitcast i32 %582 to float
+ %584 = call i32 @llvm.SI.packf16(float %581, float %283)
+ %585 = bitcast i32 %584 to float
+ call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %583, float %585, float %583, float %585)
+ ret void
+
+ENDIF66: ; preds = %LOOP65
+ %586 = bitcast float %temp28.1 to i32
+ %587 = bitcast float %temp29.1 to i32
+ %588 = insertelement <8 x i32> %237, i32 %586, i32 4
+ %589 = insertelement <8 x i32> %588, i32 %587, i32 5
+ %590 = insertelement <8 x i32> %589, i32 undef, i32 6
+ %591 = insertelement <8 x i32> %590, i32 undef, i32 7
+ %592 = call <4 x float> @llvm.SI.sampled.v8i32(<8 x i32> %591, <32 x i8> %62, <16 x i8> %64, i32 2)
+ %593 = extractelement <4 x float> %592, i32 3
+ %594 = fcmp oge float %temp30.1, %593
+ %595 = sext i1 %594 to i32
+ %596 = bitcast i32 %595 to float
+ %597 = bitcast float %596 to i32
+ %598 = and i32 %597, 1065353216
+ %599 = bitcast i32 %598 to float
+ %600 = fmul float 5.000000e-01, %temp32.0
+ %601 = fsub float -0.000000e+00, %600
+ %602 = fmul float %599, %temp32.0
+ %603 = fadd float %602, %601
+ %604 = fmul float %214, %603
+ %605 = fadd float %604, %temp28.1
+ %606 = fmul float %215, %603
+ %607 = fadd float %606, %temp29.1
+ %608 = fmul float %216, %603
+ %609 = fadd float %608, %temp30.1
+ %610 = fadd float %temp24.1, 1.000000e+00
+ %611 = fmul float %temp32.0, 5.000000e-01
+ br label %LOOP65
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+
+; Function Attrs: readnone
+declare i32 @llvm.SI.tid() #2
+
+; Function Attrs: readonly
+declare float @ceil(float) #3
+
+; Function Attrs: readnone
+declare float @llvm.AMDGPU.rsq(float) #2
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sampled.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+; Function Attrs: readnone
+declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #2
+
+; Function Attrs: readnone
+declare float @fabs(float) #2
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.sample.v4i32(<4 x i32>, <32 x i8>, <16 x i8>, i32) #1
+
+; Function Attrs: nounwind readonly
+declare float @llvm.pow.f32(float, float) #4
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.SI.packf16(float, float) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="0" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { readnone }
+attributes #3 = { readonly }
+attributes #4 = { nounwind readonly }
+
+!0 = metadata !{metadata !"const", null, i32 1}
diff --git a/test/CodeGen/R600/si-vector-hang.ll b/test/CodeGen/R600/si-vector-hang.ll
new file mode 100644
index 0000000..093234f
--- /dev/null
+++ b/test/CodeGen/R600/si-vector-hang.ll
@@ -0,0 +1,107 @@
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
+
+; CHECK: @test_8_min_char
+; CHECK: BUFFER_STORE_BYTE
+; CHECK: BUFFER_STORE_BYTE
+; CHECK: BUFFER_STORE_BYTE
+; CHECK: BUFFER_STORE_BYTE
+; CHECK: BUFFER_STORE_BYTE
+; CHECK: BUFFER_STORE_BYTE
+; CHECK: BUFFER_STORE_BYTE
+; CHECK: BUFFER_STORE_BYTE
+; ModuleID = 'radeon'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
+target triple = "r600--"
+
+; Function Attrs: nounwind
+define void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 {
+entry:
+ %0 = load i8 addrspace(1)* %in0, align 1
+ %1 = insertelement <8 x i8> undef, i8 %0, i32 0
+ %arrayidx2.i.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 1
+ %2 = load i8 addrspace(1)* %arrayidx2.i.i, align 1
+ %3 = insertelement <8 x i8> %1, i8 %2, i32 1
+ %arrayidx6.i.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 2
+ %4 = load i8 addrspace(1)* %arrayidx6.i.i, align 1
+ %5 = insertelement <8 x i8> %3, i8 %4, i32 2
+ %arrayidx10.i.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 3
+ %6 = load i8 addrspace(1)* %arrayidx10.i.i, align 1
+ %7 = insertelement <8 x i8> %5, i8 %6, i32 3
+ %arrayidx.i.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 4
+ %8 = load i8 addrspace(1)* %arrayidx.i.i, align 1
+ %9 = insertelement <8 x i8> undef, i8 %8, i32 0
+ %arrayidx2.i9.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 5
+ %10 = load i8 addrspace(1)* %arrayidx2.i9.i, align 1
+ %11 = insertelement <8 x i8> %9, i8 %10, i32 1
+ %arrayidx6.i11.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 6
+ %12 = load i8 addrspace(1)* %arrayidx6.i11.i, align 1
+ %13 = insertelement <8 x i8> %11, i8 %12, i32 2
+ %arrayidx10.i13.i = getelementptr inbounds i8 addrspace(1)* %in0, i64 7
+ %14 = load i8 addrspace(1)* %arrayidx10.i13.i, align 1
+ %15 = insertelement <8 x i8> %13, i8 %14, i32 3
+ %vecinit5.i = shufflevector <8 x i8> %7, <8 x i8> %15, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+ %16 = load i8 addrspace(1)* %in1, align 1
+ %17 = insertelement <8 x i8> undef, i8 %16, i32 0
+ %arrayidx2.i.i4 = getelementptr inbounds i8 addrspace(1)* %in1, i64 1
+ %18 = load i8 addrspace(1)* %arrayidx2.i.i4, align 1
+ %19 = insertelement <8 x i8> %17, i8 %18, i32 1
+ %arrayidx6.i.i5 = getelementptr inbounds i8 addrspace(1)* %in1, i64 2
+ %20 = load i8 addrspace(1)* %arrayidx6.i.i5, align 1
+ %21 = insertelement <8 x i8> %19, i8 %20, i32 2
+ %arrayidx10.i.i6 = getelementptr inbounds i8 addrspace(1)* %in1, i64 3
+ %22 = load i8 addrspace(1)* %arrayidx10.i.i6, align 1
+ %23 = insertelement <8 x i8> %21, i8 %22, i32 3
+ %arrayidx.i.i7 = getelementptr inbounds i8 addrspace(1)* %in1, i64 4
+ %24 = load i8 addrspace(1)* %arrayidx.i.i7, align 1
+ %25 = insertelement <8 x i8> undef, i8 %24, i32 0
+ %arrayidx2.i9.i8 = getelementptr inbounds i8 addrspace(1)* %in1, i64 5
+ %26 = load i8 addrspace(1)* %arrayidx2.i9.i8, align 1
+ %27 = insertelement <8 x i8> %25, i8 %26, i32 1
+ %arrayidx6.i11.i9 = getelementptr inbounds i8 addrspace(1)* %in1, i64 6
+ %28 = load i8 addrspace(1)* %arrayidx6.i11.i9, align 1
+ %29 = insertelement <8 x i8> %27, i8 %28, i32 2
+ %arrayidx10.i13.i10 = getelementptr inbounds i8 addrspace(1)* %in1, i64 7
+ %30 = load i8 addrspace(1)* %arrayidx10.i13.i10, align 1
+ %31 = insertelement <8 x i8> %29, i8 %30, i32 3
+ %vecinit5.i11 = shufflevector <8 x i8> %23, <8 x i8> %31, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+ %cmp.i = icmp slt <8 x i8> %vecinit5.i, %vecinit5.i11
+ %cond.i = select <8 x i1> %cmp.i, <8 x i8> %vecinit5.i, <8 x i8> %vecinit5.i11
+ %32 = extractelement <8 x i8> %cond.i, i32 0
+ store i8 %32, i8 addrspace(1)* %out, align 1
+ %33 = extractelement <8 x i8> %cond.i, i32 1
+ %arrayidx2.i.i.i = getelementptr inbounds i8 addrspace(1)* %out, i64 1
+ store i8 %33, i8 addrspace(1)* %arrayidx2.i.i.i, align 1
+ %34 = extractelement <8 x i8> %cond.i, i32 2
+ %arrayidx.i.i.i = getelementptr inbounds i8 addrspace(1)* %out, i64 2
+ store i8 %34, i8 addrspace(1)* %arrayidx.i.i.i, align 1
+ %35 = extractelement <8 x i8> %cond.i, i32 3
+ %arrayidx2.i6.i.i = getelementptr inbounds i8 addrspace(1)* %out, i64 3
+ store i8 %35, i8 addrspace(1)* %arrayidx2.i6.i.i, align 1
+ %arrayidx.i.i3 = getelementptr inbounds i8 addrspace(1)* %out, i64 4
+ %36 = extractelement <8 x i8> %cond.i, i32 4
+ store i8 %36, i8 addrspace(1)* %arrayidx.i.i3, align 1
+ %37 = extractelement <8 x i8> %cond.i, i32 5
+ %arrayidx2.i.i6.i = getelementptr inbounds i8 addrspace(1)* %out, i64 5
+ store i8 %37, i8 addrspace(1)* %arrayidx2.i.i6.i, align 1
+ %38 = extractelement <8 x i8> %cond.i, i32 6
+ %arrayidx.i.i7.i = getelementptr inbounds i8 addrspace(1)* %out, i64 6
+ store i8 %38, i8 addrspace(1)* %arrayidx.i.i7.i, align 1
+ %39 = extractelement <8 x i8> %cond.i, i32 7
+ %arrayidx2.i6.i8.i = getelementptr inbounds i8 addrspace(1)* %out, i64 7
+ store i8 %39, i8 addrspace(1)* %arrayidx2.i6.i8.i, align 1
+ ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!opencl.kernels = !{!0, !1, !2, !3, !4, !5, !6, !7, !8}
+
+!0 = metadata !{null}
+!1 = metadata !{null}
+!2 = metadata !{null}
+!3 = metadata !{void (i8 addrspace(1)*, i8 addrspace(1)*, i8 addrspace(1)*)* @test_8_min_char}
+!4 = metadata !{null}
+!5 = metadata !{null}
+!6 = metadata !{null}
+!7 = metadata !{null}
+!8 = metadata !{null}
diff --git a/test/CodeGen/R600/sign_extend.ll b/test/CodeGen/R600/sign_extend.ll
index e4ef534..1212cee 100644
--- a/test/CodeGen/R600/sign_extend.ll
+++ b/test/CodeGen/R600/sign_extend.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s
; CHECK: V_ASHR
define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
diff --git a/test/CodeGen/R600/sint_to_fp.ll b/test/CodeGen/R600/sint_to_fp.ll
index 4e88494..9241799 100644
--- a/test/CodeGen/R600/sint_to_fp.ll
+++ b/test/CodeGen/R600/sint_to_fp.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
; R600-CHECK: @sint_to_fp_v2i32
; R600-CHECK-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
diff --git a/test/CodeGen/R600/sint_to_fp64.ll b/test/CodeGen/R600/sint_to_fp64.ll
new file mode 100644
index 0000000..5abc9d1
--- /dev/null
+++ b/test/CodeGen/R600/sint_to_fp64.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
+
+; CHECK: @sint_to_fp64
+; CHECK: V_CVT_F64_I32_e32
+define void @sint_to_fp64(double addrspace(1)* %out, i32 %in) {
+ %result = sitofp i32 %in to double
+ store double %result, double addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/sra.ll b/test/CodeGen/R600/sra.ll
index 5220a96..fe9df10 100644
--- a/test/CodeGen/R600/sra.ll
+++ b/test/CodeGen/R600/sra.ll
@@ -1,13 +1,13 @@
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
-;EG-CHECK: @ashr_v2i32
+;EG-CHECK-LABEL: @ashr_v2i32
;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI-CHECK: @ashr_v2i32
-;SI-CHECK: V_ASHR_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_ASHR_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK-LABEL: @ashr_v2i32
+;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -18,17 +18,17 @@ define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
ret void
}
-;EG-CHECK: @ashr_v4i32
+;EG-CHECK-LABEL: @ashr_v4i32
;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG-CHECK: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;SI-CHECK: @ashr_v4i32
-;SI-CHECK: V_ASHR_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_ASHR_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_ASHR_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_ASHR_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK-LABEL: @ashr_v4i32
+;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_ASHR_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
@@ -39,11 +39,11 @@ define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i
ret void
}
-;EG-CHECK: @ashr_i64
+;EG-CHECK-LABEL: @ashr_i64
;EG-CHECK: ASHR
-;SI-CHECK: @ashr_i64
-;SI-CHECK: V_ASHR_I64
+;SI-CHECK-LABEL: @ashr_i64
+;SI-CHECK: S_ASHR_I64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8
define void @ashr_i64(i64 addrspace(1)* %out, i32 %in) {
entry:
%0 = sext i32 %in to i64
diff --git a/test/CodeGen/R600/srl.ll b/test/CodeGen/R600/srl.ll
index d1dcd7f..7637355 100644
--- a/test/CodeGen/R600/srl.ll
+++ b/test/CodeGen/R600/srl.ll
@@ -1,13 +1,13 @@
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
;EG-CHECK: @lshr_v2i32
;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;SI-CHECK: @lshr_v2i32
-;SI-CHECK: V_LSHR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_LSHR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -26,10 +26,10 @@ define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
;EG-CHECK: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;SI-CHECK: @lshr_v4i32
-;SI-CHECK: V_LSHR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_LSHR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_LSHR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_LSHR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_LSHR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/R600/store-vector-ptrs.ll b/test/CodeGen/R600/store-vector-ptrs.ll
new file mode 100644
index 0000000..01210ce
--- /dev/null
+++ b/test/CodeGen/R600/store-vector-ptrs.ll
@@ -0,0 +1,8 @@
+; XFAIL: *
+; RUN: llc -march=r600 -mcpu=SI < %s
+
+define void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind {
+ %p = getelementptr <4 x [1024 x i32]*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
+ store <4 x i32*> %p, <4 x i32*>* %out
+ ret void
+}
\ No newline at end of file
diff --git a/test/CodeGen/R600/store.ll b/test/CodeGen/R600/store.ll
index 1bda5e6..5e51d56 100644
--- a/test/CodeGen/R600/store.ll
+++ b/test/CodeGen/R600/store.ll
@@ -1,13 +1,118 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=CM-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+
+;===------------------------------------------------------------------------===;
+; Global Address Space
+;===------------------------------------------------------------------------===;
+
+; i8 store
+; EG-CHECK-LABEL: @store_i8
+; EG-CHECK: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
+; EG-CHECK: VTX_READ_8 [[VAL:T[0-9]\.X]], [[VAL]]
+; IG 0: Get the byte index and truncate the value
+; EG-CHECK: AND_INT T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
+; EG-CHECK-NEXT: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], [[VAL]], literal.y
+; EG-CHECK-NEXT: 3(4.203895e-45), 255(3.573311e-43)
+; IG 1: Truncate the calculated shift amount for the mask
+; EG-CHECK: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
+; EG-CHECK-NEXT: 3
+; IG 2: Shift the value and the mask
+; EG-CHECK: LSHL T[[RW_GPR]].X, T{{[0-9]}}.[[TRUNC_CHAN]], PV.[[SHIFT_CHAN]]
+; EG-CHECK: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
+; EG-CHECK-NEXT: 255
+; IG 3: Initialize the Y and Z channels to zero
+; XXX: An optimal scheduler should merge this into one of the previous IGs.
+; EG-CHECK: MOV T[[RW_GPR]].Y, 0.0
+; EG-CHECK: MOV * T[[RW_GPR]].Z, 0.0
+
+; SI-CHECK-LABEL: @store_i8
+; SI-CHECK: BUFFER_STORE_BYTE
+
+define void @store_i8(i8 addrspace(1)* %out, i8 %in) {
+entry:
+ store i8 %in, i8 addrspace(1)* %out
+ ret void
+}
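
The EG-CHECK lines above spell out how an i8 store is emulated on hardware that can only write whole dwords: compute the byte index, truncate the value, build a shifted value and a shifted mask, and let MSKOR merge them into memory. A minimal C sketch of that read-modify-write, with illustrative names only (it is not part of the test or of the generated code):

```c
#include <stdint.h>

/* Emulate "store i8 val" at byte address `addr` into dword-addressed memory.
   Mirrors the IG 0..2 steps checked above; purely illustrative. */
void emulated_store_i8(uint32_t *mem, uint32_t addr, uint8_t val) {
    uint32_t byte_index = addr & 3;            /* IG 0: byte index within the dword */
    uint32_t truncated  = val & 0xffu;         /* IG 0: truncate the value */
    uint32_t shift      = byte_index << 3;     /* IG 1: shift amount in bits */
    uint32_t data       = truncated << shift;  /* IG 2: shifted value */
    uint32_t mask       = 0xffu << shift;      /* IG 2: shifted mask */
    uint32_t *dword     = &mem[addr >> 2];
    *dword = (*dword & ~mask) | data;          /* MSKOR-style masked write */
}
```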
+
+; i16 store
+; EG-CHECK-LABEL: @store_i16
+; EG-CHECK: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X
+; EG-CHECK: VTX_READ_16 [[VAL:T[0-9]\.X]], [[VAL]]
+; IG 0: Get the byte index and truncate the value
+; EG-CHECK: AND_INT T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x
+; EG-CHECK: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], [[VAL]], literal.y
+; EG-CHECK-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
+; IG 1: Truncate the calculated shift amount for the mask
+; EG-CHECK: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x
+; EG-CHECK: 3
+; IG 2: Shift the value and the mask
+; EG-CHECK: LSHL T[[RW_GPR]].X, T{{[0-9]}}.[[TRUNC_CHAN]], PV.[[SHIFT_CHAN]]
+; EG-CHECK: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]]
+; EG-CHECK-NEXT: 65535
+; IG 3: Initialize the Y and Z channels to zero
+; XXX: An optimal scheduler should merge this into one of the previous IGs.
+; EG-CHECK: MOV T[[RW_GPR]].Y, 0.0
+; EG-CHECK: MOV * T[[RW_GPR]].Z, 0.0
+
+; SI-CHECK-LABEL: @store_i16
+; SI-CHECK: BUFFER_STORE_SHORT
+define void @store_i16(i16 addrspace(1)* %out, i16 %in) {
+entry:
+ store i16 %in, i16 addrspace(1)* %out
+ ret void
+}
+
+; EG-CHECK-LABEL: @store_v2i8
+; EG-CHECK: MEM_RAT MSKOR
+; EG-CHECK-NOT: MEM_RAT MSKOR
+; SI-CHECK-LABEL: @store_v2i8
+; SI-CHECK: BUFFER_STORE_BYTE
+; SI-CHECK: BUFFER_STORE_BYTE
+define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
+entry:
+ %0 = trunc <2 x i32> %in to <2 x i8>
+ store <2 x i8> %0, <2 x i8> addrspace(1)* %out
+ ret void
+}
+
+
+; EG-CHECK-LABEL: @store_v2i16
+; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
+; CM-CHECK-LABEL: @store_v2i16
+; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
+; SI-CHECK-LABEL: @store_v2i16
+; SI-CHECK: BUFFER_STORE_SHORT
+; SI-CHECK: BUFFER_STORE_SHORT
+define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
+entry:
+ %0 = trunc <2 x i32> %in to <2 x i16>
+ store <2 x i16> %0, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; EG-CHECK-LABEL: @store_v4i8
+; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
+; CM-CHECK-LABEL: @store_v4i8
+; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
+; SI-CHECK-LABEL: @store_v4i8
+; SI-CHECK: BUFFER_STORE_BYTE
+; SI-CHECK: BUFFER_STORE_BYTE
+; SI-CHECK: BUFFER_STORE_BYTE
+; SI-CHECK: BUFFER_STORE_BYTE
+define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
+entry:
+ %0 = trunc <4 x i32> %in to <4 x i8>
+ store <4 x i8> %0, <4 x i8> addrspace(1)* %out
+ ret void
+}
; floating-point store
-; EG-CHECK: @store_f32
-; EG-CHECK: RAT_WRITE_CACHELESS_32_eg T{{[0-9]+\.X, T[0-9]+\.X}}, 1
-; CM-CHECK: @store_f32
-; CM-CHECK: EXPORT_RAT_INST_STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}}
-; SI-CHECK: @store_f32
+; EG-CHECK-LABEL: @store_f32
+; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.X, T[0-9]+\.X}}, 1
+; CM-CHECK-LABEL: @store_f32
+; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+\.X, T[0-9]+\.X}}
+; SI-CHECK-LABEL: @store_f32
; SI-CHECK: BUFFER_STORE_DWORD
define void @store_f32(float addrspace(1)* %out, float %in) {
@@ -15,22 +120,141 @@ define void @store_f32(float addrspace(1)* %out, float %in) {
ret void
}
+; EG-CHECK-LABEL: @store_v4i16
+; EG-CHECK: MEM_RAT MSKOR
+; EG-CHECK: MEM_RAT MSKOR
+; EG-CHECK: MEM_RAT MSKOR
+; EG-CHECK: MEM_RAT MSKOR
+; EG-CHECK-NOT: MEM_RAT MSKOR
+; SI-CHECK-LABEL: @store_v4i16
+; SI-CHECK: BUFFER_STORE_SHORT
+; SI-CHECK: BUFFER_STORE_SHORT
+; SI-CHECK: BUFFER_STORE_SHORT
+; SI-CHECK: BUFFER_STORE_SHORT
+; SI-CHECK-NOT: BUFFER_STORE_BYTE
+define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
+entry:
+ %0 = trunc <4 x i32> %in to <4 x i16>
+ store <4 x i16> %0, <4 x i16> addrspace(1)* %out
+ ret void
+}
+
; vec2 floating-point stores
-; EG-CHECK: @store_v2f32
-; EG-CHECK: RAT_WRITE_CACHELESS_64_eg
-; CM-CHECK: @store_v2f32
-; CM-CHECK: EXPORT_RAT_INST_STORE_DWORD
-; SI-CHECK: @store_v2f32
+; EG-CHECK-LABEL: @store_v2f32
+; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
+; CM-CHECK-LABEL: @store_v2f32
+; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
+; SI-CHECK-LABEL: @store_v2f32
; SI-CHECK: BUFFER_STORE_DWORDX2
define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) {
entry:
%0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
- %1 = insertelement <2 x float> %0, float %b, i32 0
+ %1 = insertelement <2 x float> %0, float %b, i32 1
store <2 x float> %1, <2 x float> addrspace(1)* %out
ret void
}
+; EG-CHECK-LABEL: @store_v4i32
+; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
+; EG-CHECK-NOT: MEM_RAT_CACHELESS STORE_RAW
+; CM-CHECK-LABEL: @store_v4i32
+; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
+; CM-CHECK-NOT: MEM_RAT_CACHELESS STORE_DWORD
+; SI-CHECK-LABEL: @store_v4i32
+; SI-CHECK: BUFFER_STORE_DWORDX4
+define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
+entry:
+ store <4 x i32> %in, <4 x i32> addrspace(1)* %out
+ ret void
+}
+
+;===------------------------------------------------------------------------===;
+; Local Address Space
+;===------------------------------------------------------------------------===;
+
+; EG-CHECK-LABEL: @store_local_i8
+; EG-CHECK: LDS_BYTE_WRITE
+; SI-CHECK-LABEL: @store_local_i8
+; SI-CHECK: DS_WRITE_B8
+define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
+ store i8 %in, i8 addrspace(3)* %out
+ ret void
+}
+
+; EG-CHECK-LABEL: @store_local_i16
+; EG-CHECK: LDS_SHORT_WRITE
+; SI-CHECK-LABEL: @store_local_i16
+; SI-CHECK: DS_WRITE_B16
+define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
+ store i16 %in, i16 addrspace(3)* %out
+ ret void
+}
+
+; EG-CHECK-LABEL: @store_local_v2i16
+; EG-CHECK: LDS_WRITE
+; CM-CHECK-LABEL: @store_local_v2i16
+; CM-CHECK: LDS_WRITE
+; SI-CHECK-LABEL: @store_local_v2i16
+; SI-CHECK: DS_WRITE_B16
+; SI-CHECK: DS_WRITE_B16
+define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) {
+entry:
+ store <2 x i16> %in, <2 x i16> addrspace(3)* %out
+ ret void
+}
+
+; EG-CHECK-LABEL: @store_local_v4i8
+; EG-CHECK: LDS_WRITE
+; CM-CHECK-LABEL: @store_local_v4i8
+; CM-CHECK: LDS_WRITE
+; SI-CHECK-LABEL: @store_local_v4i8
+; SI-CHECK: DS_WRITE_B8
+; SI-CHECK: DS_WRITE_B8
+; SI-CHECK: DS_WRITE_B8
+; SI-CHECK: DS_WRITE_B8
+define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
+entry:
+ store <4 x i8> %in, <4 x i8> addrspace(3)* %out
+ ret void
+}
+
+; EG-CHECK-LABEL: @store_local_v2i32
+; EG-CHECK: LDS_WRITE
+; EG-CHECK: LDS_WRITE
+; CM-CHECK-LABEL: @store_local_v2i32
+; CM-CHECK: LDS_WRITE
+; CM-CHECK: LDS_WRITE
+; SI-CHECK-LABEL: @store_local_v2i32
+; SI-CHECK: DS_WRITE_B32
+; SI-CHECK: DS_WRITE_B32
+define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) {
+entry:
+ store <2 x i32> %in, <2 x i32> addrspace(3)* %out
+ ret void
+}
+
+; EG-CHECK-LABEL: @store_local_v4i32
+; EG-CHECK: LDS_WRITE
+; EG-CHECK: LDS_WRITE
+; EG-CHECK: LDS_WRITE
+; EG-CHECK: LDS_WRITE
+; CM-CHECK-LABEL: @store_local_v4i32
+; CM-CHECK: LDS_WRITE
+; CM-CHECK: LDS_WRITE
+; CM-CHECK: LDS_WRITE
+; CM-CHECK: LDS_WRITE
+; SI-CHECK-LABEL: @store_local_v4i32
+; SI-CHECK: DS_WRITE_B32
+; SI-CHECK: DS_WRITE_B32
+; SI-CHECK: DS_WRITE_B32
+; SI-CHECK: DS_WRITE_B32
+define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
+entry:
+ store <4 x i32> %in, <4 x i32> addrspace(3)* %out
+ ret void
+}
+
; The stores in this function are combined by the optimizer to create a
; 64-bit store with 32-bit alignment. This is legal for SI and the legalizer
; should not try to split the 64-bit store back into 2 32-bit stores.
@@ -38,25 +262,21 @@ entry:
; Evergreen / Northern Islands don't support 64-bit stores yet, so there should
; be two 32-bit stores.
-; EG-CHECK: @vecload2
-; EG-CHECK: RAT_WRITE_CACHELESS_64_eg
-; CM-CHECK: @vecload2
-; CM-CHECK: EXPORT_RAT_INST_STORE_DWORD
-; SI-CHECK: @vecload2
+; EG-CHECK-LABEL: @vecload2
+; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW
+; CM-CHECK-LABEL: @vecload2
+; CM-CHECK: MEM_RAT_CACHELESS STORE_DWORD
+; SI-CHECK-LABEL: @vecload2
; SI-CHECK: BUFFER_STORE_DWORDX2
define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
entry:
- %0 = load i32 addrspace(2)* %mem, align 4, !tbaa !5
+ %0 = load i32 addrspace(2)* %mem, align 4
%arrayidx1.i = getelementptr inbounds i32 addrspace(2)* %mem, i64 1
- %1 = load i32 addrspace(2)* %arrayidx1.i, align 4, !tbaa !5
- store i32 %0, i32 addrspace(1)* %out, align 4, !tbaa !5
+ %1 = load i32 addrspace(2)* %arrayidx1.i, align 4
+ store i32 %0, i32 addrspace(1)* %out, align 4
%arrayidx1 = getelementptr inbounds i32 addrspace(1)* %out, i64 1
- store i32 %1, i32 addrspace(1)* %arrayidx1, align 4, !tbaa !5
+ store i32 %1, i32 addrspace(1)* %arrayidx1, align 4
ret void
}
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!5 = metadata !{metadata !"int", metadata !6}
-!6 = metadata !{metadata !"omnipotent char", metadata !7}
-!7 = metadata !{metadata !"Simple C/C++ TBAA"}
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/R600/store.r600.ll b/test/CodeGen/R600/store.r600.ll
index 5ffb7f1..00589a0 100644
--- a/test/CodeGen/R600/store.r600.ll
+++ b/test/CodeGen/R600/store.r600.ll
@@ -4,7 +4,7 @@
; v4i32 store
; EG-CHECK: @store_v4i32
-; EG-CHECK: RAT_WRITE_CACHELESS_128 T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
+; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%1 = load <4 x i32> addrspace(1) * %in
@@ -14,7 +14,7 @@ define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %
; v4f32 store
; EG-CHECK: @store_v4f32
-; EG-CHECK: RAT_WRITE_CACHELESS_128 T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
+; EG-CHECK: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1
define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
%1 = load <4 x float> addrspace(1) * %in
store <4 x float> %1, <4 x float> addrspace(1)* %out
diff --git a/test/CodeGen/R600/structurize.ll b/test/CodeGen/R600/structurize.ll
new file mode 100644
index 0000000..c2acd93
--- /dev/null
+++ b/test/CodeGen/R600/structurize.ll
@@ -0,0 +1,83 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood -mattr=disable-irstructurizer | FileCheck %s
+; Test case for a crash in the AMDILCFGStructurizer from a CFG like this:
+;
+; entry
+; / \
+; diamond_head branch_from
+; / \ |
+; diamond_false diamond_true
+; \ /
+; done
+;
+; When the diamond_true branch had more than 100 instructions.
+;
+;
+
+; CHECK-LABEL: @branch_into_diamond
+; === entry block:
+; CHECK: ALU_PUSH_BEFORE
+; === Branch instruction (IF):
+; CHECK: JUMP
+ ; === branch_from block
+ ; CHECK: ALU
+ ; === Duplicated diamond_true block (There can be more than one ALU clause):
+ ; === XXX: We should be able to optimize this so the basic block is not
+ ; === duplicated. See comments in
+ ; === AMDGPUCFGStructurizer::improveSimpleJumpintoIf()
+ ; CHECK: ALU
+; === Branch instruction (ELSE):
+; CHECK: ELSE
+ ; === diamond_head block:
+ ; CHECK: ALU_PUSH_BEFORE
+ ; === Branch instruction (IF):
+ ; CHECK: JUMP
+ ; === diamond_true block (There can be more than one ALU clause):
+ ; ALU
+ ; === Branch instruction (ELSE):
+ ; CHECK: ELSE
+ ; === diamond_false block plus implicit ENDIF
+ ; CHECK: ALU_POP_AFTER
+; === Branch instruction (ENDIF):
+; CHECK: POP
+; === done block:
+; CHECK: ALU
+; CHECK: MEM_RAT_CACHELESS
+; CHECK: CF_END
+
+
+define void @branch_into_diamond(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+entry:
+%0 = icmp ne i32 %a, 0
+ br i1 %0, label %diamond_head, label %branch_from
+
+diamond_head:
+ %1 = icmp ne i32 %a, 1
+ br i1 %1, label %diamond_true, label %diamond_false
+
+branch_from:
+ %2 = add i32 %a, 1
+ br label %diamond_true
+
+diamond_false:
+ %3 = add i32 %a, 2
+ br label %done
+
+diamond_true:
+ %4 = phi i32 [%2, %branch_from], [%a, %diamond_head]
+ ; This block needs to be > 100 ISA instructions to hit the bug,
+ ; so we'll use udiv instructions.
+ %div0 = udiv i32 %a, %b
+ %div1 = udiv i32 %div0, %4
+ %div2 = udiv i32 %div1, 11
+ %div3 = udiv i32 %div2, %a
+ %div4 = udiv i32 %div3, %b
+ %div5 = udiv i32 %div4, %c
+ %div6 = udiv i32 %div5, %div0
+ %div7 = udiv i32 %div6, %div1
+ br label %done
+
+done:
+ %5 = phi i32 [%3, %diamond_false], [%div7, %diamond_true]
+ store i32 %5, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/structurize1.ll b/test/CodeGen/R600/structurize1.ll
new file mode 100644
index 0000000..8c10301
--- /dev/null
+++ b/test/CodeGen/R600/structurize1.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s -march=r600 -mattr=disable-ifcvt -mcpu=redwood | FileCheck %s
+
+; This tests for a bug where the AMDILCFGStructurizer was crashing on loops
+; like this:
+;
+; for (i = 0; i < x; i++) {
+; if (cond0) {
+; if (cond1) {
+;
+; } else {
+;
+; }
+; if (cond2) {
+;
+; }
+; }
+; }
+
+; CHECK-LABEL: @if_inside_loop
+; CHECK: LOOP_START_DX10
+; CHECK: END_LOOP
+define void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+entry:
+ br label %for.body
+
+for.body:
+ %0 = phi i32 [0, %entry], [%inc, %for.inc]
+ %val = phi i32 [0, %entry], [%val.for.inc, %for.inc]
+ %inc = add i32 %0, 1
+ %1 = icmp ult i32 10, %a
+ br i1 %1, label %for.inc, label %if.then
+
+if.then:
+ %2 = icmp ne i32 0, %b
+ br i1 %2, label %if.then.true, label %if.then.false
+
+if.then.true:
+ %3 = add i32 %a, %val
+ br label %if
+
+if.then.false:
+ %4 = mul i32 %a, %val
+ br label %if
+
+if:
+ %val.if = phi i32 [%3, %if.then.true], [%4, %if.then.false]
+ %5 = icmp ne i32 0, %c
+ br i1 %5, label %if.true, label %for.inc
+
+if.true:
+ %6 = add i32 %a, %val.if
+ br label %for.inc
+
+for.inc:
+ %val.for.inc = phi i32 [%val, %for.body], [%val.if, %if], [%6, %if.true]
+ %7 = icmp ne i32 0, %d
+ br i1 %7, label %for.body, label %exit
+
+exit:
+ store i32 %val.for.inc, i32 addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/sub.ll b/test/CodeGen/R600/sub.ll
index 3bd4cb8..5fdd2b8 100644
--- a/test/CodeGen/R600/sub.ll
+++ b/test/CodeGen/R600/sub.ll
@@ -1,13 +1,13 @@
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
;EG-CHECK: @test2
-;EG-CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;SI-CHECK: @test2
-;SI-CHECK: V_SUB_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_SUB_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32> addrspace(1)* %in, i32 1
@@ -19,16 +19,16 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
}
;EG-CHECK: @test4
-;EG-CHECK: SUB_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;SI-CHECK: @test4
-;SI-CHECK: V_SUB_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_SUB_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_SUB_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_SUB_I32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_SUB_I32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/R600/swizzle-export.ll b/test/CodeGen/R600/swizzle-export.ll
index b2175af..16c3f19 100644
--- a/test/CodeGen/R600/swizzle-export.ll
+++ b/test/CodeGen/R600/swizzle-export.ll
@@ -6,12 +6,12 @@
;EG-CHECK: EXPORT T{{[0-9]+}}.XXWX
;EG-CHECK: EXPORT T{{[0-9]+}}.XXXW
-define void @main() #0 {
+define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
main_body:
- %0 = call float @llvm.R600.load.input(i32 4)
- %1 = call float @llvm.R600.load.input(i32 5)
- %2 = call float @llvm.R600.load.input(i32 6)
- %3 = call float @llvm.R600.load.input(i32 7)
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 1
+ %2 = extractelement <4 x float> %reg1, i32 2
+ %3 = extractelement <4 x float> %reg1, i32 3
%4 = load <4 x float> addrspace(8)* null
%5 = extractelement <4 x float> %4, i32 1
%6 = load <4 x float> addrspace(8)* null
@@ -93,14 +93,15 @@ main_body:
}
; EG-CHECK: @main2
-; EG-CHECK: T{{[0-9]+}}.ZXY0
+; EG-CHECK: T{{[0-9]+}}.XY__
+; EG-CHECK: T{{[0-9]+}}.YXZ0
-define void @main2() #0 {
+define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
main_body:
- %0 = call float @llvm.R600.load.input(i32 4)
- %1 = call float @llvm.R600.load.input(i32 5)
- %2 = call float @llvm.R600.load.input(i32 6)
- %3 = call float @llvm.R600.load.input(i32 7)
+ %0 = extractelement <4 x float> %reg1, i32 0
+ %1 = extractelement <4 x float> %reg1, i32 1
+ %2 = fadd float %0, 2.5
+ %3 = fmul float %1, 3.5
%4 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
%5 = extractelement <4 x float> %4, i32 0
%6 = call float @llvm.cos.f32(float %5)
@@ -108,27 +109,21 @@ main_body:
%8 = extractelement <4 x float> %7, i32 0
%9 = load <4 x float> addrspace(8)* null
%10 = extractelement <4 x float> %9, i32 1
- %11 = insertelement <4 x float> undef, float %0, i32 0
- %12 = insertelement <4 x float> %11, float %1, i32 1
- %13 = insertelement <4 x float> %12, float %2, i32 2
- %14 = insertelement <4 x float> %13, float %3, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %14, i32 60, i32 1)
- %15 = insertelement <4 x float> undef, float %6, i32 0
- %16 = insertelement <4 x float> %15, float %8, i32 1
- %17 = insertelement <4 x float> %16, float %10, i32 2
- %18 = insertelement <4 x float> %17, float 0.000000e+00, i32 3
- call void @llvm.R600.store.swizzle(<4 x float> %18, i32 0, i32 2)
+ %11 = insertelement <4 x float> undef, float %2, i32 0
+ %12 = insertelement <4 x float> %11, float %3, i32 1
+ call void @llvm.R600.store.swizzle(<4 x float> %12, i32 60, i32 1)
+ %13 = insertelement <4 x float> undef, float %6, i32 0
+ %14 = insertelement <4 x float> %13, float %8, i32 1
+ %15 = insertelement <4 x float> %14, float %10, i32 2
+ %16 = insertelement <4 x float> %15, float 0.000000e+00, i32 3
+ call void @llvm.R600.store.swizzle(<4 x float> %16, i32 0, i32 2)
ret void
}
-; Function Attrs: readnone
-declare float @llvm.R600.load.input(i32) #1
-
; Function Attrs: nounwind readonly
-declare float @llvm.cos.f32(float) #2
+declare float @llvm.cos.f32(float) #1
declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
attributes #0 = { "ShaderType"="1" }
-attributes #1 = { readnone }
-attributes #2 = { nounwind readonly }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/R600/tex-clause-antidep.ll b/test/CodeGen/R600/tex-clause-antidep.ll
index 5979609..cbb9c50 100644
--- a/test/CodeGen/R600/tex-clause-antidep.ll
+++ b/test/CodeGen/R600/tex-clause-antidep.ll
@@ -3,11 +3,11 @@
;CHECK: TEX
;CHECK-NEXT: ALU
-define void @test() {
- %1 = call float @llvm.R600.load.input(i32 0)
- %2 = call float @llvm.R600.load.input(i32 1)
- %3 = call float @llvm.R600.load.input(i32 2)
- %4 = call float @llvm.R600.load.input(i32 3)
+define void @test(<4 x float> inreg %reg0) #0 {
+ %1 = extractelement <4 x float> %reg0, i32 0
+ %2 = extractelement <4 x float> %reg0, i32 1
+ %3 = extractelement <4 x float> %reg0, i32 2
+ %4 = extractelement <4 x float> %reg0, i32 3
%5 = insertelement <4 x float> undef, float %1, i32 0
%6 = insertelement <4 x float> %5, float %2, i32 1
%7 = insertelement <4 x float> %6, float %3, i32 2
@@ -19,6 +19,7 @@ define void @test() {
ret void
}
-declare float @llvm.R600.load.input(i32) readnone
declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone
declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" } \ No newline at end of file
diff --git a/test/CodeGen/R600/texture-input-merge.ll b/test/CodeGen/R600/texture-input-merge.ll
index 5d0ecef..789538a 100644
--- a/test/CodeGen/R600/texture-input-merge.ll
+++ b/test/CodeGen/R600/texture-input-merge.ll
@@ -2,11 +2,11 @@
;CHECK-NOT: MOV
-define void @test() {
- %1 = call float @llvm.R600.load.input(i32 0)
- %2 = call float @llvm.R600.load.input(i32 1)
- %3 = call float @llvm.R600.load.input(i32 2)
- %4 = call float @llvm.R600.load.input(i32 3)
+define void @test(<4 x float> inreg %reg0) #0 {
+ %1 = extractelement <4 x float> %reg0, i32 0
+ %2 = extractelement <4 x float> %reg0, i32 1
+ %3 = extractelement <4 x float> %reg0, i32 2
+ %4 = extractelement <4 x float> %reg0, i32 3
%5 = fmul float %1, 3.0
%6 = fmul float %2, 3.0
%7 = fmul float %3, 3.0
@@ -25,6 +25,7 @@ define void @test() {
ret void
}
-declare float @llvm.R600.load.input(i32) readnone
declare <4 x float> @llvm.R600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone
declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
+
+attributes #0 = { "ShaderType"="1" } \ No newline at end of file
diff --git a/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll b/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll
new file mode 100644
index 0000000..ec959c2
--- /dev/null
+++ b/test/CodeGen/R600/trunc-vector-store-assertion-failure.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; This tests for a bug in the SelectionDAG where custom lowered truncated
+; vector stores at the end of a basic block were not being added to the
+; LegalizedNodes list, which triggered an assertion failure.
+
+; CHECK-LABEL: @test
+; CHECK: MEM_RAT_CACHELESS STORE_RAW
+define void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) {
+entry:
+ %0 = icmp eq i32 %cond, 0
+ br i1 %0, label %if, label %done
+
+if:
+ store <4 x i8> %in, <4 x i8> addrspace(1)* %out
+ br label %done
+
+done:
+ ret void
+}
diff --git a/test/CodeGen/R600/trunc.ll b/test/CodeGen/R600/trunc.ll
new file mode 100644
index 0000000..0bd320a
--- /dev/null
+++ b/test/CodeGen/R600/trunc.ll
@@ -0,0 +1,30 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+
+define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
+; SI-LABEL: @trunc_i64_to_i32_store
+; SI: S_LOAD_DWORD s0, s[0:1], 11
+; SI: V_MOV_B32_e32 v0, s0
+; SI: BUFFER_STORE_DWORD v0
+
+; EG-LABEL: @trunc_i64_to_i32_store
+; EG: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; EG: LSHR
+; EG-NEXT: 2(
+
+ %result = trunc i64 %in to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
+
+; SI-LABEL: @trunc_shl_i64:
+; SI: S_LOAD_DWORDX2
+; SI: S_LOAD_DWORDX2 [[SREG:s\[[0-9]+:[0-9]+\]]]
+; SI: S_LSHL_B64 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, [[SREG]], 2
+; SI: MOV_B32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
+; SI: BUFFER_STORE_DWORD v[[LO_VREG]],
+define void @trunc_shl_i64(i32 addrspace(1)* %out, i64 %a) {
+ %b = shl i64 %a, 2
+ %result = trunc i64 %b to i32
+ store i32 %result, i32 addrspace(1)* %out, align 4
+ ret void
+}
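
The SI checks above only require the low half of the shifted value to reach the BUFFER_STORE_DWORD; in scalar terms the test computes the equivalent of the following C (illustrative only, not part of the patch):

```c
#include <stdint.h>

/* trunc(shl(a, 2)) to i32: only the low 32 bits of the 64-bit shift are stored. */
uint32_t trunc_shl(uint64_t a) {
    return (uint32_t)(a << 2);
}
```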
diff --git a/test/CodeGen/R600/udiv.ll b/test/CodeGen/R600/udiv.ll
index 08fe2ef..5371321 100644
--- a/test/CodeGen/R600/udiv.ll
+++ b/test/CodeGen/R600/udiv.ll
@@ -1,13 +1,26 @@
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
+
+;EG-CHECK-LABEL: @test
+;EG-CHECK-NOT: SETGE_INT
+;EG-CHECK: CF_END
+
+define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+ %b_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+ %a = load i32 addrspace(1) * %in
+ %b = load i32 addrspace(1) * %b_ptr
+ %result = udiv i32 %a, %b
+ store i32 %result, i32 addrspace(1)* %out
+ ret void
+}
;The code generated by udiv is long and complex and may frequently change.
;The goal of this test is to make sure the ISel doesn't fail when it gets
;a v4i32 udiv
-;EG-CHECK: @test2
+;EG-CHECK-LABEL: @test2
;EG-CHECK: CF_END
-;SI-CHECK: @test2
+;SI-CHECK-LABEL: @test2
;SI-CHECK: S_ENDPGM
define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
@@ -19,9 +32,9 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
ret void
}
-;EG-CHECK: @test4
+;EG-CHECK-LABEL: @test4
;EG-CHECK: CF_END
-;SI-CHECK: @test4
+;SI-CHECK-LABEL: @test4
;SI-CHECK: S_ENDPGM
define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
diff --git a/test/CodeGen/R600/uint_to_fp.ll b/test/CodeGen/R600/uint_to_fp.ll
index faac77a..a5ac355 100644
--- a/test/CodeGen/R600/uint_to_fp.ll
+++ b/test/CodeGen/R600/uint_to_fp.ll
@@ -1,10 +1,10 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
-; R600-CHECK: @uint_to_fp_v2i32
+; R600-CHECK-LABEL: @uint_to_fp_v2i32
; R600-CHECK-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-CHECK-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
-; SI-CHECK: @uint_to_fp_v2i32
+; SI-CHECK-LABEL: @uint_to_fp_v2i32
; SI-CHECK: V_CVT_F32_U32_e32
; SI-CHECK: V_CVT_F32_U32_e32
define void @uint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
@@ -13,12 +13,12 @@ define void @uint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) {
ret void
}
-; R600-CHECK: @uint_to_fp_v4i32
+; R600-CHECK-LABEL: @uint_to_fp_v4i32
; R600-CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-; SI-CHECK: @uint_to_fp_v4i32
+; SI-CHECK-LABEL: @uint_to_fp_v4i32
; SI-CHECK: V_CVT_F32_U32_e32
; SI-CHECK: V_CVT_F32_U32_e32
; SI-CHECK: V_CVT_F32_U32_e32
@@ -29,3 +29,18 @@ define void @uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspac
store <4 x float> %result, <4 x float> addrspace(1)* %out
ret void
}
+
+; R600-CHECK-LABEL: @uint_to_fp_i64_f32
+; R600-CHECK: UINT_TO_FLT
+; R600-CHECK: UINT_TO_FLT
+; R600-CHECK: MULADD_IEEE
+; SI-CHECK-LABEL: @uint_to_fp_i64_f32
+; SI-CHECK: V_CVT_F32_U32_e32
+; SI-CHECK: V_CVT_F32_U32_e32
+; SI-CHECK: V_MAD_F32
+define void @uint_to_fp_i64_f32(float addrspace(1)* %out, i64 %in) {
+entry:
+ %0 = uitofp i64 %in to float
+ store float %0, float addrspace(1)* %out
+ ret void
+}
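
Both backends lower the i64-to-float conversion by converting the two 32-bit halves separately and recombining them with a multiply-add, which is what the UINT_TO_FLT/MULADD_IEEE and V_CVT_F32_U32/V_MAD_F32 checks capture. A C sketch of that decomposition (illustrative only):

```c
#include <stdint.h>

/* uitofp i64 -> float as hi * 2^32 + lo, matching the
   two-convert-plus-mad pattern the checks expect. */
float u64_to_f32(uint64_t v) {
    float hi = (float)(uint32_t)(v >> 32);
    float lo = (float)(uint32_t)v;
    return hi * 4294967296.0f + lo;
}
```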
diff --git a/test/CodeGen/R600/unaligned-load-store.ll b/test/CodeGen/R600/unaligned-load-store.ll
new file mode 100644
index 0000000..2824ff8
--- /dev/null
+++ b/test/CodeGen/R600/unaligned-load-store.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: @unaligned_load_store_i32:
+; DS_READ_U32 {{v[0-9]+}}, 0, [[REG]]
+define void @unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) nounwind {
+ %v = load i32 addrspace(3)* %p, align 1
+ store i32 %v, i32 addrspace(3)* %r, align 1
+ ret void
+}
+
+; SI-LABEL: @unaligned_load_store_v4i32:
+; DS_READ_U32 {{v[0-9]+}}, 0, [[REG]]
+define void @unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) nounwind {
+ %v = load <4 x i32> addrspace(3)* %p, align 1
+ store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1
+ ret void
+}
diff --git a/test/CodeGen/R600/unsupported-cc.ll b/test/CodeGen/R600/unsupported-cc.ll
index cf29833..f986a02 100644
--- a/test/CodeGen/R600/unsupported-cc.ll
+++ b/test/CodeGen/R600/unsupported-cc.ll
@@ -2,8 +2,9 @@
; These tests are for condition codes that are not supported by the hardware
-; CHECK: @slt
-; CHECK: SETGT_INT * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-LABEL: @slt
+; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR
; CHECK-NEXT: 5(7.006492e-45)
define void @slt(i32 addrspace(1)* %out, i32 %in) {
entry:
@@ -13,8 +14,9 @@ entry:
ret void
}
-; CHECK: @ult_i32
-; CHECK: SETGT_UINT * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-LABEL: @ult_i32
+; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR
; CHECK-NEXT: 5(7.006492e-45)
define void @ult_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
@@ -24,9 +26,11 @@ entry:
ret void
}
-; CHECK: @ult_float
-; CHECK: SETGT * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-LABEL: @ult_float
+; CHECK: SETGE * T{{[0-9]}}.[[CHAN:[XYZW]]], KC0[2].Z, literal.x
; CHECK-NEXT: 1084227584(5.000000e+00)
+; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0
+; CHECK-NEXT: LSHR *
define void @ult_float(float addrspace(1)* %out, float %in) {
entry:
%0 = fcmp ult float %in, 5.0
@@ -35,9 +39,22 @@ entry:
ret void
}
-; CHECK: @olt
-; CHECK: SETGT * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
-;CHECK-NEXT: 1084227584(5.000000e+00)
+; CHECK-LABEL: @ult_float_native
+; CHECK: SETGE T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
+; CHECK-NEXT: LSHR *
+; CHECK-NEXT: 1084227584(5.000000e+00)
+define void @ult_float_native(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = fcmp ult float %in, 5.0
+ %1 = select i1 %0, float 0.0, float 1.0
+ store float %1, float addrspace(1)* %out
+ ret void
+}
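
The hardware only provides ordered comparisons, so an unordered less-than comes out as the inverse of an ordered greater-or-equal: SETGE followed by SETE against 0.0, as the checks for @ult_float require. A C sketch of the equivalence (illustrative only):

```c
/* fcmp ult a, b  ==  !(a >= b): true when a < b or either operand is NaN. */
int fcmp_ult(float a, float b) {
    int oge = (a >= b);   /* SETGE: ordered greater-or-equal */
    return oge == 0;      /* SETE with 0.0: invert the result */
}
```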
+
+; CHECK-LABEL: @olt
+; CHECK: SETGT T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR *
+; CHECK-NEXT: 1084227584(5.000000e+00)
define void @olt(float addrspace(1)* %out, float %in) {
entry:
%0 = fcmp olt float %in, 5.0
@@ -46,8 +63,9 @@ entry:
ret void
}
-; CHECK: @sle
-; CHECK: SETGT_INT * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-LABEL: @sle
+; CHECK: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR
; CHECK-NEXT: 6(8.407791e-45)
define void @sle(i32 addrspace(1)* %out, i32 %in) {
entry:
@@ -57,8 +75,9 @@ entry:
ret void
}
-; CHECK: @ule_i32
-; CHECK: SETGT_UINT * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-LABEL: @ule_i32
+; CHECK: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR
; CHECK-NEXT: 6(8.407791e-45)
define void @ule_i32(i32 addrspace(1)* %out, i32 %in) {
entry:
@@ -68,9 +87,11 @@ entry:
ret void
}
-; CHECK: @ule_float
-; CHECK: SETGE * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-LABEL: @ule_float
+; CHECK: SETGT * T{{[0-9]}}.[[CHAN:[XYZW]]], KC0[2].Z, literal.x
; CHECK-NEXT: 1084227584(5.000000e+00)
+; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0
+; CHECK-NEXT: LSHR *
define void @ule_float(float addrspace(1)* %out, float %in) {
entry:
%0 = fcmp ule float %in, 5.0
@@ -79,8 +100,21 @@ entry:
ret void
}
-; CHECK: @ole
-; CHECK: SETGE * T{{[0-9]+\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-LABEL: @ule_float_native
+; CHECK: SETGT T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.x
+; CHECK-NEXT: LSHR *
+; CHECK-NEXT: 1084227584(5.000000e+00)
+define void @ule_float_native(float addrspace(1)* %out, float %in) {
+entry:
+ %0 = fcmp ule float %in, 5.0
+ %1 = select i1 %0, float 0.0, float 1.0
+ store float %1, float addrspace(1)* %out
+ ret void
+}
+
+; CHECK-LABEL: @ole
+; CHECK: SETGE T{{[0-9]\.[XYZW]}}, literal.x, KC0[2].Z
+; CHECK-NEXT: LSHR *
; CHECK-NEXT:1084227584(5.000000e+00)
define void @ole(float addrspace(1)* %out, float %in) {
entry:
diff --git a/test/CodeGen/R600/urecip.ll b/test/CodeGen/R600/urecip.ll
index dad02dd..e808e3d 100644
--- a/test/CodeGen/R600/urecip.ll
+++ b/test/CodeGen/R600/urecip.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck %s
;CHECK: V_RCP_IFLAG_F32_e32
diff --git a/test/CodeGen/R600/urem.ll b/test/CodeGen/R600/urem.ll
index cf3474c..8045145 100644
--- a/test/CodeGen/R600/urem.ll
+++ b/test/CodeGen/R600/urem.ll
@@ -1,5 +1,5 @@
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
;The code generated by urem is long and complex and may frequently change.
;The goal of this test is to make sure the ISel doesn't fail when it gets
diff --git a/test/CodeGen/R600/vertex-fetch-encoding.ll b/test/CodeGen/R600/vertex-fetch-encoding.ll
index d892229..7ea7a5c 100644
--- a/test/CodeGen/R600/vertex-fetch-encoding.ll
+++ b/test/CodeGen/R600/vertex-fetch-encoding.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=barts | FileCheck --check-prefix=NI-CHECK %s
-; RUN: not llc < %s -march=r600 -show-mc-encoding -mcpu=cayman | FileCheck --check-prefix=CM-CHECK %s
+; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=cayman | FileCheck --check-prefix=CM-CHECK %s
; NI-CHECK: @vtx_fetch32
; NI-CHECK: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0 ; encoding: [0x40,0x01,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00
diff --git a/test/CodeGen/R600/vselect.ll b/test/CodeGen/R600/vselect.ll
index 72a9084..dca7b06 100644
--- a/test/CodeGen/R600/vselect.ll
+++ b/test/CodeGen/R600/vselect.ll
@@ -1,9 +1,9 @@
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
;EG-CHECK: @test_select_v2i32
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;SI-CHECK: @test_select_v2i32
;SI-CHECK: V_CNDMASK_B32_e64
@@ -20,8 +20,8 @@ entry:
}
;EG-CHECK: @test_select_v2f32
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;SI-CHECK: @test_select_v2f32
;SI-CHECK: V_CNDMASK_B32_e64
@@ -31,17 +31,17 @@ define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrs
entry:
%0 = load <2 x float> addrspace(1)* %in0
%1 = load <2 x float> addrspace(1)* %in1
- %cmp = fcmp one <2 x float> %0, %1
+ %cmp = fcmp une <2 x float> %0, %1
%result = select <2 x i1> %cmp, <2 x float> %0, <2 x float> %1
store <2 x float> %result, <2 x float> addrspace(1)* %out
ret void
}
;EG-CHECK: @test_select_v4i32
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;SI-CHECK: @test_select_v4i32
;SI-CHECK: V_CNDMASK_B32_e64
@@ -60,16 +60,16 @@ entry:
}
;EG-CHECK: @test_select_v4f32
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: CNDE_INT {{\*? *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
define void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) {
entry:
%0 = load <4 x float> addrspace(1)* %in0
%1 = load <4 x float> addrspace(1)* %in1
- %cmp = fcmp one <4 x float> %0, %1
+ %cmp = fcmp une <4 x float> %0, %1
%result = select <4 x i1> %cmp, <4 x float> %0, <4 x float> %1
store <4 x float> %result, <4 x float> addrspace(1)* %out
ret void
diff --git a/test/CodeGen/R600/vselect64.ll b/test/CodeGen/R600/vselect64.ll
new file mode 100644
index 0000000..604695b
--- /dev/null
+++ b/test/CodeGen/R600/vselect64.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; XXX: Merge this test into vselect.ll once SI supports 64-bit select.
+
+; CHECK-LABEL: @test_select_v4i64
+; Make sure the vectors aren't being stored on the stack. We know they are
+; being stored on the stack if the shader uses at least 10 registers.
+; CHECK-NOT: {{\**}} MOV T{{[0-9][0-9]}}.X
+define void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) {
+entry:
+ %cmp = icmp ne <4 x i32> %c, <i32 0, i32 0, i32 0, i32 0>
+ %result = select <4 x i1> %cmp, <4 x i64> <i64 0, i64 1, i64 2, i64 3>, <4 x i64> <i64 4, i64 5, i64 6, i64 7>
+ store <4 x i64> %result, <4 x i64> addrspace(1)* %out
+ ret void
+}
+
diff --git a/test/CodeGen/R600/wait.ll b/test/CodeGen/R600/wait.ll
new file mode 100644
index 0000000..2cf88fe
--- /dev/null
+++ b/test/CodeGen/R600/wait.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -march=r600 -mcpu=SI --verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: @main
+;CHECK: S_WAITCNT lgkmcnt(0)
+;CHECK: S_WAITCNT vmcnt(0)
+;CHECK: S_WAITCNT expcnt(0) lgkmcnt(0)
+
+define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, i32 inreg, i32, i32, i32, i32) #0 {
+main_body:
+ %10 = getelementptr <16 x i8> addrspace(2)* %3, i32 0
+ %11 = load <16 x i8> addrspace(2)* %10, !tbaa !0
+ %12 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %11, i32 0, i32 %6)
+ %13 = extractelement <4 x float> %12, i32 0
+ %14 = extractelement <4 x float> %12, i32 1
+ %15 = extractelement <4 x float> %12, i32 2
+ %16 = extractelement <4 x float> %12, i32 3
+ %17 = getelementptr <16 x i8> addrspace(2)* %3, i32 1
+ %18 = load <16 x i8> addrspace(2)* %17, !tbaa !0
+ %19 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %18, i32 0, i32 %6)
+ %20 = extractelement <4 x float> %19, i32 0
+ %21 = extractelement <4 x float> %19, i32 1
+ %22 = extractelement <4 x float> %19, i32 2
+ %23 = extractelement <4 x float> %19, i32 3
+ call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %20, float %21, float %22, float %23)
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %13, float %14, float %15, float %16)
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="1" }
+attributes #1 = { nounwind readnone }
+
+!0 = metadata !{metadata !"const", null, i32 1}
diff --git a/test/CodeGen/R600/work-item-intrinsics.ll b/test/CodeGen/R600/work-item-intrinsics.ll
index 7998983..9618d7f 100644
--- a/test/CodeGen/R600/work-item-intrinsics.ll
+++ b/test/CodeGen/R600/work-item-intrinsics.ll
@@ -1,12 +1,12 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck --check-prefix=SI-CHECK %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
; R600-CHECK: @ngroups_x
-; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV * [[VAL]], KC0[0].X
+; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; R600-CHECK: MOV [[VAL]], KC0[0].X
; SI-CHECK: @ngroups_x
-; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 0
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 0
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @ngroups_x (i32 addrspace(1)* %out) {
entry:
@@ -16,11 +16,11 @@ entry:
}
; R600-CHECK: @ngroups_y
-; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV * [[VAL]], KC0[0].Y
+; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; R600-CHECK: MOV [[VAL]], KC0[0].Y
; SI-CHECK: @ngroups_y
-; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 1
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 1
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @ngroups_y (i32 addrspace(1)* %out) {
entry:
@@ -30,11 +30,11 @@ entry:
}
; R600-CHECK: @ngroups_z
-; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV * [[VAL]], KC0[0].Z
+; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; R600-CHECK: MOV [[VAL]], KC0[0].Z
; SI-CHECK: @ngroups_z
-; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 2
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 2
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @ngroups_z (i32 addrspace(1)* %out) {
entry:
@@ -44,11 +44,11 @@ entry:
}
; R600-CHECK: @global_size_x
-; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV * [[VAL]], KC0[0].W
+; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; R600-CHECK: MOV [[VAL]], KC0[0].W
; SI-CHECK: @global_size_x
-; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 3
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 3
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @global_size_x (i32 addrspace(1)* %out) {
entry:
@@ -58,11 +58,11 @@ entry:
}
; R600-CHECK: @global_size_y
-; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV * [[VAL]], KC0[1].X
+; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; R600-CHECK: MOV [[VAL]], KC0[1].X
; SI-CHECK: @global_size_y
-; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 4
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 4
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @global_size_y (i32 addrspace(1)* %out) {
entry:
@@ -72,11 +72,11 @@ entry:
}
; R600-CHECK: @global_size_z
-; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV * [[VAL]], KC0[1].Y
+; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; R600-CHECK: MOV [[VAL]], KC0[1].Y
; SI-CHECK: @global_size_z
-; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 5
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 5
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @global_size_z (i32 addrspace(1)* %out) {
entry:
@@ -86,11 +86,11 @@ entry:
}
; R600-CHECK: @local_size_x
-; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV * [[VAL]], KC0[1].Z
+; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; R600-CHECK: MOV [[VAL]], KC0[1].Z
; SI-CHECK: @local_size_x
-; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 6
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 6
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @local_size_x (i32 addrspace(1)* %out) {
entry:
@@ -100,11 +100,11 @@ entry:
}
; R600-CHECK: @local_size_y
-; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV * [[VAL]], KC0[1].W
+; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; R600-CHECK: MOV [[VAL]], KC0[1].W
; SI-CHECK: @local_size_y
-; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 7
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 7
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @local_size_y (i32 addrspace(1)* %out) {
entry:
@@ -114,11 +114,11 @@ entry:
}
; R600-CHECK: @local_size_z
-; R600-CHECK: RAT_WRITE_CACHELESS_32_eg [[VAL:T[0-9]+\.X]]
-; R600-CHECK: MOV * [[VAL]], KC0[2].X
+; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
+; R600-CHECK: MOV [[VAL]], KC0[2].X
; SI-CHECK: @local_size_z
-; SI-CHECK: S_LOAD_DWORD [[VAL:SGPR[0-9]+]], SGPR0_SGPR1, 8
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], [[VAL]]
+; SI-CHECK: S_LOAD_DWORD [[VAL:s[0-9]+]], s[0:1], 8
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @local_size_z (i32 addrspace(1)* %out) {
entry:
@@ -127,12 +127,12 @@ entry:
ret void
}
-; The tgid values are stored in SGPRs offset by the number of user SGPRs.
-; Currently we always use exactly 2 user SGPRs for the pointer to the
+; The tgid values are stored in SGPRs offset by the number of user SGPRs.
+; Currently we always use exactly 2 user SGPRs for the pointer to the
; kernel arguments, but this may change in the future.
; SI-CHECK: @tgid_x
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], SGPR2
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s2
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @tgid_x (i32 addrspace(1)* %out) {
entry:
@@ -142,7 +142,7 @@ entry:
}
; SI-CHECK: @tgid_y
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], SGPR3
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s3
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @tgid_y (i32 addrspace(1)* %out) {
entry:
@@ -152,7 +152,7 @@ entry:
}
; SI-CHECK: @tgid_z
-; SI-CHECK: V_MOV_B32_e32 [[VVAL:VGPR[0-9]+]], SGPR4
+; SI-CHECK: V_MOV_B32_e32 [[VVAL:v[0-9]+]], s4
; SI-CHECK: BUFFER_STORE_DWORD [[VVAL]]
define void @tgid_z (i32 addrspace(1)* %out) {
entry:
@@ -162,7 +162,7 @@ entry:
}
; SI-CHECK: @tidig_x
-; SI-CHECK: BUFFER_STORE_DWORD VGPR0
+; SI-CHECK: BUFFER_STORE_DWORD v0
define void @tidig_x (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.x() #0
@@ -171,7 +171,7 @@ entry:
}
; SI-CHECK: @tidig_y
-; SI-CHECK: BUFFER_STORE_DWORD VGPR1
+; SI-CHECK: BUFFER_STORE_DWORD v1
define void @tidig_y (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.y() #0
@@ -180,7 +180,7 @@ entry:
}
; SI-CHECK: @tidig_z
-; SI-CHECK: BUFFER_STORE_DWORD VGPR2
+; SI-CHECK: BUFFER_STORE_DWORD v2
define void @tidig_z (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.z() #0
diff --git a/test/CodeGen/R600/wrong-transalu-pos-fix.ll b/test/CodeGen/R600/wrong-transalu-pos-fix.ll
new file mode 100644
index 0000000..b1cbe3f
--- /dev/null
+++ b/test/CodeGen/R600/wrong-transalu-pos-fix.ll
@@ -0,0 +1,86 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; We want all MULLO_INT instructions to be last in their instruction group
+;CHECK: @fill3d
+;CHECK-NOT: MULLO_INT T[0-9]+
+
+; ModuleID = 'radeon'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
+target triple = "r600--"
+
+; Function Attrs: nounwind
+define void @fill3d(i32 addrspace(1)* nocapture %out) #0 {
+entry:
+ %x.i = tail call i32 @llvm.r600.read.global.size.x() #1
+ %y.i18 = tail call i32 @llvm.r600.read.global.size.y() #1
+ %mul = mul i32 %y.i18, %x.i
+ %z.i17 = tail call i32 @llvm.r600.read.global.size.z() #1
+ %mul3 = mul i32 %mul, %z.i17
+ %x.i.i = tail call i32 @llvm.r600.read.tgid.x() #1
+ %x.i12.i = tail call i32 @llvm.r600.read.local.size.x() #1
+ %mul26.i = mul i32 %x.i12.i, %x.i.i
+ %x.i4.i = tail call i32 @llvm.r600.read.tidig.x() #1
+ %add.i16 = add i32 %x.i4.i, %mul26.i
+ %mul7 = mul i32 %add.i16, %y.i18
+ %y.i.i = tail call i32 @llvm.r600.read.tgid.y() #1
+ %y.i14.i = tail call i32 @llvm.r600.read.local.size.y() #1
+ %mul30.i = mul i32 %y.i14.i, %y.i.i
+ %y.i6.i = tail call i32 @llvm.r600.read.tidig.y() #1
+ %add.i14 = add i32 %mul30.i, %mul7
+ %mul819 = add i32 %add.i14, %y.i6.i
+ %add = mul i32 %mul819, %z.i17
+ %z.i.i = tail call i32 @llvm.r600.read.tgid.z() #1
+ %z.i16.i = tail call i32 @llvm.r600.read.local.size.z() #1
+ %mul33.i = mul i32 %z.i16.i, %z.i.i
+ %z.i8.i = tail call i32 @llvm.r600.read.tidig.z() #1
+ %add.i = add i32 %z.i8.i, %mul33.i
+ %add13 = add i32 %add.i, %add
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %out, i32 %add13
+ store i32 %mul3, i32 addrspace(1)* %arrayidx, align 4
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tgid.z() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.local.size.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.local.size.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.local.size.z() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.tidig.z() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.global.size.x() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.global.size.y() #1
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.r600.read.global.size.z() #1
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!opencl.kernels = !{!0, !1, !2}
+
+!0 = metadata !{null}
+!1 = metadata !{null}
+!2 = metadata !{void (i32 addrspace(1)*)* @fill3d}
diff --git a/test/CodeGen/R600/xor.ll b/test/CodeGen/R600/xor.ll
index f52729d..c12b0c1 100644
--- a/test/CodeGen/R600/xor.ll
+++ b/test/CodeGen/R600/xor.ll
@@ -1,13 +1,13 @@
;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
-;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s
+;RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI-CHECK %s
;EG-CHECK: @xor_v2i32
-;EG-CHECK: XOR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: XOR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;SI-CHECK: @xor_v2i32
-;SI-CHECK: V_XOR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_XOR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_XOR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_XOR_B32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
@@ -19,16 +19,16 @@ define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in
}
;EG-CHECK: @xor_v4i32
-;EG-CHECK: XOR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: XOR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: XOR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-;EG-CHECK: XOR_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
;SI-CHECK: @xor_v4i32
-;SI-CHECK: V_XOR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_XOR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_XOR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
-;SI-CHECK: V_XOR_B32_e32 VGPR{{[0-9]+, VGPR[0-9]+, VGPR[0-9]+}}
+;SI-CHECK: V_XOR_B32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_XOR_B32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_XOR_B32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+;SI-CHECK: V_XOR_B32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
%a = load <4 x i32> addrspace(1) * %in0
@@ -37,3 +37,20 @@ define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in
store <4 x i32> %result, <4 x i32> addrspace(1)* %out
ret void
}
+
+;EG-CHECK: @xor_i1
+;EG-CHECK: XOR_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], PS}}
+
+;SI-CHECK: @xor_i1
+;SI-CHECK: S_XOR_B64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
+
+define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
+ %a = load float addrspace(1) * %in0
+ %b = load float addrspace(1) * %in1
+ %acmp = fcmp oge float %a, 0.000000e+00
+ %bcmp = fcmp oge float %b, 0.000000e+00
+ %xor = xor i1 %acmp, %bcmp
+ %result = select i1 %xor, float %a, float %b
+ store float %result, float addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/zero_extend.ll b/test/CodeGen/R600/zero_extend.ll
index 413b849..481b3b3 100644
--- a/test/CodeGen/R600/zero_extend.ll
+++ b/test/CodeGen/R600/zero_extend.ll
@@ -1,13 +1,13 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600-CHECK
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s --check-prefix=SI-CHECK
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI-CHECK
; R600-CHECK: @test
-; R600-CHECK: RAT_WRITE_CACHELESS_32_eg
-; R600-CHECK: RAT_WRITE_CACHELESS_32_eg
+; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW
+; R600-CHECK: MEM_RAT_CACHELESS STORE_RAW
; SI-CHECK: @test
-; SI-CHECK: V_MOV_B32_e32 [[ZERO:VGPR[0-9]]], 0
-; SI-CHECK: BUFFER_STORE_DWORDX2 VGPR0_[[ZERO]]
+; SI-CHECK: V_MOV_B32_e32 v[[ZERO:[0-9]]], 0
+; SI-CHECK: BUFFER_STORE_DWORDX2 v[0:[[ZERO]]{{\]}}
define void @test(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
entry:
%0 = mul i32 %a, %b
diff --git a/test/CodeGen/SI/sanity.ll b/test/CodeGen/SI/sanity.ll
deleted file mode 100644
index 62cdcf5..0000000
--- a/test/CodeGen/SI/sanity.ll
+++ /dev/null
@@ -1,37 +0,0 @@
-;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s
-
-; CHECK: S_ENDPGM
-
-define void @main() {
-main_body:
- call void @llvm.AMDGPU.shader.type(i32 1)
- %0 = load <4 x i32> addrspace(2)* addrspace(8)* inttoptr (i32 6 to <4 x i32> addrspace(2)* addrspace(8)*)
- %1 = getelementptr <4 x i32> addrspace(2)* %0, i32 0
- %2 = load <4 x i32> addrspace(2)* %1
- %3 = call i32 @llvm.SI.vs.load.buffer.index()
- %4 = call <4 x float> @llvm.SI.vs.load.input(<4 x i32> %2, i32 0, i32 %3)
- %5 = extractelement <4 x float> %4, i32 0
- %6 = extractelement <4 x float> %4, i32 1
- %7 = extractelement <4 x float> %4, i32 2
- %8 = extractelement <4 x float> %4, i32 3
- %9 = load <4 x i32> addrspace(2)* addrspace(8)* inttoptr (i32 6 to <4 x i32> addrspace(2)* addrspace(8)*)
- %10 = getelementptr <4 x i32> addrspace(2)* %9, i32 1
- %11 = load <4 x i32> addrspace(2)* %10
- %12 = call i32 @llvm.SI.vs.load.buffer.index()
- %13 = call <4 x float> @llvm.SI.vs.load.input(<4 x i32> %11, i32 0, i32 %12)
- %14 = extractelement <4 x float> %13, i32 0
- %15 = extractelement <4 x float> %13, i32 1
- %16 = extractelement <4 x float> %13, i32 2
- %17 = extractelement <4 x float> %13, i32 3
- call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %14, float %15, float %16, float %17)
- call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %5, float %6, float %7, float %8)
- ret void
-}
-
-declare void @llvm.AMDGPU.shader.type(i32)
-
-declare i32 @llvm.SI.vs.load.buffer.index() readnone
-
-declare <4 x float> @llvm.SI.vs.load.input(<4 x i32>, i32, i32)
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/SPARC/2011-01-11-CC.ll b/test/CodeGen/SPARC/2011-01-11-CC.ll
index edbcb49..50f3a65 100644
--- a/test/CodeGen/SPARC/2011-01-11-CC.ll
+++ b/test/CodeGen/SPARC/2011-01-11-CC.ll
@@ -1,5 +1,6 @@
; RUN: llc -march=sparc <%s | FileCheck %s -check-prefix=V8
; RUN: llc -march=sparc -mattr=v9 <%s | FileCheck %s -check-prefix=V9
+; RUN: llc -mtriple=sparc64-unknown-linux <%s | FileCheck %s -check-prefix=SPARC64
define i32 @test_addx(i64 %a, i64 %b, i64 %c) nounwind readnone noinline {
@@ -65,9 +66,11 @@ define i32 @test_select_int_fcc(float %f, i32 %a, i32 %b) nounwind readnone noin
entry:
;V8-LABEL: test_select_int_fcc:
;V8: fcmps
+;V8-NEXT: nop
;V8: {{fbe|fbne}}
;V9-LABEL: test_select_int_fcc:
;V9: fcmps
+;V9-NEXT-NOT: nop
;V9-NOT: {{fbe|fbne}}
;V9: mov{{e|ne}} %fcc0
%0 = fcmp une float %f, 0.000000e+00
@@ -94,12 +97,95 @@ define double @test_select_dfp_fcc(double %f, double %f1, double %f2) nounwind r
entry:
;V8-LABEL: test_select_dfp_fcc:
;V8: fcmpd
+;V8-NEXT: nop
;V8: {{fbne|fbe}}
;V9-LABEL: test_select_dfp_fcc:
;V9: fcmpd
+;V9-NEXT-NOT: nop
;V9-NOT: {{fbne|fbe}}
;V9: fmovd{{e|ne}} %fcc0
%0 = fcmp une double %f, 0.000000e+00
%1 = select i1 %0, double %f1, double %f2
ret double %1
}
+
+define i32 @test_float_cc(double %a, double %b, i32 %c, i32 %d) {
+entry:
+; V8-LABEL: test_float_cc
+; V8: fcmpd
+; V8: {{fbl|fbuge}} .LBB
+; V8: fcmpd
+; V8: {{fbule|fbg}} .LBB
+
+; V9-LABEL: test_float_cc
+; V9: fcmpd
+; V9: {{fbl|fbuge}} .LBB
+; V9: fcmpd
+; V9: {{fbule|fbg}} .LBB
+
+ %0 = fcmp uge double %a, 0.000000e+00
+ br i1 %0, label %loop, label %loop.2
+
+loop:
+ %1 = icmp eq i32 %c, 10
+ br i1 %1, label %loop, label %exit.0
+
+loop.2:
+ %2 = fcmp ogt double %b, 0.000000e+00
+ br i1 %2, label %exit.1, label %loop
+
+exit.0:
+ ret i32 0
+
+exit.1:
+ ret i32 1
+}
+
+; V8-LABEL: test_adde_sube
+; V8: addcc
+; V8: addxcc
+; V8: addxcc
+; V8: addxcc
+; V8: subcc
+; V8: subxcc
+; V8: subxcc
+; V8: subxcc
+
+
+; V9-LABEL: test_adde_sube
+; V9: addcc
+; V9: addxcc
+; V9: addxcc
+; V9: addxcc
+; V9: subcc
+; V9: subxcc
+; V9: subxcc
+; V9: subxcc
+
+; SPARC64-LABEL: test_adde_sube
+; SPARC64: addcc
+; SPARC64: addxcc
+; SPARC64: addxcc
+; SPARC64: addxcc
+; SPARC64: subcc
+; SPARC64: subxcc
+; SPARC64: subxcc
+; SPARC64: subxcc
+
+
+define void @test_adde_sube(i8* %a, i8* %b, i8* %sum, i8* %diff) {
+entry:
+ %0 = bitcast i8* %a to i128*
+ %1 = bitcast i8* %b to i128*
+ %2 = load i128* %0
+ %3 = load i128* %1
+ %4 = add i128 %2, %3
+ %5 = bitcast i8* %sum to i128*
+ store i128 %4, i128* %5
+ tail call void asm sideeffect "", "=*m,*m"(i128 *%0, i128* %5) nounwind
+ %6 = load i128* %0
+ %7 = sub i128 %2, %6
+ %8 = bitcast i8* %diff to i128*
+ store i128 %7, i128* %8
+ ret void
+}
diff --git a/test/CodeGen/SPARC/2011-01-11-Call.ll b/test/CodeGen/SPARC/2011-01-11-Call.ll
index 7350e92..a0f478e 100644
--- a/test/CodeGen/SPARC/2011-01-11-Call.ll
+++ b/test/CodeGen/SPARC/2011-01-11-Call.ll
@@ -1,4 +1,24 @@
; RUN: llc -march=sparc -O0 <%s
+; RUN: llc -march=sparc <%s | FileCheck %s --check-prefix=V8
+; RUN: llc -march=sparcv9 <%s | FileCheck %s --check-prefix=V9
+
+; V8-LABEL: test
+; V8: save %sp
+; V8: call foo
+; V8-NEXT: nop
+; V8: call bar
+; V8-NEXT: nop
+; V8: jmp %i7+8
+; V8-NEXT: restore
+
+; V9-LABEL: test
+; V9: save %sp
+; V9: call foo
+; V9-NEXT: nop
+; V9: call bar
+; V9-NEXT: nop
+; V9: jmp %i7+8
+; V9-NEXT: restore
define void @test() nounwind {
entry:
@@ -11,3 +31,23 @@ declare i32 @foo(...)
declare void @bar(...)
+
+; V8-LABEL: test_tail_call_with_return
+; V8: save %sp
+; V8: call foo
+; V8-NEXT: nop
+; V8: jmp %i7+8
+; V8-NEXT: restore %g0, %o0, %o0
+
+; V9-LABEL: test_tail_call_with_return
+; V9: save %sp
+; V9: call foo
+; V9-NEXT: nop
+; V9: jmp %i7+8
+; V9-NEXT: restore %g0, %o0, %o0
+
+define i32 @test_tail_call_with_return() nounwind {
+entry:
+ %0 = tail call i32 (...)* @foo() nounwind
+ ret i32 %0
+}
diff --git a/test/CodeGen/SPARC/2013-05-17-CallFrame.ll b/test/CodeGen/SPARC/2013-05-17-CallFrame.ll
index 9e9e821..81f586f 100644
--- a/test/CodeGen/SPARC/2013-05-17-CallFrame.ll
+++ b/test/CodeGen/SPARC/2013-05-17-CallFrame.ll
@@ -1,10 +1,20 @@
-; RUN: llc -march=sparc < %s | FileCheck %s
+; RUN: llc -march=sparc < %s | FileCheck %s --check-prefix=V8
+; RUN: llc -march=sparcv9 < %s | FileCheck %s --check-prefix=SPARC64
+
+; V8-LABEL: variable_alloca_with_adj_call_stack
+; V8: save %sp, -96, %sp
+; V8: add {{.+}}, 96, %o0
+; V8: add %sp, -16, %sp
+; V8: call foo
+; V8: add %sp, 16, %sp
+
+; SPARC64-LABEL: variable_alloca_with_adj_call_stack
+; SPARC64: save %sp, -128, %sp
+; SPARC64: add {{.+}}, 2175, %o0
+; SPARC64: add %sp, -80, %sp
+; SPARC64: call foo
+; SPARC64: add %sp, 80, %sp
-; CHECK: variable_alloca_with_adj_call_stack
-; CHECK: save %sp, -96, %sp
-; CHECK: add %sp, -16, %sp
-; CHECK: call foo
-; CHECK: add %sp, 16, %sp
define void @variable_alloca_with_adj_call_stack(i32 %num) {
entry:
%0 = alloca i8, i32 %num, align 8
diff --git a/test/CodeGen/SPARC/64abi.ll b/test/CodeGen/SPARC/64abi.ll
index 5a7eb40..8b752a1 100644
--- a/test/CodeGen/SPARC/64abi.ll
+++ b/test/CodeGen/SPARC/64abi.ll
@@ -376,3 +376,38 @@ define signext i32 @ret_nosext(i32 signext %a0) {
define signext i32 @ret_nozext(i32 signext %a0) {
ret i32 %a0
}
+
+; CHECK-LABEL: test_register_directive
+; CHECK: .register %g2, #scratch
+; CHECK: .register %g3, #scratch
+; CHECK: add %i0, 2, %g2
+; CHECK: add %i0, 3, %g3
+define i32 @test_register_directive(i32 %i0) {
+entry:
+ %0 = add nsw i32 %i0, 2
+ %1 = add nsw i32 %i0, 3
+ tail call void asm sideeffect "", "r,r,~{l0},~{l1},~{l2},~{l3},~{l4},~{l5},~{l6},~{l7},~{i0},~{i1},~{i2},~{i3},~{i4},~{i5},~{i6},~{i7},~{o0},~{o1},~{o2},~{o3},~{o4},~{o5},~{o6},~{o7},~{g1},~{g4},~{g5},~{g6},~{g7}"(i32 %0, i32 %1)
+ %2 = add nsw i32 %0, %1
+ ret i32 %2
+}
+
+; CHECK-LABEL: test_large_stack
+
+; CHECK: sethi 16, %g1
+; CHECK: xor %g1, -176, %g1
+; CHECK: save %sp, %g1, %sp
+
+; CHECK: sethi 14, %g1
+; CHECK: xor %g1, -1, %g1
+; CHECK: add %g1, %fp, %g1
+; CHECK: call use_buf
+
+define i32 @test_large_stack() {
+entry:
+ %buffer1 = alloca [16384 x i8], align 8
+ %buffer1.sub = getelementptr inbounds [16384 x i8]* %buffer1, i32 0, i32 0
+ %0 = call i32 @use_buf(i32 16384, i8* %buffer1.sub)
+ ret i32 %0
+}
+
+declare i32 @use_buf(i32, i8*)
diff --git a/test/CodeGen/SPARC/64bit.ll b/test/CodeGen/SPARC/64bit.ll
index f778f9d..f5ed047 100644
--- a/test/CodeGen/SPARC/64bit.ll
+++ b/test/CodeGen/SPARC/64bit.ll
@@ -285,3 +285,26 @@ entry:
store i64 0, i64* %0, align 8
ret i64 0
}
+
+; CHECK-LABEL: bit_ops
+; CHECK: popc
+
+; OPT-LABEL: bit_ops
+; OPT: popc
+
+define i64 @bit_ops(i64 %arg) {
+entry:
+ %0 = tail call i64 @llvm.ctpop.i64(i64 %arg)
+ %1 = tail call i64 @llvm.ctlz.i64(i64 %arg, i1 true)
+ %2 = tail call i64 @llvm.cttz.i64(i64 %arg, i1 true)
+ %3 = tail call i64 @llvm.bswap.i64(i64 %arg)
+ %4 = add i64 %0, %1
+ %5 = add i64 %2, %3
+ %6 = add i64 %4, %5
+ ret i64 %6
+}
+
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
+declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
+declare i64 @llvm.bswap.i64(i64) nounwind readnone
diff --git a/test/CodeGen/SPARC/64cond.ll b/test/CodeGen/SPARC/64cond.ll
index bdc5e70..7451b04 100644
--- a/test/CodeGen/SPARC/64cond.ll
+++ b/test/CodeGen/SPARC/64cond.ll
@@ -109,3 +109,17 @@ entry:
%rv = select i1 %tobool, i64 123, i64 0
ret i64 %rv
}
+
+; CHECK-LABEL: setcc_resultty
+; CHECK: cmp
+; CHECK: movne %xcc, 1, [[R:%[gilo][0-7]]]
+; CHECK: or [[R]], %i1, %i0
+
+define i1 @setcc_resultty(i64 %a, i1 %b) {
+ %a0 = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %a, i64 32)
+ %a1 = extractvalue { i64, i1 } %a0, 1
+ %a4 = or i1 %a1, %b
+ ret i1 %a4
+}
+
+declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64)
diff --git a/test/CodeGen/SPARC/constpool.ll b/test/CodeGen/SPARC/constpool.ll
index d8b7b15..b861676 100644
--- a/test/CodeGen/SPARC/constpool.ll
+++ b/test/CodeGen/SPARC/constpool.ll
@@ -39,8 +39,10 @@ entry:
; v8pic32: sethi %hi(.LCPI0_0), %[[R1:[gilo][0-7]]]
; v8pic32: add %[[R1]], %lo(.LCPI0_0), %[[Goffs:[gilo][0-7]]]
; v8pic32: ld [%[[GOT:[gilo][0-7]]]+%[[Goffs]]], %[[Gaddr:[gilo][0-7]]]
-; v8pic32: jmp %o7+8
; v8pic32: ld [%[[Gaddr]]], %f0
+; v8pic32: jmp %i7+8
+; v8pic32: restore
+
; v9pic32: floatCP
@@ -48,6 +50,8 @@ entry:
; v9pic32: sethi %hi(.LCPI0_0), %[[R1:[gilo][0-7]]]
; v9pic32: add %[[R1]], %lo(.LCPI0_0), %[[Goffs:[gilo][0-7]]]
; v9pic32: ldx [%[[GOT:[gilo][0-7]]]+%[[Goffs]]], %[[Gaddr:[gilo][0-7]]]
-; v9pic32: jmp %o7+8
; v9pic32: ld [%[[Gaddr]]], %f1
+; v9pic32: jmp %i7+8
+; v9pic32: restore
+
diff --git a/test/CodeGen/SPARC/exception.ll b/test/CodeGen/SPARC/exception.ll
new file mode 100644
index 0000000..cb5b6e5
--- /dev/null
+++ b/test/CodeGen/SPARC/exception.ll
@@ -0,0 +1,112 @@
+; RUN: llc < %s -march=sparc | FileCheck %s
+
+
+%struct.__fundamental_type_info_pseudo = type { %struct.__type_info_pseudo }
+%struct.__type_info_pseudo = type { i8*, i8* }
+
+@_ZTIi = external constant %struct.__fundamental_type_info_pseudo
+@_ZTIf = external constant %struct.__fundamental_type_info_pseudo
+@.cst = linker_private unnamed_addr constant [12 x i8] c"catched int\00", align 64
+@.cst1 = linker_private unnamed_addr constant [14 x i8] c"catched float\00", align 64
+
+; CHECK-LABEL: main:
+; CHECK: .cfi_startproc
+; CHECK: .cfi_def_cfa_register 30
+; CHECK: .cfi_window_save
+; CHECK: .cfi_register 15, 31
+
+; CHECK: call __cxa_throw
+; CHECK: call __cxa_throw
+
+; CHECK: call __cxa_begin_catch
+; CHECK: call __cxa_end_catch
+
+; CHECK: call __cxa_begin_catch
+; CHECK: call __cxa_end_catch
+
+; CHECK: .cfi_endproc
+
+define i32 @main(i32 %argc, i8** nocapture readnone %argv) unnamed_addr #0 {
+entry:
+ %0 = icmp eq i32 %argc, 2
+ %1 = tail call i8* @__cxa_allocate_exception(i32 4) #1
+ br i1 %0, label %"3", label %"4"
+
+"3": ; preds = %entry
+ %2 = bitcast i8* %1 to i32*
+ store i32 0, i32* %2, align 4
+ invoke void @__cxa_throw(i8* %1, i8* bitcast (%struct.__fundamental_type_info_pseudo* @_ZTIi to i8*), void (i8*)* null) #2
+ to label %3 unwind label %"8"
+
+; <label>:3 ; preds = %"3"
+ unreachable
+
+"4": ; preds = %entry
+ %4 = bitcast i8* %1 to float*
+ store float 1.000000e+00, float* %4, align 4
+
+
+ invoke void @__cxa_throw(i8* %1, i8* bitcast (%struct.__fundamental_type_info_pseudo* @_ZTIf to i8*), void (i8*)* null) #2
+ to label %5 unwind label %"8"
+
+; <label>:5 ; preds = %"4"
+ unreachable
+
+"5": ; preds = %"13", %"11"
+ %6 = phi i32 [ 2, %"13" ], [ 0, %"11" ]
+ ret i32 %6
+
+"8": ; preds = %"4", %"3"
+ %exc = landingpad { i8*, i32 } personality i32 (i32, i64, i8*, i8*)* @__gxx_personality_v0
+ catch %struct.__fundamental_type_info_pseudo* @_ZTIi
+ catch %struct.__fundamental_type_info_pseudo* @_ZTIf
+ %exc_ptr12 = extractvalue { i8*, i32 } %exc, 0
+ %filter13 = extractvalue { i8*, i32 } %exc, 1
+ %typeid = tail call i32 @llvm.eh.typeid.for(i8* bitcast (%struct.__fundamental_type_info_pseudo* @_ZTIi to i8*))
+ %7 = icmp eq i32 %filter13, %typeid
+ br i1 %7, label %"11", label %8
+
+; <label>:8 ; preds = %"8"
+ %typeid8 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (%struct.__fundamental_type_info_pseudo* @_ZTIf to i8*))
+ %9 = icmp eq i32 %filter13, %typeid8
+ br i1 %9, label %"13", label %"9"
+
+"9": ; preds = %8
+ resume { i8*, i32 } %exc
+
+"11": ; preds = %"8"
+ %10 = tail call i8* @__cxa_begin_catch(i8* %exc_ptr12) #1
+ %11 = tail call i32 @puts(i8* getelementptr inbounds ([12 x i8]* @.cst, i32 0, i32 0))
+ tail call void @__cxa_end_catch() #1
+ br label %"5"
+
+"13": ; preds = %8
+ %12 = tail call i8* @__cxa_begin_catch(i8* %exc_ptr12) #1
+ %13 = tail call i32 @puts(i8* getelementptr inbounds ([14 x i8]* @.cst1, i32 0, i32 0))
+ tail call void @__cxa_end_catch() #1
+ br label %"5"
+}
+
+; Function Attrs: nounwind
+declare i8* @__cxa_allocate_exception(i32) #1
+
+; Function Attrs: noreturn
+declare void @__cxa_throw(i8*, i8*, void (i8*)*) #2
+
+declare void @__cxa_end_catch()
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.eh.typeid.for(i8*) #3
+
+; Function Attrs: nounwind
+declare i8* @__cxa_begin_catch(i8*) #1
+
+; Function Attrs: nounwind
+declare i32 @puts(i8* nocapture readonly) #1
+
+declare i32 @__gxx_personality_v0(i32, i64, i8*, i8*)
+
+attributes #0 = { "no-frame-pointer-elim-non-leaf"="false" }
+attributes #1 = { nounwind }
+attributes #2 = { noreturn }
+attributes #3 = { nounwind readnone }
diff --git a/test/CodeGen/SPARC/float.ll b/test/CodeGen/SPARC/float.ll
index 8dfd371..6636704 100644
--- a/test/CodeGen/SPARC/float.ll
+++ b/test/CodeGen/SPARC/float.ll
@@ -1,7 +1,7 @@
; RUN: llc -march=sparc < %s | FileCheck %s -check-prefix=V8
; RUN: llc -march=sparc -O0 < %s | FileCheck %s -check-prefix=V8-UNOPT
; RUN: llc -march=sparc -mattr=v9 < %s | FileCheck %s -check-prefix=V9
-
+; RUN: llc -mtriple=sparc64-unknown-linux < %s | FileCheck %s -check-prefix=SPARC64
; V8-LABEL: test_neg:
; V8: call get_double
@@ -16,6 +16,9 @@
; V9-LABEL: test_neg:
; V9: fnegd %f0, %f0
+; SPARC64-LABEL: test_neg:
+; SPARC64: fnegd %f0, %f0
+
define double @test_neg() {
entry:
%0 = tail call double @get_double()
@@ -35,6 +38,10 @@ entry:
; V9-LABEL: test_abs:
; V9: fabsd %f0, %f0
+
+; SPARC64-LABEL: test_abs:
+; SPARC64: fabsd %f0, %f0
+
define double @test_abs() {
entry:
%0 = tail call double @get_double()
@@ -45,3 +52,198 @@ entry:
declare double @get_double()
declare double @llvm.fabs.f64(double) nounwind readonly
+; V8-LABEL: test_v9_floatreg:
+; V8: fsubd {{.+}}, {{.+}}, {{.+}}
+; V8: faddd {{.+}}, {{.+}}, [[R:%f(((1|2)?(0|2|4|6|8))|30)]]
+; V8: std [[R]], [%{{.+}}]
+; V8: ldd [%{{.+}}], %f0
+
+; V9-LABEL: test_v9_floatreg:
+; V9: fsubd {{.+}}, {{.+}}, {{.+}}
+; V9: faddd {{.+}}, {{.+}}, [[R:%f((3(2|4|6|8))|((4|5)(0|2|4|6|8))|(60|62))]]
+; V9: fmovd [[R]], %f0
+
+; SPARC64-LABEL: test_v9_floatreg:
+; SPARC64: fsubd {{.+}}, {{.+}}, {{.+}}
+; SPARC64: faddd {{.+}}, {{.+}}, [[R:%f((3(2|4|6|8))|((4|5)(0|2|4|6|8))|(60|62))]]
+; SPARC64: fmovd [[R]], %f0
+
+define double @test_v9_floatreg() {
+entry:
+ %0 = tail call double @get_double()
+ %1 = tail call double @get_double()
+ %2 = fsub double %0, %1
+ tail call void asm sideeffect "", "~{f0},~{f2},~{f3},~{f4},~{f5},~{f6},~{f7},~{f8},~{f9},~{f10},~{f11},~{f12},~{f13},~{f14},~{f15},~{f16},~{f17},~{f18},~{f19},~{f20},~{f21},~{f22},~{f23},~{f24},~{f25},~{f26},~{f27},~{f28},~{f29},~{f30},~{f31}"()
+ %3 = fadd double %2, %2
+ ret double %3
+}
+
+; V8-LABEL: test_xtos_stox
+; V8: call __floatdisf
+; V8: call __fixsfdi
+
+; V9-LABEL: test_xtos_stox
+; V9: call __floatdisf
+; V9: call __fixsfdi
+
+; SPARC64-LABEL: test_xtos_stox
+; SPARC64: fxtos
+; SPARC64: fstox
+
+define void @test_xtos_stox(i64 %a, i64* %ptr0, float* %ptr1) {
+entry:
+ %0 = sitofp i64 %a to float
+ store float %0, float* %ptr1, align 8
+ %1 = fptosi float %0 to i64
+ store i64 %1, i64* %ptr0, align 8
+ ret void
+}
+
+; V8-LABEL: test_itos_stoi
+; V8: fitos
+; V8: fstoi
+
+; V9-LABEL: test_itos_stoi
+; V9: fitos
+; V9: fstoi
+
+; SPARC64-LABEL: test_itos_stoi
+; SPARC64: fitos
+; SPARC64: fstoi
+
+define void @test_itos_stoi(i32 %a, i32* %ptr0, float* %ptr1) {
+entry:
+ %0 = sitofp i32 %a to float
+ store float %0, float* %ptr1, align 8
+ %1 = fptosi float %0 to i32
+ store i32 %1, i32* %ptr0, align 8
+ ret void
+}
+
+
+; V8-LABEL: test_xtod_dtox
+; V8: call __floatdidf
+; V8: call __fixdfdi
+
+; V9-LABEL: test_xtod_dtox
+; V9: call __floatdidf
+; V9: call __fixdfdi
+
+; SPARC64-LABEL: test_xtod_dtox
+; SPARC64: fxtod
+; SPARC64: fdtox
+
+define void @test_xtod_dtox(i64 %a, i64* %ptr0, double* %ptr1) {
+entry:
+ %0 = sitofp i64 %a to double
+ store double %0, double* %ptr1, align 8
+ %1 = fptosi double %0 to i64
+ store i64 %1, i64* %ptr0, align 8
+ ret void
+}
+
+; V8-LABEL: test_itod_dtoi
+; V8: fitod
+; V8: fdtoi
+
+; V9-LABEL: test_itod_dtoi
+; V9: fitod
+; V9: fdtoi
+
+; SPARC64-LABEL: test_itod_dtoi
+; SPARC64: fitod
+; SPARC64: fdtoi
+
+define void @test_itod_dtoi(i32 %a, i32* %ptr0, double* %ptr1) {
+entry:
+ %0 = sitofp i32 %a to double
+ store double %0, double* %ptr1, align 8
+ %1 = fptosi double %0 to i32
+ store i32 %1, i32* %ptr0, align 8
+ ret void
+}
+
+; V8-LABEL: test_uxtos_stoux
+; V8: call __floatundisf
+; V8: call __fixunssfdi
+
+; V9-LABEL: test_uxtos_stoux
+; V9: call __floatundisf
+; V9: call __fixunssfdi
+
+; SPARC64-LABEL: test_uxtos_stoux
+; SPARC64-NOT: call __floatundisf
+; SPARC64-NOT: call __fixunssfdi
+
+define void @test_uxtos_stoux(i64 %a, i64* %ptr0, float* %ptr1) {
+entry:
+ %0 = uitofp i64 %a to float
+ store float %0, float* %ptr1, align 8
+ %1 = fptoui float %0 to i64
+ store i64 %1, i64* %ptr0, align 8
+ ret void
+}
+
+; V8-LABEL: test_utos_stou
+; V8: fdtos
+; V8: fstoi
+
+; V9-LABEL: test_utos_stou
+; V9: fdtos
+; V9: fstoi
+
+; SPARC64-LABEL: test_utos_stou
+; SPARC64: fdtos
+; SPARC64: fstoi
+
+define void @test_utos_stou(i32 %a, i32* %ptr0, float* %ptr1) {
+entry:
+ %0 = uitofp i32 %a to float
+ store float %0, float* %ptr1, align 8
+ %1 = fptoui float %0 to i32
+ store i32 %1, i32* %ptr0, align 8
+ ret void
+}
+
+
+; V8-LABEL: test_uxtod_dtoux
+; V8: call __floatundidf
+; V8: call __fixunsdfdi
+
+; V9-LABEL: test_uxtod_dtoux
+; V9: call __floatundidf
+; V9: call __fixunsdfdi
+
+; SPARC64-LABEL: test_uxtod_dtoux
+; SPARC64-NOT: call __floatundidf
+; SPARC64-NOT: call __fixunsdfdi
+
+define void @test_uxtod_dtoux(i64 %a, i64* %ptr0, double* %ptr1) {
+entry:
+ %0 = uitofp i64 %a to double
+ store double %0, double* %ptr1, align 8
+ %1 = fptoui double %0 to i64
+ store i64 %1, i64* %ptr0, align 8
+ ret void
+}
+
+; V8-LABEL: test_utod_dtou
+; V8-NOT: fitod
+; V8: fdtoi
+
+; V9-LABEL: test_utod_dtou
+; V9-NOT: fitod
+; V9: fdtoi
+
+; SPARC64-LABEL: test_utod_dtou
+; SPARC64-NOT: fitod
+; SPARC64: fdtoi
+
+define void @test_utod_dtou(i32 %a, double %b, i32* %ptr0, double* %ptr1) {
+entry:
+ %0 = uitofp i32 %a to double
+ store double %0, double* %ptr1, align 8
+ %1 = fptoui double %b to i32
+ store i32 %1, i32* %ptr0, align 8
+ ret void
+}
diff --git a/test/CodeGen/SPARC/fp128.ll b/test/CodeGen/SPARC/fp128.ll
new file mode 100644
index 0000000..c761361
--- /dev/null
+++ b/test/CodeGen/SPARC/fp128.ll
@@ -0,0 +1,234 @@
+; RUN: llc < %s -march=sparc -mattr=hard-quad-float | FileCheck %s --check-prefix=HARD
+; RUN: llc < %s -march=sparc -mattr=-hard-quad-float | FileCheck %s --check-prefix=SOFT
+
+
+; HARD-LABEL: f128_ops
+; HARD: ldd
+; HARD: ldd
+; HARD: ldd
+; HARD: ldd
+; HARD: faddq [[R0:.+]], [[R1:.+]], [[R2:.+]]
+; HARD: fsubq [[R2]], [[R3:.+]], [[R4:.+]]
+; HARD: fmulq [[R4]], [[R5:.+]], [[R6:.+]]
+; HARD: fdivq [[R6]], [[R2]]
+; HARD: std
+; HARD: std
+
+; SOFT-LABEL: f128_ops
+; SOFT: ldd
+; SOFT: ldd
+; SOFT: ldd
+; SOFT: ldd
+; SOFT: call _Q_add
+; SOFT: call _Q_sub
+; SOFT: call _Q_mul
+; SOFT: call _Q_div
+; SOFT: std
+; SOFT: std
+
+define void @f128_ops(fp128* noalias sret %scalar.result, fp128* byval %a, fp128* byval %b, fp128* byval %c, fp128* byval %d) {
+entry:
+ %0 = load fp128* %a, align 8
+ %1 = load fp128* %b, align 8
+ %2 = load fp128* %c, align 8
+ %3 = load fp128* %d, align 8
+ %4 = fadd fp128 %0, %1
+ %5 = fsub fp128 %4, %2
+ %6 = fmul fp128 %5, %3
+ %7 = fdiv fp128 %6, %4
+ store fp128 %7, fp128* %scalar.result, align 8
+ ret void
+}
+
+; HARD-LABEL: f128_spill
+; HARD: std %f{{.+}}, [%[[S0:.+]]]
+; HARD: std %f{{.+}}, [%[[S1:.+]]]
+; HARD-DAG: ldd [%[[S0]]], %f{{.+}}
+; HARD-DAG: ldd [%[[S1]]], %f{{.+}}
+; HARD: jmp
+
+; SOFT-LABEL: f128_spill
+; SOFT: std %f{{.+}}, [%[[S0:.+]]]
+; SOFT: std %f{{.+}}, [%[[S1:.+]]]
+; SOFT-DAG: ldd [%[[S0]]], %f{{.+}}
+; SOFT-DAG: ldd [%[[S1]]], %f{{.+}}
+; SOFT: jmp
+
+define void @f128_spill(fp128* noalias sret %scalar.result, fp128* byval %a) {
+entry:
+ %0 = load fp128* %a, align 8
+ call void asm sideeffect "", "~{f0},~{f1},~{f2},~{f3},~{f4},~{f5},~{f6},~{f7},~{f8},~{f9},~{f10},~{f11},~{f12},~{f13},~{f14},~{f15},~{f16},~{f17},~{f18},~{f19},~{f20},~{f21},~{f22},~{f23},~{f24},~{f25},~{f26},~{f27},~{f28},~{f29},~{f30},~{f31}"()
+ store fp128 %0, fp128* %scalar.result, align 8
+ ret void
+}
+
+; HARD-LABEL: f128_compare
+; HARD: fcmpq
+; HARD-NEXT: nop
+
+; SOFT-LABEL: f128_compare
+; SOFT: _Q_cmp
+
+define i32 @f128_compare(fp128* byval %f0, fp128* byval %f1, i32 %a, i32 %b) {
+entry:
+ %0 = load fp128* %f0, align 8
+ %1 = load fp128* %f1, align 8
+ %cond = fcmp ult fp128 %0, %1
+ %ret = select i1 %cond, i32 %a, i32 %b
+ ret i32 %ret
+}
+
+; HARD-LABEL: f128_compare2
+; HARD: fcmpq
+; HARD: fb{{ule|g}}
+
+; SOFT-LABEL: f128_compare2
+; SOFT: _Q_cmp
+; SOFT: cmp
+
+define i32 @f128_compare2() {
+entry:
+ %0 = fcmp ogt fp128 undef, 0xL00000000000000000000000000000000
+ br i1 %0, label %"5", label %"7"
+
+"5": ; preds = %entry
+ ret i32 0
+
+"7": ; preds = %entry
+ ret i32 1
+}
+
+
+; HARD-LABEL: f128_abs
+; HARD: fabss
+
+; SOFT-LABEL: f128_abs
+; SOFT: fabss
+
+define void @f128_abs(fp128* noalias sret %scalar.result, fp128* byval %a) {
+entry:
+ %0 = load fp128* %a, align 8
+ %1 = tail call fp128 @llvm.fabs.f128(fp128 %0)
+ store fp128 %1, fp128* %scalar.result, align 8
+ ret void
+}
+
+declare fp128 @llvm.fabs.f128(fp128) nounwind readonly
+
+; HARD-LABEL: int_to_f128
+; HARD: fitoq
+
+; SOFT-LABEL: int_to_f128
+; SOFT: _Q_itoq
+
+define void @int_to_f128(fp128* noalias sret %scalar.result, i32 %i) {
+entry:
+ %0 = sitofp i32 %i to fp128
+ store fp128 %0, fp128* %scalar.result, align 8
+ ret void
+}
+
+; HARD-LABEL: fp128_unaligned
+; HARD: ldub
+; HARD: faddq
+; HARD: stb
+; HARD: jmp
+
+; SOFT-LABEL: fp128_unaligned
+; SOFT: ldub
+; SOFT: call _Q_add
+; SOFT: stb
+; SOFT: jmp
+
+define void @fp128_unaligned(fp128* %a, fp128* %b, fp128* %c) {
+entry:
+ %0 = load fp128* %a, align 1
+ %1 = load fp128* %b, align 1
+ %2 = fadd fp128 %0, %1
+ store fp128 %2, fp128* %c, align 1
+ ret void
+}
+
+; HARD-LABEL: uint_to_f128
+; HARD: fdtoq
+
+; SOFT-LABEL: uint_to_f128
+; SOFT: _Q_utoq
+
+define void @uint_to_f128(fp128* noalias sret %scalar.result, i32 %i) {
+entry:
+ %0 = uitofp i32 %i to fp128
+ store fp128 %0, fp128* %scalar.result, align 8
+ ret void
+}
+
+; HARD-LABEL: f128_to_i32
+; HARD: fqtoi
+; HARD: fqtoi
+
+; SOFT-LABEL: f128_to_i32
+; SOFT: call _Q_qtou
+; SOFT: call _Q_qtoi
+
+
+define i32 @f128_to_i32(fp128* %a, fp128* %b) {
+entry:
+ %0 = load fp128* %a, align 8
+ %1 = load fp128* %b, align 8
+ %2 = fptoui fp128 %0 to i32
+ %3 = fptosi fp128 %1 to i32
+ %4 = add i32 %2, %3
+ ret i32 %4
+}
+
+; HARD-LABEL: test_itoq_qtoi
+; HARD: call _Q_lltoq
+; HARD: call _Q_qtoll
+; HARD: fitoq
+; HARD: fqtoi
+
+; SOFT-LABEL: test_itoq_qtoi
+; SOFT: call _Q_lltoq
+; SOFT: call _Q_qtoll
+; SOFT: call _Q_itoq
+; SOFT: call _Q_qtoi
+
+define void @test_itoq_qtoi(i64 %a, i32 %b, i64* %ptr0, fp128* %ptr1) {
+entry:
+ %0 = sitofp i64 %a to fp128
+ store fp128 %0, fp128* %ptr1, align 8
+ %1 = fptosi fp128 %0 to i64
+ store i64 %1, i64* %ptr0, align 8
+ %2 = sitofp i32 %b to fp128
+ store fp128 %2, fp128* %ptr1, align 8
+ %3 = fptosi fp128 %2 to i32
+ %4 = bitcast i64* %ptr0 to i32*
+ store i32 %3, i32* %4, align 8
+ ret void
+}
+
+; HARD-LABEL: test_utoq_qtou
+; HARD-DAG: call _Q_ulltoq
+; HARD-DAG: call _Q_qtoull
+; HARD-DAG: fdtoq
+; HARD-DAG: fqtoi
+
+; SOFT-LABEL: test_utoq_qtou
+; SOFT-DAG: call _Q_ulltoq
+; SOFT-DAG: call _Q_qtoull
+; SOFT-DAG: call _Q_utoq
+; SOFT-DAG: call _Q_qtou
+
+define void @test_utoq_qtou(i64 %a, i32 %b, i64* %ptr0, fp128* %ptr1) {
+entry:
+ %0 = uitofp i64 %a to fp128
+ store fp128 %0, fp128* %ptr1, align 8
+ %1 = fptoui fp128 %0 to i64
+ store i64 %1, i64* %ptr0, align 8
+ %2 = uitofp i32 %b to fp128
+ store fp128 %2, fp128* %ptr1, align 8
+ %3 = fptoui fp128 %2 to i32
+ %4 = bitcast i64* %ptr0 to i32*
+ store i32 %3, i32* %4, align 8
+ ret void
+}
diff --git a/test/CodeGen/SPARC/globals.ll b/test/CodeGen/SPARC/globals.ll
index 0e0dfc8..7e3effe 100644
--- a/test/CodeGen/SPARC/globals.ll
+++ b/test/CodeGen/SPARC/globals.ll
@@ -41,8 +41,9 @@ define zeroext i8 @loadG() {
; v8pic32: sethi %hi(G), %[[R1:[gilo][0-7]]]
; v8pic32: add %[[R1]], %lo(G), %[[Goffs:[gilo][0-7]]]
; v8pic32: ld [%[[GOT:[gilo][0-7]]]+%[[Goffs]]], %[[Gaddr:[gilo][0-7]]]
-; v8pic32: jmp %o7+8
-; v8pic32: ldub [%[[Gaddr]]], %o0
+; v8pic32: ldub [%[[Gaddr]]], %i0
+; v8pic32: jmp %i7+8
+; v8pic32: restore
; v9pic32: loadG
@@ -50,6 +51,7 @@ define zeroext i8 @loadG() {
; v9pic32: sethi %hi(G), %[[R1:[gilo][0-7]]]
; v9pic32: add %[[R1]], %lo(G), %[[Goffs:[gilo][0-7]]]
; v9pic32: ldx [%[[GOT:[gilo][0-7]]]+%[[Goffs]]], %[[Gaddr:[gilo][0-7]]]
-; v9pic32: jmp %o7+8
-; v9pic32: ldub [%[[Gaddr]]], %o0
+; v9pic32: ldub [%[[Gaddr]]], %i0
+; v9pic32: jmp %i7+8
+; v9pic32: restore
diff --git a/test/CodeGen/SPARC/lit.local.cfg b/test/CodeGen/SPARC/lit.local.cfg
index 6f30a87..4d344fa 100644
--- a/test/CodeGen/SPARC/lit.local.cfg
+++ b/test/CodeGen/SPARC/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.test']
-
targets = set(config.root.targets_to_build.split())
if not 'Sparc' in targets:
config.unsupported = True
diff --git a/test/CodeGen/SPARC/rem.ll b/test/CodeGen/SPARC/rem.ll
new file mode 100644
index 0000000..abef1fc
--- /dev/null
+++ b/test/CodeGen/SPARC/rem.ll
@@ -0,0 +1,39 @@
+; RUN: llc < %s -march=sparcv9 | FileCheck %s
+
+; CHECK-LABEL: test1:
+; CHECK: sdivx %o0, %o1, %o2
+; CHECK-NEXT: mulx %o2, %o1, %o1
+; CHECK-NEXT: jmp %o7+8
+; CHECK-NEXT: sub %o0, %o1, %o0
+
+define i64 @test1(i64 %X, i64 %Y) {
+ %tmp1 = srem i64 %X, %Y
+ ret i64 %tmp1
+}
+
+; CHECK-LABEL: test2:
+; CHECK: udivx %o0, %o1, %o2
+; CHECK-NEXT: mulx %o2, %o1, %o1
+; CHECK-NEXT: jmp %o7+8
+; CHECK-NEXT: sub %o0, %o1, %o0
+
+define i64 @test2(i64 %X, i64 %Y) {
+ %tmp1 = urem i64 %X, %Y
+ ret i64 %tmp1
+}
+
+; PR18150
+; CHECK-LABEL: test3
+; CHECK: sethi 2545, [[R0:%[gilo][0-7]]]
+; CHECK: or [[R0]], 379, [[R1:%[gilo][0-7]]]
+; CHECK: mulx %o0, [[R1]], [[R2:%[gilo][0-7]]]
+; CHECK: udivx [[R2]], 1021, [[R3:%[gilo][0-7]]]
+; CHECK: mulx [[R3]], 1021, [[R4:%[gilo][0-7]]]
+; CHECK: sub [[R2]], [[R4]], %o0
+
+define i64 @test3(i64 %b) {
+entry:
+ %mul = mul i64 %b, 2606459
+ %rem = urem i64 %mul, 1021
+ ret i64 %rem
+}
diff --git a/test/CodeGen/SPARC/setjmp.ll b/test/CodeGen/SPARC/setjmp.ll
new file mode 100644
index 0000000..39984fb
--- /dev/null
+++ b/test/CodeGen/SPARC/setjmp.ll
@@ -0,0 +1,72 @@
+;RUN: llc -march=sparc < %s | FileCheck %s
+;RUN: llc -march=sparcv9 < %s | FileCheck %s --check-prefix=V9
+
+
+%0 = type { [32 x i32] }
+%struct.jmpbuf_env = type { i32, i32, [1 x %struct.__jmp_buf_tag], i32 }
+%struct.__jmp_buf_tag = type { [3 x i32], i32, %0 }
+
+@jenv = common unnamed_addr global %struct.jmpbuf_env* null
+@.cst = linker_private unnamed_addr constant [30 x i8] c"in bar with jmp_buf's id: %d\0A\00", align 64
+
+; CHECK-LABEL: foo
+; CHECK-DAG: st {{.+}}, [%i0]
+; CHECK-DAG: st {{.+}}, [%i0+4]
+; CHECK: call _setjmp
+; CHECK: ld [%fp+{{.+}}], %[[R:[gilo][0-7]]]
+; CHECK: st %o0, [%[[R]]+{{.+}}]
+
+; V9-LABEL: foo
+; V9-DAG: st {{.+}}, [%i0]
+; V9-DAG: st {{.+}}, [%i0+4]
+; V9: call _setjmp
+; V9: ldx [%fp+{{.+}}], %[[R:[gilo][0-7]]]
+; V9: st %o0, [%[[R]]+{{.+}}]
+
+; Function Attrs: nounwind
+define i32 @foo(%struct.jmpbuf_env* byval %inbuf) #0 {
+entry:
+ %0 = getelementptr inbounds %struct.jmpbuf_env* %inbuf, i32 0, i32 0
+ store i32 0, i32* %0, align 4, !tbaa !4
+ %1 = getelementptr inbounds %struct.jmpbuf_env* %inbuf, i32 0, i32 1
+ store i32 1, i32* %1, align 4, !tbaa !4
+ %2 = getelementptr inbounds %struct.jmpbuf_env* %inbuf, i32 0, i32 2, i32 0
+ %3 = call i32 @_setjmp(%struct.__jmp_buf_tag* %2) #2
+ %4 = getelementptr inbounds %struct.jmpbuf_env* %inbuf, i32 0, i32 3
+ store i32 %3, i32* %4, align 4, !tbaa !4
+ store %struct.jmpbuf_env* %inbuf, %struct.jmpbuf_env** @jenv, align 4, !tbaa !3
+ %5 = load i32* %1, align 4, !tbaa !4
+ %6 = icmp eq i32 %5, 1
+ %7 = icmp eq i32 %3, 0
+ %or.cond = and i1 %6, %7
+ br i1 %or.cond, label %"4.i", label %bar.exit
+
+"4.i": ; preds = %entry
+ call void @longjmp(%struct.__jmp_buf_tag* %2, i32 0) #1
+ unreachable
+
+bar.exit: ; preds = %entry
+ %8 = load i32* %0, align 4, !tbaa !4
+ %9 = call i32 (i8*, ...)* @printf(i8* noalias getelementptr inbounds ([30 x i8]* @.cst, i32 0, i32 0), i32 %8) #0
+ ret i32 0
+}
+
+; Function Attrs: nounwind returns_twice
+declare i32 @_setjmp(%struct.__jmp_buf_tag*) #2
+
+; Function Attrs: noreturn nounwind
+declare void @longjmp(%struct.__jmp_buf_tag*, i32) #1
+
+; Function Attrs: nounwind
+declare i32 @printf(i8* nocapture, ...) #0
+
+
+attributes #0 = { nounwind }
+attributes #1 = { noreturn nounwind }
+attributes #2 = { nounwind returns_twice }
+
+!0 = metadata !{metadata !"alias set 6: struct.jmpbuf_env*", metadata !1}
+!1 = metadata !{metadata !1}
+!2 = metadata !{metadata !"alias set 3: int", metadata !1}
+!3 = metadata !{metadata !0, metadata !0, i64 0}
+!4 = metadata !{metadata !2, metadata !2, i64 0}
diff --git a/test/CodeGen/SPARC/tls.ll b/test/CodeGen/SPARC/tls.ll
new file mode 100644
index 0000000..660ddff
--- /dev/null
+++ b/test/CodeGen/SPARC/tls.ll
@@ -0,0 +1,73 @@
+; RUN: llc <%s -march=sparc -relocation-model=static | FileCheck %s --check-prefix=v8abs
+; RUN: llc <%s -march=sparcv9 -relocation-model=static | FileCheck %s --check-prefix=v9abs
+; RUN: llc <%s -march=sparc -relocation-model=pic | FileCheck %s --check-prefix=pic
+; RUN: llc <%s -march=sparcv9 -relocation-model=pic | FileCheck %s --check-prefix=pic
+
+
+@local_symbol = internal thread_local global i32 0
+@extern_symbol = external thread_local global i32
+
+; v8abs-LABEL: test_tls_local
+; v8abs: sethi %tle_hix22(local_symbol), [[R0:%[goli][0-7]]]
+; v8abs: xor [[R0]], %tle_lox10(local_symbol), [[R1:%[goli][0-7]]]
+; v8abs: ld [%g7+[[R1]]]
+
+; v9abs-LABEL: test_tls_local
+; v9abs: sethi %tle_hix22(local_symbol), [[R0:%[goli][0-7]]]
+; v9abs: xor [[R0]], %tle_lox10(local_symbol), [[R1:%[goli][0-7]]]
+; v9abs: ld [%g7+[[R1]]]
+
+; pic-LABEL: test_tls_local
+; pic: or {{%[goli][0-7]}}, %lo(_GLOBAL_OFFSET_TABLE_+{{.+}}), [[PC:%[goli][0-7]]]
+; pic: add [[PC]], %o7, [[GOTBASE:%[goli][0-7]]]
+; pic-DAG: sethi %tldm_hi22(local_symbol), [[R0:%[goli][0-7]]]
+; pic-DAG: add [[R0]], %tldm_lo10(local_symbol), [[R1:%[goli][0-7]]]
+; pic-DAG: add [[GOTBASE]], [[R1]], %o0, %tldm_add(local_symbol)
+; pic-DAG: call __tls_get_addr, %tldm_call(local_symbol)
+; pic-DAG: sethi %tldo_hix22(local_symbol), [[R2:%[goli][0-7]]]
+; pic-DAG: xor [[R2]], %tldo_lox10(local_symbol), [[R3:%[goli][0-7]]]
+; pic: add %o0, [[R3]], {{.+}}, %tldo_add(local_symbol)
+
+define i32 @test_tls_local() {
+entry:
+ %0 = load i32* @local_symbol, align 4
+ %1 = add i32 %0, 1
+ store i32 %1, i32* @local_symbol, align 4
+ ret i32 %1
+}
+
+
+; v8abs-LABEL: test_tls_extern
+; v8abs: or {{%[goli][0-7]}}, %lo(_GLOBAL_OFFSET_TABLE_+{{.+}}), [[PC:%[goli][0-7]]]
+; v8abs: add [[PC]], %o7, %[[GOTBASE:[goli][0-7]]]
+; v8abs: sethi %tie_hi22(extern_symbol), [[R1:%[goli][0-7]]]
+; v8abs: add [[R1]], %tie_lo10(extern_symbol), %[[R2:[goli][0-7]]]
+; v8abs: ld [%[[GOTBASE]]+%[[R2]]], [[R3:%[goli][0-7]]], %tie_ld(extern_symbol)
+; v8abs: add %g7, [[R3]], %[[R4:[goli][0-7]]], %tie_add(extern_symbol)
+; v8abs: ld [%[[R4]]]
+
+; v9abs-LABEL: test_tls_extern
+; v9abs: or {{%[goli][0-7]}}, %lo(_GLOBAL_OFFSET_TABLE_+{{.+}}), [[PC:%[goli][0-7]]]
+; v9abs: add [[PC]], %o7, %[[GOTBASE:[goli][0-7]]]
+; v9abs: sethi %tie_hi22(extern_symbol), [[R1:%[goli][0-7]]]
+; v9abs: add [[R1]], %tie_lo10(extern_symbol), %[[R2:[goli][0-7]]]
+; v9abs: ldx [%[[GOTBASE]]+%[[R2]]], [[R3:%[goli][0-7]]], %tie_ldx(extern_symbol)
+; v9abs: add %g7, [[R3]], %[[R4:[goli][0-7]]], %tie_add(extern_symbol)
+; v9abs: ld [%[[R4]]]
+
+; pic-LABEL: test_tls_extern
+; pic: or {{%[goli][0-7]}}, %lo(_GLOBAL_OFFSET_TABLE_+{{.+}}), [[PC:%[goli][0-7]]]
+; pic: add [[PC]], %o7, [[GOTBASE:%[goli][0-7]]]
+; pic: sethi %tgd_hi22(extern_symbol), [[R0:%[goli][0-7]]]
+; pic: add [[R0]], %tgd_lo10(extern_symbol), [[R1:%[goli][0-7]]]
+; pic: add [[GOTBASE]], [[R1]], %o0, %tgd_add(extern_symbol)
+; pic: call __tls_get_addr, %tgd_call(extern_symbol)
+; pic-NEXT: nop
+
+define i32 @test_tls_extern() {
+entry:
+ %0 = load i32* @extern_symbol, align 4
+ %1 = add i32 %0, 1
+ store i32 %1, i32* @extern_symbol, align 4
+ ret i32 %1
+}
diff --git a/test/CodeGen/SystemZ/Large/branch-range-09.py b/test/CodeGen/SystemZ/Large/branch-range-09.py
new file mode 100644
index 0000000..b3fd813
--- /dev/null
+++ b/test/CodeGen/SystemZ/Large/branch-range-09.py
@@ -0,0 +1,107 @@
+# Test 32-bit COMPARE LOGICAL AND BRANCH in cases where the sheer number of
+# instructions causes some branches to be out of range.
+# RUN: python %s | llc -mtriple=s390x-linux-gnu | FileCheck %s
+
+# Construct:
+#
+# before0:
+# conditional branch to after0
+# ...
+# beforeN:
+# conditional branch to after0
+# main:
+# 0xffcc bytes, from MVIY instructions
+# conditional branch to main
+# after0:
+# ...
+# conditional branch to main
+# afterN:
+#
+# Each conditional branch sequence occupies 12 bytes if it uses a short
+# branch and 14 if it uses a long one. The ones before "main:" have to
+# take the branch length into account, which is 6 for short branches,
+# so the final (0x34 - 6) / 12 == 3 blocks can use short branches.
+# The ones after "main:" do not, so the first 0x34 / 12 == 4 blocks
+# can use short branches.
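+# (The 0x34 of slack is what remains of the 0x10000 bytes, presumably the
+# 64KiB reach of the short branches, once the 0xffcc-byte main block is
+# accounted for: 0x10000 - 0xffcc == 0x34.)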
+#
+# CHECK: lb [[REG:%r[0-5]]], 0(%r3)
+# CHECK: clr %r4, [[REG]]
+# CHECK: jgl [[LABEL:\.L[^ ]*]]
+# CHECK: lb [[REG:%r[0-5]]], 1(%r3)
+# CHECK: clr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 2(%r3)
+# CHECK: clr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 3(%r3)
+# CHECK: clr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 4(%r3)
+# CHECK: clr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 5(%r3)
+# CHECK: clrjl %r4, [[REG]], [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 6(%r3)
+# CHECK: clrjl %r4, [[REG]], [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 7(%r3)
+# CHECK: clrjl %r4, [[REG]], [[LABEL]]
+# ...main goes here...
+# CHECK: lb [[REG:%r[0-5]]], 25(%r3)
+# CHECK: clrjl %r4, [[REG]], [[LABEL:\.L[^ ]*]]
+# CHECK: lb [[REG:%r[0-5]]], 26(%r3)
+# CHECK: clrjl %r4, [[REG]], [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 27(%r3)
+# CHECK: clrjl %r4, [[REG]], [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 28(%r3)
+# CHECK: clrjl %r4, [[REG]], [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 29(%r3)
+# CHECK: clr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 30(%r3)
+# CHECK: clr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 31(%r3)
+# CHECK: clr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lb [[REG:%r[0-5]]], 32(%r3)
+# CHECK: clr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+
+branch_blocks = 8
+main_size = 0xffcc
+
+print 'define void @f1(i8 *%base, i8 *%stop, i32 %limit) {'
+print 'entry:'
+print ' br label %before0'
+print ''
+
+for i in xrange(branch_blocks):
+ next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
+ print 'before%d:' % i
+ print ' %%bstop%d = getelementptr i8 *%%stop, i64 %d' % (i, i)
+ print ' %%bcur%d = load volatile i8 *%%bstop%d' % (i, i)
+ print ' %%bext%d = sext i8 %%bcur%d to i32' % (i, i)
+ print ' %%btest%d = icmp ult i32 %%limit, %%bext%d' % (i, i)
+ print ' br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
+ print ''
+
+print '%s:' % next
+a, b = 1, 1
+for i in xrange(0, main_size, 6):
+ a, b = b, a + b
+ offset = 4096 + b % 500000
+ value = a % 256
+ print ' %%ptr%d = getelementptr i8 *%%base, i64 %d' % (i, offset)
+ print ' store volatile i8 %d, i8 *%%ptr%d' % (value, i)
+
+for i in xrange(branch_blocks):
+ print ' %%astop%d = getelementptr i8 *%%stop, i64 %d' % (i, i + 25)
+ print ' %%acur%d = load volatile i8 *%%astop%d' % (i, i)
+ print ' %%aext%d = sext i8 %%acur%d to i32' % (i, i)
+ print ' %%atest%d = icmp ult i32 %%limit, %%aext%d' % (i, i)
+ print ' br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
+ print ''
+ print 'after%d:' % i
+
+print ' ret void'
+print '}'
diff --git a/test/CodeGen/SystemZ/Large/branch-range-10.py b/test/CodeGen/SystemZ/Large/branch-range-10.py
new file mode 100644
index 0000000..3aeea3e
--- /dev/null
+++ b/test/CodeGen/SystemZ/Large/branch-range-10.py
@@ -0,0 +1,111 @@
+# Test 64-bit COMPARE LOGICAL AND BRANCH in cases where the sheer number of
+# instructions causes some branches to be out of range.
+# RUN: python %s | llc -mtriple=s390x-linux-gnu | FileCheck %s
+
+# Construct:
+#
+# before0:
+# conditional branch to after0
+# ...
+# beforeN:
+# conditional branch to after0
+# main:
+# 0xffcc bytes, from MVIY instructions
+# conditional branch to main
+# after0:
+# ...
+# conditional branch to main
+# afterN:
+#
+# Each conditional branch sequence occupies 12 bytes if it uses a short
+# branch and 16 if it uses a long one. The ones before "main:" have to
+# take the branch length into account, which is 6 for short branches,
+# so the final (0x34 - 6) / 12 == 3 blocks can use short branches.
+# The ones after "main:" do not, so the first 0x34 / 12 == 4 blocks
+# can use short branches. The conservative algorithm we use makes
+# one of the forward branches unnecessarily long, as noted in the
+# check output below.
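+# (As in branch-range-09.py, 0x34 == 0x10000 - 0xffcc, the slack left over
+# after the main block.)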
+#
+# CHECK: lgb [[REG:%r[0-5]]], 0(%r3)
+# CHECK: clgr %r4, [[REG]]
+# CHECK: jgl [[LABEL:\.L[^ ]*]]
+# CHECK: lgb [[REG:%r[0-5]]], 1(%r3)
+# CHECK: clgr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lgb [[REG:%r[0-5]]], 2(%r3)
+# CHECK: clgr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lgb [[REG:%r[0-5]]], 3(%r3)
+# CHECK: clgr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lgb [[REG:%r[0-5]]], 4(%r3)
+# CHECK: clgr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# ...as mentioned above, the next one could be a CLGRJL instead...
+# CHECK: lgb [[REG:%r[0-5]]], 5(%r3)
+# CHECK: clgr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lgb [[REG:%r[0-5]]], 6(%r3)
+# CHECK: clgrjl %r4, [[REG]], [[LABEL]]
+# CHECK: lgb [[REG:%r[0-5]]], 7(%r3)
+# CHECK: clgrjl %r4, [[REG]], [[LABEL]]
+# ...main goes here...
+# CHECK: lgb [[REG:%r[0-5]]], 25(%r3)
+# CHECK: clgrjl %r4, [[REG]], [[LABEL:\.L[^ ]*]]
+# CHECK: lgb [[REG:%r[0-5]]], 26(%r3)
+# CHECK: clgrjl %r4, [[REG]], [[LABEL]]
+# CHECK: lgb [[REG:%r[0-5]]], 27(%r3)
+# CHECK: clgrjl %r4, [[REG]], [[LABEL]]
+# CHECK: lgb [[REG:%r[0-5]]], 28(%r3)
+# CHECK: clgrjl %r4, [[REG]], [[LABEL]]
+# CHECK: lgb [[REG:%r[0-5]]], 29(%r3)
+# CHECK: clgr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lgb [[REG:%r[0-5]]], 30(%r3)
+# CHECK: clgr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lgb [[REG:%r[0-5]]], 31(%r3)
+# CHECK: clgr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+# CHECK: lgb [[REG:%r[0-5]]], 32(%r3)
+# CHECK: clgr %r4, [[REG]]
+# CHECK: jgl [[LABEL]]
+
+branch_blocks = 8
+main_size = 0xffcc
+
+print 'define void @f1(i8 *%base, i8 *%stop, i64 %limit) {'
+print 'entry:'
+print ' br label %before0'
+print ''
+
+for i in xrange(branch_blocks):
+ next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
+ print 'before%d:' % i
+ print ' %%bstop%d = getelementptr i8 *%%stop, i64 %d' % (i, i)
+ print ' %%bcur%d = load volatile i8 *%%bstop%d' % (i, i)
+ print ' %%bext%d = sext i8 %%bcur%d to i64' % (i, i)
+ print ' %%btest%d = icmp ult i64 %%limit, %%bext%d' % (i, i)
+ print ' br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
+ print ''
+
+print '%s:' % next
+a, b = 1, 1
+for i in xrange(0, main_size, 6):
+ a, b = b, a + b
+ offset = 4096 + b % 500000
+ value = a % 256
+ print ' %%ptr%d = getelementptr i8 *%%base, i64 %d' % (i, offset)
+ print ' store volatile i8 %d, i8 *%%ptr%d' % (value, i)
+
+for i in xrange(branch_blocks):
+ print ' %%astop%d = getelementptr i8 *%%stop, i64 %d' % (i, i + 25)
+ print ' %%acur%d = load volatile i8 *%%astop%d' % (i, i)
+ print ' %%aext%d = sext i8 %%acur%d to i64' % (i, i)
+ print ' %%atest%d = icmp ult i64 %%limit, %%aext%d' % (i, i)
+ print ' br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
+ print ''
+ print 'after%d:' % i
+
+print ' ret void'
+print '}'
diff --git a/test/CodeGen/SystemZ/Large/branch-range-11.py b/test/CodeGen/SystemZ/Large/branch-range-11.py
new file mode 100644
index 0000000..034902c
--- /dev/null
+++ b/test/CodeGen/SystemZ/Large/branch-range-11.py
@@ -0,0 +1,127 @@
+# Test 32-bit COMPARE LOGICAL IMMEDIATE AND BRANCH in cases where the sheer
+# number of instructions causes some branches to be out of range.
+# RUN: python %s | llc -mtriple=s390x-linux-gnu | FileCheck %s
+
+# Construct:
+#
+# before0:
+# conditional branch to after0
+# ...
+# beforeN:
+# conditional branch to after0
+# main:
+# 0xffc6 bytes, from MVIY instructions
+# conditional branch to main
+# after0:
+# ...
+# conditional branch to main
+# afterN:
+#
+# Each conditional branch sequence occupies 14 bytes if it uses a short
+# branch and 20 if it uses a long one. The ones before "main:" have to
+# take the branch length into account, which is 6 for short branches,
+# so the final (0x3a - 6) / 14 == 3 blocks can use short branches.
+# The ones after "main:" do not, so the first 0x3a / 14 == 4 blocks
+# can use short branches. The conservative algorithm we use makes
+# one of the forward branches unnecessarily long, as noted in the
+# check output below.
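+# (Here 0x3a == 0x10000 - 0xffc6, the slack left over after the main block.)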
+#
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clfi [[REG]], 50
+# CHECK: jgl [[LABEL:\.L[^ ]*]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clfi [[REG]], 51
+# CHECK: jgl [[LABEL]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clfi [[REG]], 52
+# CHECK: jgl [[LABEL]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clfi [[REG]], 53
+# CHECK: jgl [[LABEL]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clfi [[REG]], 54
+# CHECK: jgl [[LABEL]]
+# ...as mentioned above, the next one could be a CLIJL instead...
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clfi [[REG]], 55
+# CHECK: jgl [[LABEL]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clijl [[REG]], 56, [[LABEL]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clijl [[REG]], 57, [[LABEL]]
+# ...main goes here...
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clijl [[REG]], 100, [[LABEL:\.L[^ ]*]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clijl [[REG]], 101, [[LABEL]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clijl [[REG]], 102, [[LABEL]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clijl [[REG]], 103, [[LABEL]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clfi [[REG]], 104
+# CHECK: jgl [[LABEL]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clfi [[REG]], 105
+# CHECK: jgl [[LABEL]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clfi [[REG]], 106
+# CHECK: jgl [[LABEL]]
+# CHECK: l [[REG:%r[0-5]]], 0(%r3)
+# CHECK: s [[REG]], 0(%r4)
+# CHECK: clfi [[REG]], 107
+# CHECK: jgl [[LABEL]]
+
+branch_blocks = 8
+main_size = 0xffc6
+
+print 'define void @f1(i8 *%base, i32 *%stopa, i32 *%stopb) {'
+print 'entry:'
+print ' br label %before0'
+print ''
+
+for i in xrange(branch_blocks):
+ next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
+ print 'before%d:' % i
+ print ' %%bcur%da = load volatile i32 *%%stopa' % i
+ print ' %%bcur%db = load volatile i32 *%%stopb' % i
+ print ' %%bsub%d = sub i32 %%bcur%da, %%bcur%db' % (i, i, i)
+ print ' %%btest%d = icmp ult i32 %%bsub%d, %d' % (i, i, i + 50)
+ print ' br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
+ print ''
+
+print '%s:' % next
+a, b = 1, 1
+for i in xrange(0, main_size, 6):
+ a, b = b, a + b
+ offset = 4096 + b % 500000
+ value = a % 256
+ print ' %%ptr%d = getelementptr i8 *%%base, i64 %d' % (i, offset)
+ print ' store volatile i8 %d, i8 *%%ptr%d' % (value, i)
+
+for i in xrange(branch_blocks):
+ print ' %%acur%da = load volatile i32 *%%stopa' % i
+ print ' %%acur%db = load volatile i32 *%%stopb' % i
+ print ' %%asub%d = sub i32 %%acur%da, %%acur%db' % (i, i, i)
+ print ' %%atest%d = icmp ult i32 %%asub%d, %d' % (i, i, i + 100)
+ print ' br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
+ print ''
+ print 'after%d:' % i
+
+print ' ret void'
+print '}'
diff --git a/test/CodeGen/SystemZ/Large/branch-range-12.py b/test/CodeGen/SystemZ/Large/branch-range-12.py
new file mode 100644
index 0000000..007d477
--- /dev/null
+++ b/test/CodeGen/SystemZ/Large/branch-range-12.py
@@ -0,0 +1,127 @@
+# Test 64-bit COMPARE LOGICAL IMMEDIATE AND BRANCH in cases where the sheer
+# number of instructions causes some branches to be out of range.
+# RUN: python %s | llc -mtriple=s390x-linux-gnu | FileCheck %s
+
+# Construct:
+#
+# before0:
+# conditional branch to after0
+# ...
+# beforeN:
+# conditional branch to after0
+# main:
+# 0xffb4 bytes, from MVIY instructions
+# conditional branch to main
+# after0:
+# ...
+# conditional branch to main
+# afterN:
+#
+# Each conditional branch sequence occupies 18 bytes if it uses a short
+# branch and 24 if it uses a long one. The ones before "main:" have to
+# take the branch length into account, which is 6 for short branches,
+# so the final (0x4c - 6) / 18 == 3 blocks can use short branches.
+# The ones after "main:" do not, so the first 0x4c / 18 == 4 blocks
+# can use short branches. The conservative algorithm we use makes
+# one of the forward branches unnecessarily long, as noted in the
+# check output below.
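+# (Here 0x4c == 0x10000 - 0xffb4, the slack left over after the main block.)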
+#
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgfi [[REG]], 50
+# CHECK: jgl [[LABEL:\.L[^ ]*]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgfi [[REG]], 51
+# CHECK: jgl [[LABEL]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgfi [[REG]], 52
+# CHECK: jgl [[LABEL]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgfi [[REG]], 53
+# CHECK: jgl [[LABEL]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgfi [[REG]], 54
+# CHECK: jgl [[LABEL]]
+# ...as mentioned above, the next one could be a CLGIJL instead...
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgfi [[REG]], 55
+# CHECK: jgl [[LABEL]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgijl [[REG]], 56, [[LABEL]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgijl [[REG]], 57, [[LABEL]]
+# ...main goes here...
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgijl [[REG]], 100, [[LABEL:\.L[^ ]*]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgijl [[REG]], 101, [[LABEL]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgijl [[REG]], 102, [[LABEL]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgijl [[REG]], 103, [[LABEL]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgfi [[REG]], 104
+# CHECK: jgl [[LABEL]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgfi [[REG]], 105
+# CHECK: jgl [[LABEL]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgfi [[REG]], 106
+# CHECK: jgl [[LABEL]]
+# CHECK: lg [[REG:%r[0-5]]], 0(%r3)
+# CHECK: sg [[REG]], 0(%r4)
+# CHECK: clgfi [[REG]], 107
+# CHECK: jgl [[LABEL]]
+
+branch_blocks = 8
+main_size = 0xffb4
+
+print 'define void @f1(i8 *%base, i64 *%stopa, i64 *%stopb) {'
+print 'entry:'
+print ' br label %before0'
+print ''
+
+for i in xrange(branch_blocks):
+ next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
+ print 'before%d:' % i
+ print ' %%bcur%da = load volatile i64 *%%stopa' % i
+ print ' %%bcur%db = load volatile i64 *%%stopb' % i
+ print ' %%bsub%d = sub i64 %%bcur%da, %%bcur%db' % (i, i, i)
+ print ' %%btest%d = icmp ult i64 %%bsub%d, %d' % (i, i, i + 50)
+ print ' br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
+ print ''
+
+print '%s:' % next
+a, b = 1, 1
+for i in xrange(0, main_size, 6):
+ a, b = b, a + b
+ offset = 4096 + b % 500000
+ value = a % 256
+ print ' %%ptr%d = getelementptr i8 *%%base, i64 %d' % (i, offset)
+ print ' store volatile i8 %d, i8 *%%ptr%d' % (value, i)
+
+for i in xrange(branch_blocks):
+ print ' %%acur%da = load volatile i64 *%%stopa' % i
+ print ' %%acur%db = load volatile i64 *%%stopb' % i
+ print ' %%asub%d = sub i64 %%acur%da, %%acur%db' % (i, i, i)
+ print ' %%atest%d = icmp ult i64 %%asub%d, %d' % (i, i, i + 100)
+ print ' br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
+ print ''
+ print 'after%d:' % i
+
+print ' ret void'
+print '}'
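+
+# For reference, the first "before" block printed by the loop above comes
+# out roughly as follows (a sketch of the generated IR, not checked output):
+#
+# before0:
+#  %bcur0a = load volatile i64 *%stopa
+#  %bcur0b = load volatile i64 *%stopb
+#  %bsub0 = sub i64 %bcur0a, %bcur0b
+#  %btest0 = icmp ult i64 %bsub0, 50
+#  br i1 %btest0, label %after0, label %before1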
diff --git a/test/CodeGen/SystemZ/alias-01.ll b/test/CodeGen/SystemZ/alias-01.ll
new file mode 100644
index 0000000..8839aad
--- /dev/null
+++ b/test/CodeGen/SystemZ/alias-01.ll
@@ -0,0 +1,19 @@
+; Test that codegen makes use of the TBAA alias information below.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Check that there are no spills.
+define void @f1(<16 x i32> *%src1, <16 x float> *%dest) {
+; CHECK-LABEL: f1:
+; CHECK-NOT: %r15
+; CHECK: br %r14
+ %val = load <16 x i32> *%src1, !tbaa !1
+ %add = add <16 x i32> %val, %val
+ %res = bitcast <16 x i32> %add to <16 x float>
+ store <16 x float> %res, <16 x float> *%dest, !tbaa !2
+ ret void
+}
+
+!0 = metadata !{ metadata !"root" }
+!1 = metadata !{ metadata !"set1", metadata !0 }
+!2 = metadata !{ metadata !"set2", metadata !0 }
diff --git a/test/CodeGen/SystemZ/alloca-02.ll b/test/CodeGen/SystemZ/alloca-02.ll
index b6ed7f7..b5787b1 100644
--- a/test/CodeGen/SystemZ/alloca-02.ll
+++ b/test/CodeGen/SystemZ/alloca-02.ll
@@ -21,18 +21,21 @@ define i64 @f1(i64 %length, i64 %index) {
;
; CHECK-C-LABEL: f1:
; CHECK-C: lgr %r15, [[ADDR:%r[1-5]]]
-; CHECK-C: la [[TMP:%r[1-5]]], 160(%r3,[[ADDR]])
-; CHECK-C: mvi 0([[TMP]]), 2
+; CHECK-C-DAG: la %r2, 160([[ADDR]])
+; CHECK-C-DAG: lhi [[TMP:%r[0-5]]], 2
+; CHECK-C: stc [[TMP]], 0({{%r3,%r2|%r2,%r3}})
;
; CHECK-D-LABEL: f1:
; CHECK-D: lgr %r15, [[ADDR:%r[1-5]]]
-; CHECK-D: la [[TMP:%r[1-5]]], 160(%r3,[[ADDR]])
-; CHECK-D: mvi 4095([[TMP]]), 3
+; CHECK-D-DAG: la %r2, 160([[ADDR]])
+; CHECK-D-DAG: lhi [[TMP:%r[0-5]]], 3
+; CHECK-D: stc [[TMP]], 4095({{%r3,%r2|%r2,%r3}})
;
; CHECK-E-LABEL: f1:
; CHECK-E: lgr %r15, [[ADDR:%r[1-5]]]
-; CHECK-E: la [[TMP:%r[1-5]]], 160(%r3,[[ADDR]])
-; CHECK-E: mviy 4096([[TMP]]), 4
+; CHECK-E-DAG: la %r2, 160([[ADDR]])
+; CHECK-E-DAG: lhi [[TMP:%r[0-5]]], 4
+; CHECK-E: stcy [[TMP]], 4096({{%r3,%r2|%r2,%r3}})
%a = alloca i8, i64 %length
store volatile i8 0, i8 *%a
%b = getelementptr i8 *%a, i64 4095
diff --git a/test/CodeGen/SystemZ/and-08.ll b/test/CodeGen/SystemZ/and-08.ll
new file mode 100644
index 0000000..7ded115
--- /dev/null
+++ b/test/CodeGen/SystemZ/and-08.ll
@@ -0,0 +1,378 @@
+; Test memory-to-memory ANDs.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
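+;
+; NC is the storage-to-storage AND: roughly, "nc dest(len,base1), src(base2)"
+; ANDs len bytes at the source address into the bytes at the destination
+; address, which is why the tests below only expect it when the two memory
+; ranges are known not to overlap.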
+
+@g1src = global i8 1
+@g1dst = global i8 1
+@g2src = global i16 2
+@g2dst = global i16 2
+
+; Test the simple i8 case.
+define void @f1(i8 *%ptr1) {
+; CHECK-LABEL: f1:
+; CHECK: nc 1(1,%r2), 0(%r2)
+; CHECK: br %r14
+ %ptr2 = getelementptr i8 *%ptr1, i64 1
+ %val = load i8 *%ptr1
+ %old = load i8 *%ptr2
+ %and = and i8 %val, %old
+ store i8 %and, i8 *%ptr2
+ ret void
+}
+
+; ...and again in reverse.
+define void @f2(i8 *%ptr1) {
+; CHECK-LABEL: f2:
+; CHECK: nc 1(1,%r2), 0(%r2)
+; CHECK: br %r14
+ %ptr2 = getelementptr i8 *%ptr1, i64 1
+ %val = load i8 *%ptr1
+ %old = load i8 *%ptr2
+ %and = and i8 %old, %val
+ store i8 %and, i8 *%ptr2
+ ret void
+}
+
+; Test i8 cases where one value is zero-extended to 32 bits and the other
+; sign-extended.
+define void @f3(i8 *%ptr1) {
+; CHECK-LABEL: f3:
+; CHECK: nc 1(1,%r2), 0(%r2)
+; CHECK: br %r14
+ %ptr2 = getelementptr i8 *%ptr1, i64 1
+ %val = load i8 *%ptr1
+ %extval = zext i8 %val to i32
+ %old = load i8 *%ptr2
+ %extold = sext i8 %old to i32
+ %and = and i32 %extval, %extold
+ %trunc = trunc i32 %and to i8
+ store i8 %trunc, i8 *%ptr2
+ ret void
+}
+
+; ...and again with the extension types reversed.
+define void @f4(i8 *%ptr1) {
+; CHECK-LABEL: f4:
+; CHECK: nc 1(1,%r2), 0(%r2)
+; CHECK: br %r14
+ %ptr2 = getelementptr i8 *%ptr1, i64 1
+ %val = load i8 *%ptr1
+ %extval = sext i8 %val to i32
+ %old = load i8 *%ptr2
+ %extold = zext i8 %old to i32
+ %and = and i32 %extval, %extold
+ %trunc = trunc i32 %and to i8
+ store i8 %trunc, i8 *%ptr2
+ ret void
+}
+
+; ...and again with two sign extensions.
+define void @f5(i8 *%ptr1) {
+; CHECK-LABEL: f5:
+; CHECK: nc 1(1,%r2), 0(%r2)
+; CHECK: br %r14
+ %ptr2 = getelementptr i8 *%ptr1, i64 1
+ %val = load i8 *%ptr1
+ %extval = sext i8 %val to i32
+ %old = load i8 *%ptr2
+ %extold = sext i8 %old to i32
+ %and = and i32 %extval, %extold
+ %trunc = trunc i32 %and to i8
+ store i8 %trunc, i8 *%ptr2
+ ret void
+}
+
+; ...and again with two zero extensions.
+define void @f6(i8 *%ptr1) {
+; CHECK-LABEL: f6:
+; CHECK: nc 1(1,%r2), 0(%r2)
+; CHECK: br %r14
+ %ptr2 = getelementptr i8 *%ptr1, i64 1
+ %val = load i8 *%ptr1
+ %extval = zext i8 %val to i32
+ %old = load i8 *%ptr2
+ %extold = zext i8 %old to i32
+ %and = and i32 %extval, %extold
+ %trunc = trunc i32 %and to i8
+ store i8 %trunc, i8 *%ptr2
+ ret void
+}
+
+; Test i8 cases where the value is extended to 64 bits (just one case
+; this time).
+define void @f7(i8 *%ptr1) {
+; CHECK-LABEL: f7:
+; CHECK: nc 1(1,%r2), 0(%r2)
+; CHECK: br %r14
+ %ptr2 = getelementptr i8 *%ptr1, i64 1
+ %val = load i8 *%ptr1
+ %extval = sext i8 %val to i64
+ %old = load i8 *%ptr2
+ %extold = zext i8 %old to i64
+ %and = and i64 %extval, %extold
+ %trunc = trunc i64 %and to i8
+ store i8 %trunc, i8 *%ptr2
+ ret void
+}
+
+; Test the simple i16 case.
+define void @f8(i16 *%ptr1) {
+; CHECK-LABEL: f8:
+; CHECK: nc 2(2,%r2), 0(%r2)
+; CHECK: br %r14
+ %ptr2 = getelementptr i16 *%ptr1, i64 1
+ %val = load i16 *%ptr1
+ %old = load i16 *%ptr2
+ %and = and i16 %val, %old
+ store i16 %and, i16 *%ptr2
+ ret void
+}
+
+; Test i16 cases where the value is extended to 32 bits.
+define void @f9(i16 *%ptr1) {
+; CHECK-LABEL: f9:
+; CHECK: nc 2(2,%r2), 0(%r2)
+; CHECK: br %r14
+ %ptr2 = getelementptr i16 *%ptr1, i64 1
+ %val = load i16 *%ptr1
+ %extval = zext i16 %val to i32
+ %old = load i16 *%ptr2
+ %extold = sext i16 %old to i32
+ %and = and i32 %extval, %extold
+ %trunc = trunc i32 %and to i16
+ store i16 %trunc, i16 *%ptr2
+ ret void
+}
+
+; Test i16 cases where the value is extended to 64 bits.
+define void @f10(i16 *%ptr1) {
+; CHECK-LABEL: f10:
+; CHECK: nc 2(2,%r2), 0(%r2)
+; CHECK: br %r14
+ %ptr2 = getelementptr i16 *%ptr1, i64 1
+ %val = load i16 *%ptr1
+ %extval = sext i16 %val to i64
+ %old = load i16 *%ptr2
+ %extold = zext i16 %old to i64
+ %and = and i64 %extval, %extold
+ %trunc = trunc i64 %and to i16
+ store i16 %trunc, i16 *%ptr2
+ ret void
+}
+
+; Test the simple i32 case.
+define void @f11(i32 *%ptr1) {
+; CHECK-LABEL: f11:
+; CHECK: nc 4(4,%r2), 0(%r2)
+; CHECK: br %r14
+ %ptr2 = getelementptr i32 *%ptr1, i64 1
+ %val = load i32 *%ptr1
+ %old = load i32 *%ptr2
+ %and = and i32 %old, %val
+ store i32 %and, i32 *%ptr2
+ ret void
+}
+
+; Test i32 cases where the value is extended to 64 bits.
+define void @f12(i32 *%ptr1) {
+; CHECK-LABEL: f12:
+; CHECK: nc 4(4,%r2), 0(%r2)
+; CHECK: br %r14
+ %ptr2 = getelementptr i32 *%ptr1, i64 1
+ %val = load i32 *%ptr1
+ %extval = sext i32 %val to i64
+ %old = load i32 *%ptr2
+ %extold = zext i32 %old to i64
+ %and = and i64 %extval, %extold
+ %trunc = trunc i64 %and to i32
+ store i32 %trunc, i32 *%ptr2
+ ret void
+}
+
+; Test the i64 case.
+define void @f13(i64 *%ptr1) {
+; CHECK-LABEL: f13:
+; CHECK: nc 8(8,%r2), 0(%r2)
+; CHECK: br %r14
+ %ptr2 = getelementptr i64 *%ptr1, i64 1
+ %val = load i64 *%ptr1
+ %old = load i64 *%ptr2
+ %and = and i64 %old, %val
+ store i64 %and, i64 *%ptr2
+ ret void
+}
+
+; Make sure that we don't use NC if the first load is volatile.
+define void @f14(i64 *%ptr1) {
+; CHECK-LABEL: f14:
+; CHECK-NOT: nc
+; CHECK: br %r14
+ %ptr2 = getelementptr i64 *%ptr1, i64 1
+ %val = load volatile i64 *%ptr1
+ %old = load i64 *%ptr2
+ %and = and i64 %old, %val
+ store i64 %and, i64 *%ptr2
+ ret void
+}
+
+; ...likewise the second.
+define void @f15(i64 *%ptr1) {
+; CHECK-LABEL: f15:
+; CHECK-NOT: nc
+; CHECK: br %r14
+ %ptr2 = getelementptr i64 *%ptr1, i64 1
+ %val = load i64 *%ptr1
+ %old = load volatile i64 *%ptr2
+ %and = and i64 %old, %val
+ store i64 %and, i64 *%ptr2
+ ret void
+}
+
+; ...likewise the store.
+define void @f16(i64 *%ptr1) {
+; CHECK-LABEL: f16:
+; CHECK-NOT: nc
+; CHECK: br %r14
+ %ptr2 = getelementptr i64 *%ptr1, i64 1
+ %val = load i64 *%ptr1
+ %old = load i64 *%ptr2
+ %and = and i64 %old, %val
+ store volatile i64 %and, i64 *%ptr2
+ ret void
+}
+
+; Test that NC is not used for aligned loads and stores if there is
+; no way of telling whether they alias. We don't want to use NC in
+; cases where the addresses could be equal.
+define void @f17(i64 *%ptr1, i64 *%ptr2) {
+; CHECK-LABEL: f17:
+; CHECK-NOT: nc
+; CHECK: br %r14
+ %val = load i64 *%ptr1
+ %old = load i64 *%ptr2
+ %and = and i64 %old, %val
+ store i64 %and, i64 *%ptr2
+ ret void
+}
+
+; ...but if one of the loads isn't aligned, we can't be sure.
+define void @f18(i64 *%ptr1, i64 *%ptr2) {
+; CHECK-LABEL: f18:
+; CHECK-NOT: nc
+; CHECK: br %r14
+ %val = load i64 *%ptr1, align 2
+ %old = load i64 *%ptr2
+ %and = and i64 %old, %val
+ store i64 %and, i64 *%ptr2
+ ret void
+}
+
+; Repeat the previous test with the operands in the opposite order.
+define void @f19(i64 *%ptr1, i64 *%ptr2) {
+; CHECK-LABEL: f19:
+; CHECK-NOT: nc
+; CHECK: br %r14
+ %val = load i64 *%ptr1, align 2
+ %old = load i64 *%ptr2
+ %and = and i64 %val, %old
+ store i64 %and, i64 *%ptr2
+ ret void
+}
+
+; ...and again with the other operand being unaligned.
+define void @f20(i64 *%ptr1, i64 *%ptr2) {
+; CHECK-LABEL: f20:
+; CHECK-NOT: nc
+; CHECK: br %r14
+ %val = load i64 *%ptr1
+ %old = load i64 *%ptr2, align 2
+ %and = and i64 %val, %old
+ store i64 %and, i64 *%ptr2, align 2
+ ret void
+}
+
+; Test a case where there is definite overlap.
+define void @f21(i64 %base) {
+; CHECK-LABEL: f21:
+; CHECK-NOT: nc
+; CHECK: br %r14
+ %add = add i64 %base, 1
+ %ptr1 = inttoptr i64 %base to i64 *
+ %ptr2 = inttoptr i64 %add to i64 *
+ %val = load i64 *%ptr1
+ %old = load i64 *%ptr2, align 1
+ %and = and i64 %old, %val
+ store i64 %and, i64 *%ptr2, align 1
+ ret void
+}
+
+; Test that we can use NC for global addresses for i8.
+define void @f22(i8 *%ptr) {
+; CHECK-LABEL: f22:
+; CHECK-DAG: larl [[SRC:%r[0-5]]], g1src
+; CHECK-DAG: larl [[DST:%r[0-5]]], g1dst
+; CHECK: nc 0(1,[[DST]]), 0([[SRC]])
+; CHECK: br %r14
+ %val = load i8 *@g1src
+ %old = load i8 *@g1dst
+ %and = and i8 %val, %old
+ store i8 %and, i8 *@g1dst
+ ret void
+}
+
+; Test that we use NC even where LHRL and STHRL are available.
+define void @f23(i16 *%ptr) {
+; CHECK-LABEL: f23:
+; CHECK-DAG: larl [[SRC:%r[0-5]]], g2src
+; CHECK-DAG: larl [[DST:%r[0-5]]], g2dst
+; CHECK: nc 0(2,[[DST]]), 0([[SRC]])
+; CHECK: br %r14
+ %val = load i16 *@g2src
+ %old = load i16 *@g2dst
+ %and = and i16 %val, %old
+ store i16 %and, i16 *@g2dst
+ ret void
+}
+
+; Test a case where offset disambiguation is enough.
+define void @f24(i64 *%ptr1) {
+; CHECK-LABEL: f24:
+; CHECK: nc 8(8,%r2), 0(%r2)
+; CHECK: br %r14
+ %ptr2 = getelementptr i64 *%ptr1, i64 1
+ %val = load i64 *%ptr1, align 1
+ %old = load i64 *%ptr2, align 1
+ %and = and i64 %old, %val
+ store i64 %and, i64 *%ptr2, align 1
+ ret void
+}
+
+; Test a case where TBAA tells us there is no alias.
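+; (!3 and !4, defined at the end of the file, are TBAA tags whose access
+; types !1 and !2 are distinct siblings under the common root !0, so the
+; two accesses are known not to alias.)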
+define void @f25(i64 *%ptr1, i64 *%ptr2) {
+; CHECK-LABEL: f25:
+; CHECK: nc 0(8,%r3), 0(%r2)
+; CHECK: br %r14
+ %val = load i64 *%ptr1, align 2, !tbaa !3
+ %old = load i64 *%ptr2, align 2, !tbaa !4
+ %and = and i64 %old, %val
+ store i64 %and, i64 *%ptr2, align 2, !tbaa !4
+ ret void
+}
+
+; Test a case where TBAA information is present but doesn't help.
+define void @f26(i64 *%ptr1, i64 *%ptr2) {
+; CHECK-LABEL: f26:
+; CHECK-NOT: nc
+; CHECK: br %r14
+ %val = load i64 *%ptr1, align 2, !tbaa !3
+ %old = load i64 *%ptr2, align 2, !tbaa !3
+ %and = and i64 %old, %val
+ store i64 %and, i64 *%ptr2, align 2, !tbaa !3
+ ret void
+}
+
+!0 = metadata !{ metadata !"root" }
+!1 = metadata !{ metadata !"set1", metadata !0 }
+!2 = metadata !{ metadata !"set2", metadata !0 }
+!3 = metadata !{ metadata !1, metadata !1, i64 0}
+!4 = metadata !{ metadata !2, metadata !2, i64 0}
diff --git a/test/CodeGen/SystemZ/args-06.ll b/test/CodeGen/SystemZ/args-06.ll
index a89fe9b..644fcec9 100644
--- a/test/CodeGen/SystemZ/args-06.ll
+++ b/test/CodeGen/SystemZ/args-06.ll
@@ -27,8 +27,8 @@ define i16 @f2(i16 %a, i16 %b, i16 %c, i16 %d, i16 %e, i16 %f, i16 %g) {
; CHECK: ar %r2, %r4
; CHECK: ar %r2, %r5
; CHECK: ar %r2, %r6
-; CHECK: lh {{%r[0-5]}}, 166(%r15)
-; CHECK: lh {{%r[0-5]}}, 174(%r15)
+; CHECK: ah %r2, 166(%r15)
+; CHECK: ah %r2, 174(%r15)
; CHECK: br %r14
%addb = add i16 %a, %b
%addc = add i16 %addb, %c
diff --git a/test/CodeGen/SystemZ/asm-17.ll b/test/CodeGen/SystemZ/asm-17.ll
index 33234fc..7bc9da3 100644
--- a/test/CodeGen/SystemZ/asm-17.ll
+++ b/test/CodeGen/SystemZ/asm-17.ll
@@ -80,3 +80,26 @@ define float @f7(float %in) {
call void asm sideeffect "blah", "~{f0},~{cc}"()
ret float %in
}
+
+; Test that both registers in a GR128 pair get hoisted.
+define void @f8(i32 %count) {
+; CHECK-LABEL: f8:
+; CHECK-DAG: lhi %r0, 0
+; CHECK-DAG: lhi %r1, 1
+; CHECK: %loop
+; CHECK-NOT: %r
+; CHECK: blah %r0, %r1
+; CHECK: br %r14
+entry:
+ br label %loop
+
+loop:
+ %this = phi i32 [ %count, %entry ], [ %next, %loop ]
+ call void asm sideeffect "blah $0, $1", "{r0},{r1}" (i32 0, i32 1)
+ %next = sub i32 %this, 1
+ %cmp = icmp ne i32 %next, 0
+ br i1 %cmp, label %loop, label %exit
+
+exit:
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/asm-18.ll b/test/CodeGen/SystemZ/asm-18.ll
new file mode 100644
index 0000000..d60654b
--- /dev/null
+++ b/test/CodeGen/SystemZ/asm-18.ll
@@ -0,0 +1,745 @@
+; Test high-word operations, using "h" constraints to force a high
+; register and "r" constraints to force a low register.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
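+;
+; ("High" registers here are the high 32 bits of the 64-bit GPRs, which
+; z196's high-word facility can operate on directly; "low" registers are
+; the ordinary low 32-bit halves.)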
+
+; Test loads and stores involving mixtures of high and low registers.
+define void @f1(i32 *%ptr1, i32 *%ptr2) {
+; CHECK-LABEL: f1:
+; CHECK-DAG: lfh [[REG1:%r[0-5]]], 0(%r2)
+; CHECK-DAG: l [[REG2:%r[0-5]]], 0(%r3)
+; CHECK-DAG: lfh [[REG3:%r[0-5]]], 4096(%r2)
+; CHECK-DAG: ly [[REG4:%r[0-5]]], 524284(%r3)
+; CHECK: blah [[REG1]], [[REG2]], [[REG3]], [[REG4]]
+; CHECK-DAG: stfh [[REG1]], 0(%r2)
+; CHECK-DAG: st [[REG2]], 0(%r3)
+; CHECK-DAG: stfh [[REG3]], 4096(%r2)
+; CHECK-DAG: sty [[REG4]], 524284(%r3)
+; CHECK: br %r14
+ %ptr3 = getelementptr i32 *%ptr1, i64 1024
+ %ptr4 = getelementptr i32 *%ptr2, i64 131071
+ %old1 = load i32 *%ptr1
+ %old2 = load i32 *%ptr2
+ %old3 = load i32 *%ptr3
+ %old4 = load i32 *%ptr4
+ %res = call { i32, i32, i32, i32 } asm "blah $0, $1, $2, $3",
+ "=h,=r,=h,=r,0,1,2,3"(i32 %old1, i32 %old2, i32 %old3, i32 %old4)
+ %new1 = extractvalue { i32, i32, i32, i32 } %res, 0
+ %new2 = extractvalue { i32, i32, i32, i32 } %res, 1
+ %new3 = extractvalue { i32, i32, i32, i32 } %res, 2
+ %new4 = extractvalue { i32, i32, i32, i32 } %res, 3
+ store i32 %new1, i32 *%ptr1
+ store i32 %new2, i32 *%ptr2
+ store i32 %new3, i32 *%ptr3
+ store i32 %new4, i32 *%ptr4
+ ret void
+}
+
+; Test moves involving mixtures of high and low registers.
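+; The "risbhg ..., 0, 159, 32" pattern seen below copies the low 32 bits of
+; the source register into the high word of the destination, and
+; "risblg ..., 0, 159, 32" does the reverse; variants of both appear
+; throughout this file whenever a value moves between low and high registers.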
+define i32 @f2(i32 %old) {
+; CHECK-LABEL: f2:
+; CHECK-DAG: risbhg [[REG1:%r[0-5]]], %r2, 0, 159, 32
+; CHECK-DAG: lr %r3, %r2
+; CHECK: stepa [[REG1]], %r2, %r3
+; CHECK: risbhg {{%r[0-5]}}, [[REG1]], 0, 159, 0
+; CHECK: stepb [[REG2:%r[0-5]]]
+; CHECK: risblg %r2, [[REG2]], 0, 159, 32
+; CHECK: br %r14
+ %tmp = call i32 asm "stepa $1, $2, $3",
+ "=h,0,{r2},{r3}"(i32 %old, i32 %old, i32 %old)
+ %new = call i32 asm "stepb $1, $2", "=&h,0,h"(i32 %tmp, i32 %tmp)
+ ret i32 %new
+}
+
+; Test sign-extending 8-bit loads into mixtures of high and low registers.
+define void @f3(i8 *%ptr1, i8 *%ptr2) {
+; CHECK-LABEL: f3:
+; CHECK-DAG: lbh [[REG1:%r[0-5]]], 0(%r2)
+; CHECK-DAG: lb [[REG2:%r[0-5]]], 0(%r3)
+; CHECK-DAG: lbh [[REG3:%r[0-5]]], 4096(%r2)
+; CHECK-DAG: lb [[REG4:%r[0-5]]], 524287(%r3)
+; CHECK: blah [[REG1]], [[REG2]]
+; CHECK: br %r14
+ %ptr3 = getelementptr i8 *%ptr1, i64 4096
+ %ptr4 = getelementptr i8 *%ptr2, i64 524287
+ %val1 = load i8 *%ptr1
+ %val2 = load i8 *%ptr2
+ %val3 = load i8 *%ptr3
+ %val4 = load i8 *%ptr4
+ %ext1 = sext i8 %val1 to i32
+ %ext2 = sext i8 %val2 to i32
+ %ext3 = sext i8 %val3 to i32
+ %ext4 = sext i8 %val4 to i32
+ call void asm sideeffect "blah $0, $1, $2, $3",
+ "h,r,h,r"(i32 %ext1, i32 %ext2, i32 %ext3, i32 %ext4)
+ ret void
+}
+
+; Test sign-extending 16-bit loads into mixtures of high and low registers.
+define void @f4(i16 *%ptr1, i16 *%ptr2) {
+; CHECK-LABEL: f4:
+; CHECK-DAG: lhh [[REG1:%r[0-5]]], 0(%r2)
+; CHECK-DAG: lh [[REG2:%r[0-5]]], 0(%r3)
+; CHECK-DAG: lhh [[REG3:%r[0-5]]], 4096(%r2)
+; CHECK-DAG: lhy [[REG4:%r[0-5]]], 524286(%r3)
+; CHECK: blah [[REG1]], [[REG2]]
+; CHECK: br %r14
+ %ptr3 = getelementptr i16 *%ptr1, i64 2048
+ %ptr4 = getelementptr i16 *%ptr2, i64 262143
+ %val1 = load i16 *%ptr1
+ %val2 = load i16 *%ptr2
+ %val3 = load i16 *%ptr3
+ %val4 = load i16 *%ptr4
+ %ext1 = sext i16 %val1 to i32
+ %ext2 = sext i16 %val2 to i32
+ %ext3 = sext i16 %val3 to i32
+ %ext4 = sext i16 %val4 to i32
+ call void asm sideeffect "blah $0, $1, $2, $3",
+ "h,r,h,r"(i32 %ext1, i32 %ext2, i32 %ext3, i32 %ext4)
+ ret void
+}
+
+; Test zero-extending 8-bit loads into mixtures of high and low registers.
+define void @f5(i8 *%ptr1, i8 *%ptr2) {
+; CHECK-LABEL: f5:
+; CHECK-DAG: llch [[REG1:%r[0-5]]], 0(%r2)
+; CHECK-DAG: llc [[REG2:%r[0-5]]], 0(%r3)
+; CHECK-DAG: llch [[REG3:%r[0-5]]], 4096(%r2)
+; CHECK-DAG: llc [[REG4:%r[0-5]]], 524287(%r3)
+; CHECK: blah [[REG1]], [[REG2]]
+; CHECK: br %r14
+ %ptr3 = getelementptr i8 *%ptr1, i64 4096
+ %ptr4 = getelementptr i8 *%ptr2, i64 524287
+ %val1 = load i8 *%ptr1
+ %val2 = load i8 *%ptr2
+ %val3 = load i8 *%ptr3
+ %val4 = load i8 *%ptr4
+ %ext1 = zext i8 %val1 to i32
+ %ext2 = zext i8 %val2 to i32
+ %ext3 = zext i8 %val3 to i32
+ %ext4 = zext i8 %val4 to i32
+ call void asm sideeffect "blah $0, $1, $2, $3",
+ "h,r,h,r"(i32 %ext1, i32 %ext2, i32 %ext3, i32 %ext4)
+ ret void
+}
+
+; Test zero-extending 16-bit loads into mixtures of high and low registers.
+define void @f6(i16 *%ptr1, i16 *%ptr2) {
+; CHECK-LABEL: f6:
+; CHECK-DAG: llhh [[REG1:%r[0-5]]], 0(%r2)
+; CHECK-DAG: llh [[REG2:%r[0-5]]], 0(%r3)
+; CHECK-DAG: llhh [[REG3:%r[0-5]]], 4096(%r2)
+; CHECK-DAG: llh [[REG4:%r[0-5]]], 524286(%r3)
+; CHECK: blah [[REG1]], [[REG2]]
+; CHECK: br %r14
+ %ptr3 = getelementptr i16 *%ptr1, i64 2048
+ %ptr4 = getelementptr i16 *%ptr2, i64 262143
+ %val1 = load i16 *%ptr1
+ %val2 = load i16 *%ptr2
+ %val3 = load i16 *%ptr3
+ %val4 = load i16 *%ptr4
+ %ext1 = zext i16 %val1 to i32
+ %ext2 = zext i16 %val2 to i32
+ %ext3 = zext i16 %val3 to i32
+ %ext4 = zext i16 %val4 to i32
+ call void asm sideeffect "blah $0, $1, $2, $3",
+ "h,r,h,r"(i32 %ext1, i32 %ext2, i32 %ext3, i32 %ext4)
+ ret void
+}
+
+; Test truncating stores of high and low registers into 8-bit memory.
+define void @f7(i8 *%ptr1, i8 *%ptr2) {
+; CHECK-LABEL: f7:
+; CHECK: blah [[REG1:%r[0-5]]], [[REG2:%r[0-5]]]
+; CHECK-DAG: stch [[REG1]], 0(%r2)
+; CHECK-DAG: stc [[REG2]], 0(%r3)
+; CHECK-DAG: stch [[REG1]], 4096(%r2)
+; CHECK-DAG: stcy [[REG2]], 524287(%r3)
+; CHECK: br %r14
+ %res = call { i32, i32 } asm "blah $0, $1", "=h,=r"()
+ %res1 = extractvalue { i32, i32 } %res, 0
+ %res2 = extractvalue { i32, i32 } %res, 1
+ %trunc1 = trunc i32 %res1 to i8
+ %trunc2 = trunc i32 %res2 to i8
+ %ptr3 = getelementptr i8 *%ptr1, i64 4096
+ %ptr4 = getelementptr i8 *%ptr2, i64 524287
+ store i8 %trunc1, i8 *%ptr1
+ store i8 %trunc2, i8 *%ptr2
+ store i8 %trunc1, i8 *%ptr3
+ store i8 %trunc2, i8 *%ptr4
+ ret void
+}
+
+; Test truncating stores of high and low registers into 16-bit memory.
+define void @f8(i16 *%ptr1, i16 *%ptr2) {
+; CHECK-LABEL: f8:
+; CHECK: blah [[REG1:%r[0-5]]], [[REG2:%r[0-5]]]
+; CHECK-DAG: sthh [[REG1]], 0(%r2)
+; CHECK-DAG: sth [[REG2]], 0(%r3)
+; CHECK-DAG: sthh [[REG1]], 4096(%r2)
+; CHECK-DAG: sthy [[REG2]], 524286(%r3)
+; CHECK: br %r14
+ %res = call { i32, i32 } asm "blah $0, $1", "=h,=r"()
+ %res1 = extractvalue { i32, i32 } %res, 0
+ %res2 = extractvalue { i32, i32 } %res, 1
+ %trunc1 = trunc i32 %res1 to i16
+ %trunc2 = trunc i32 %res2 to i16
+ %ptr3 = getelementptr i16 *%ptr1, i64 2048
+ %ptr4 = getelementptr i16 *%ptr2, i64 262143
+ store i16 %trunc1, i16 *%ptr1
+ store i16 %trunc2, i16 *%ptr2
+ store i16 %trunc1, i16 *%ptr3
+ store i16 %trunc2, i16 *%ptr4
+ ret void
+}
+
+; Test zero extensions from 8 bits between mixtures of high and low registers.
+define i32 @f9(i8 %val1, i8 %val2) {
+; CHECK-LABEL: f9:
+; CHECK-DAG: risbhg [[REG1:%r[0-5]]], %r2, 24, 159, 32
+; CHECK-DAG: llcr [[REG2:%r[0-5]]], %r3
+; CHECK: stepa [[REG1]], [[REG2]]
+; CHECK: risbhg [[REG3:%r[0-5]]], [[REG1]], 24, 159, 0
+; CHECK: stepb [[REG3]]
+; CHECK: risblg %r2, [[REG3]], 24, 159, 32
+; CHECK: br %r14
+ %ext1 = zext i8 %val1 to i32
+ %ext2 = zext i8 %val2 to i32
+ %val3 = call i8 asm sideeffect "stepa $0, $1", "=h,0,r"(i32 %ext1, i32 %ext2)
+ %ext3 = zext i8 %val3 to i32
+ %val4 = call i8 asm sideeffect "stepb $0", "=h,0"(i32 %ext3)
+ %ext4 = zext i8 %val4 to i32
+ ret i32 %ext4
+}
+
+; Test zero extensions from 16 bits between mixtures of high and low registers.
+define i32 @f10(i16 %val1, i16 %val2) {
+; CHECK-LABEL: f10:
+; CHECK-DAG: risbhg [[REG1:%r[0-5]]], %r2, 16, 159, 32
+; CHECK-DAG: llhr [[REG2:%r[0-5]]], %r3
+; CHECK: stepa [[REG1]], [[REG2]]
+; CHECK: risbhg [[REG3:%r[0-5]]], [[REG1]], 16, 159, 0
+; CHECK: stepb [[REG3]]
+; CHECK: risblg %r2, [[REG3]], 16, 159, 32
+; CHECK: br %r14
+ %ext1 = zext i16 %val1 to i32
+ %ext2 = zext i16 %val2 to i32
+ %val3 = call i16 asm sideeffect "stepa $0, $1", "=h,0,r"(i32 %ext1, i32 %ext2)
+ %ext3 = zext i16 %val3 to i32
+ %val4 = call i16 asm sideeffect "stepb $0", "=h,0"(i32 %ext3)
+ %ext4 = zext i16 %val4 to i32
+ ret i32 %ext4
+}
+
+; Test loads of 16-bit constants into mixtures of high and low registers.
+define void @f11() {
+; CHECK-LABEL: f11:
+; CHECK-DAG: iihf [[REG1:%r[0-5]]], 4294934529
+; CHECK-DAG: lhi [[REG2:%r[0-5]]], -32768
+; CHECK-DAG: llihl [[REG3:%r[0-5]]], 32766
+; CHECK-DAG: lhi [[REG4:%r[0-5]]], 32767
+; CHECK: blah [[REG1]], [[REG2]], [[REG3]], [[REG4]]
+; CHECK: br %r14
+ call void asm sideeffect "blah $0, $1, $2, $3",
+ "h,r,h,r"(i32 -32767, i32 -32768,
+ i32 32766, i32 32767)
+ ret void
+}
+
+; Test loads of unsigned constants into mixtures of high and low registers.
+; For stepc, we expect the h and r operands to be paired by the register
+; allocator. It doesn't really matter which comes first: LLILL/IIHF would
+; be just as good.
+define void @f12() {
+; CHECK-LABEL: f12:
+; CHECK-DAG: llihl [[REG1:%r[0-5]]], 32768
+; CHECK-DAG: llihl [[REG2:%r[0-5]]], 65535
+; CHECK-DAG: llihh [[REG3:%r[0-5]]], 1
+; CHECK-DAG: llihh [[REG4:%r[0-5]]], 65535
+; CHECK: stepa [[REG1]], [[REG2]], [[REG3]], [[REG4]]
+; CHECK-DAG: llill [[REG1:%r[0-5]]], 32769
+; CHECK-DAG: llill [[REG2:%r[0-5]]], 65534
+; CHECK-DAG: llilh [[REG3:%r[0-5]]], 2
+; CHECK-DAG: llilh [[REG4:%r[0-5]]], 65534
+; CHECK: stepb [[REG1]], [[REG2]], [[REG3]], [[REG4]]
+; CHECK-DAG: llihl [[REG1:%r[0-5]]], 32770
+; CHECK-DAG: iilf [[REG1]], 65533
+; CHECK-DAG: llihh [[REG2:%r[0-5]]], 4
+; CHECK-DAG: iilf [[REG2]], 524288
+; CHECK: stepc [[REG1]], [[REG1]], [[REG2]], [[REG2]]
+; CHECK-DAG: iihf [[REG1:%r[0-5]]], 3294967296
+; CHECK-DAG: iilf [[REG2:%r[0-5]]], 4294567296
+; CHECK-DAG: iihf [[REG3:%r[0-5]]], 1000000000
+; CHECK-DAG: iilf [[REG4:%r[0-5]]], 400000
+; CHECK: stepd [[REG1]], [[REG2]], [[REG3]], [[REG4]]
+; CHECK: br %r14
+ call void asm sideeffect "stepa $0, $1, $2, $3",
+ "h,h,h,h"(i32 32768, i32 65535,
+ i32 65536, i32 -65536)
+ call void asm sideeffect "stepb $0, $1, $2, $3",
+ "r,r,r,r"(i32 32769, i32 65534,
+ i32 131072, i32 -131072)
+ call void asm sideeffect "stepc $0, $1, $2, $3",
+ "h,r,h,r"(i32 32770, i32 65533,
+ i32 262144, i32 524288)
+ call void asm sideeffect "stepd $0, $1, $2, $3",
+ "h,r,h,r"(i32 -1000000000, i32 -400000,
+ i32 1000000000, i32 400000)
+ ret void
+}
+
+; Test selects involving high registers.
+define void @f13(i32 %x, i32 %y) {
+; CHECK-LABEL: f13:
+; CHECK: llihl [[REG:%r[0-5]]], 0
+; CHECK: cije %r2, 0
+; CHECK: iihf [[REG]], 2102030405
+; CHECK: blah [[REG]]
+; CHECK: br %r14
+ %cmp = icmp eq i32 %x, 0
+ %val = select i1 %cmp, i32 0, i32 2102030405
+ call void asm sideeffect "blah $0", "h"(i32 %val)
+ ret void
+}
+
+; Test selects involving low registers.
+define void @f14(i32 %x, i32 %y) {
+; CHECK-LABEL: f14:
+; CHECK: lhi [[REG:%r[0-5]]], 0
+; CHECK: cije %r2, 0
+; CHECK: iilf [[REG]], 2102030405
+; CHECK: blah [[REG]]
+; CHECK: br %r14
+ %cmp = icmp eq i32 %x, 0
+ %val = select i1 %cmp, i32 0, i32 2102030405
+ call void asm sideeffect "blah $0", "r"(i32 %val)
+ ret void
+}
+
+; Test immediate insertion involving high registers.
+define void @f15() {
+; CHECK-LABEL: f15:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: iihh [[REG]], 4660
+; CHECK: stepb [[REG]]
+; CHECK: iihl [[REG]], 34661
+; CHECK: stepc [[REG]]
+; CHECK: br %r14
+ %res1 = call i32 asm "stepa $0", "=h"()
+ %and1 = and i32 %res1, 65535
+ %or1 = or i32 %and1, 305397760
+ %res2 = call i32 asm "stepb $0, $1", "=h,h"(i32 %or1)
+ %and2 = and i32 %res2, -65536
+ %or2 = or i32 %and2, 34661
+ call void asm sideeffect "stepc $0", "h"(i32 %or2)
+ ret void
+}
+
+; Test immediate insertion involving low registers.
+define void @f16() {
+; CHECK-LABEL: f16:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: iilh [[REG]], 4660
+; CHECK: stepb [[REG]]
+; CHECK: iill [[REG]], 34661
+; CHECK: stepc [[REG]]
+; CHECK: br %r14
+ %res1 = call i32 asm "stepa $0", "=r"()
+ %and1 = and i32 %res1, 65535
+ %or1 = or i32 %and1, 305397760
+ %res2 = call i32 asm "stepb $0, $1", "=r,r"(i32 %or1)
+ %and2 = and i32 %res2, -65536
+ %or2 = or i32 %and2, 34661
+ call void asm sideeffect "stepc $0", "r"(i32 %or2)
+ ret void
+}
+
+; Test immediate OR involving high registers.
+define void @f17() {
+; CHECK-LABEL: f17:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: oihh [[REG]], 4660
+; CHECK: stepb [[REG]]
+; CHECK: oihl [[REG]], 34661
+; CHECK: stepc [[REG]]
+; CHECK: oihf [[REG]], 12345678
+; CHECK: stepd [[REG]]
+; CHECK: br %r14
+ %res1 = call i32 asm "stepa $0", "=h"()
+ %or1 = or i32 %res1, 305397760
+ %res2 = call i32 asm "stepb $0, $1", "=h,h"(i32 %or1)
+ %or2 = or i32 %res2, 34661
+ %res3 = call i32 asm "stepc $0, $1", "=h,h"(i32 %or2)
+ %or3 = or i32 %res3, 12345678
+ call void asm sideeffect "stepd $0", "h"(i32 %or3)
+ ret void
+}
+
+; Test immediate OR involving low registers.
+define void @f18() {
+; CHECK-LABEL: f18:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: oilh [[REG]], 4660
+; CHECK: stepb [[REG]]
+; CHECK: oill [[REG]], 34661
+; CHECK: stepc [[REG]]
+; CHECK: oilf [[REG]], 12345678
+; CHECK: stepd [[REG]]
+; CHECK: br %r14
+ %res1 = call i32 asm "stepa $0", "=r"()
+ %or1 = or i32 %res1, 305397760
+ %res2 = call i32 asm "stepb $0, $1", "=r,r"(i32 %or1)
+ %or2 = or i32 %res2, 34661
+ %res3 = call i32 asm "stepc $0, $1", "=r,r"(i32 %or2)
+ %or3 = or i32 %res3, 12345678
+ call void asm sideeffect "stepd $0", "r"(i32 %or3)
+ ret void
+}
+
+; Test immediate XOR involving high registers.
+define void @f19() {
+; CHECK-LABEL: f19:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: xihf [[REG]], 305397760
+; CHECK: stepb [[REG]]
+; CHECK: xihf [[REG]], 34661
+; CHECK: stepc [[REG]]
+; CHECK: xihf [[REG]], 12345678
+; CHECK: stepd [[REG]]
+; CHECK: br %r14
+ %res1 = call i32 asm "stepa $0", "=h"()
+ %xor1 = xor i32 %res1, 305397760
+ %res2 = call i32 asm "stepb $0, $1", "=h,h"(i32 %xor1)
+ %xor2 = xor i32 %res2, 34661
+ %res3 = call i32 asm "stepc $0, $1", "=h,h"(i32 %xor2)
+ %xor3 = xor i32 %res3, 12345678
+ call void asm sideeffect "stepd $0", "h"(i32 %xor3)
+ ret void
+}
+
+; Test immediate XOR involving low registers.
+define void @f20() {
+; CHECK-LABEL: f20:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: xilf [[REG]], 305397760
+; CHECK: stepb [[REG]]
+; CHECK: xilf [[REG]], 34661
+; CHECK: stepc [[REG]]
+; CHECK: xilf [[REG]], 12345678
+; CHECK: stepd [[REG]]
+; CHECK: br %r14
+ %res1 = call i32 asm "stepa $0", "=r"()
+ %xor1 = xor i32 %res1, 305397760
+ %res2 = call i32 asm "stepb $0, $1", "=r,r"(i32 %xor1)
+ %xor2 = xor i32 %res2, 34661
+ %res3 = call i32 asm "stepc $0, $1", "=r,r"(i32 %xor2)
+ %xor3 = xor i32 %res3, 12345678
+ call void asm sideeffect "stepd $0", "r"(i32 %xor3)
+ ret void
+}
+
+; Test two-operand immediate AND involving high registers.
+define void @f21() {
+; CHECK-LABEL: f21:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: nihh [[REG]], 4096
+; CHECK: stepb [[REG]]
+; CHECK: nihl [[REG]], 57536
+; CHECK: stepc [[REG]]
+; CHECK: nihf [[REG]], 12345678
+; CHECK: stepd [[REG]]
+; CHECK: br %r14
+ %res1 = call i32 asm "stepa $0", "=h"()
+ %and1 = and i32 %res1, 268500991
+ %res2 = call i32 asm "stepb $0, $1", "=h,h"(i32 %and1)
+ %and2 = and i32 %res2, -8000
+ %res3 = call i32 asm "stepc $0, $1", "=h,h"(i32 %and2)
+ %and3 = and i32 %res3, 12345678
+ call void asm sideeffect "stepd $0", "h"(i32 %and3)
+ ret void
+}
+
+; Test two-operand immediate AND involving low registers.
+define void @f22() {
+; CHECK-LABEL: f22:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: nilh [[REG]], 4096
+; CHECK: stepb [[REG]]
+; CHECK: nill [[REG]], 57536
+; CHECK: stepc [[REG]]
+; CHECK: nilf [[REG]], 12345678
+; CHECK: stepd [[REG]]
+; CHECK: br %r14
+ %res1 = call i32 asm "stepa $0", "=r"()
+ %and1 = and i32 %res1, 268500991
+ %res2 = call i32 asm "stepb $0, $1", "=r,r"(i32 %and1)
+ %and2 = and i32 %res2, -8000
+ %res3 = call i32 asm "stepc $0, $1", "=r,r"(i32 %and2)
+ %and3 = and i32 %res3, 12345678
+ call void asm sideeffect "stepd $0", "r"(i32 %and3)
+ ret void
+}
+
+; Test three-operand immediate AND involving mixtures of low and high registers.
+define i32 @f23(i32 %old) {
+; CHECK-LABEL: f23:
+; CHECK-DAG: risblg [[REG1:%r[0-5]]], %r2, 28, 158, 0
+; CHECK-DAG: risbhg [[REG2:%r[0-5]]], %r2, 24, 158, 32
+; CHECK: stepa %r2, [[REG1]], [[REG2]]
+; CHECK-DAG: risbhg [[REG3:%r[0-5]]], [[REG2]], 25, 159, 0
+; CHECK-DAG: risblg %r2, [[REG2]], 24, 152, 32
+; CHECK: stepb [[REG2]], [[REG3]], %r2
+; CHECK: br %r14
+ %and1 = and i32 %old, 14
+ %and2 = and i32 %old, 254
+ %res1 = call i32 asm "stepa $1, $2, $3",
+ "=h,r,r,0"(i32 %old, i32 %and1, i32 %and2)
+ %and3 = and i32 %res1, 127
+ %and4 = and i32 %res1, 128
+ %res2 = call i32 asm "stepb $1, $2, $3",
+ "=r,h,h,0"(i32 %res1, i32 %and3, i32 %and4)
+ ret i32 %res2
+}
+
+; Test RISB[LH]G insertions involving mixtures of high and low registers.
+define i32 @f24(i32 %old) {
+; CHECK-LABEL: f24:
+; CHECK-DAG: risblg [[REG1:%r[0-5]]], %r2, 28, 158, 1
+; CHECK-DAG: risbhg [[REG2:%r[0-5]]], %r2, 24, 158, 29
+; CHECK: stepa %r2, [[REG1]], [[REG2]]
+; CHECK-DAG: risbhg [[REG3:%r[0-5]]], [[REG2]], 25, 159, 62
+; CHECK-DAG: risblg %r2, [[REG2]], 24, 152, 37
+; CHECK: stepb [[REG2]], [[REG3]], %r2
+; CHECK: br %r14
+ %shift1 = shl i32 %old, 1
+ %and1 = and i32 %shift1, 14
+ %shift2 = lshr i32 %old, 3
+ %and2 = and i32 %shift2, 254
+ %res1 = call i32 asm "stepa $1, $2, $3",
+ "=h,r,r,0"(i32 %old, i32 %and1, i32 %and2)
+ %shift3 = lshr i32 %res1, 2
+ %and3 = and i32 %shift3, 127
+ %shift4 = shl i32 %res1, 5
+ %and4 = and i32 %shift4, 128
+ %res2 = call i32 asm "stepb $1, $2, $3",
+ "=r,h,h,0"(i32 %res1, i32 %and3, i32 %and4)
+ ret i32 %res2
+}
+
+; Test TMxx involving mixtures of high and low registers.
+define i32 @f25(i32 %old) {
+; CHECK-LABEL: f25:
+; CHECK-DAG: tmll %r2, 1
+; CHECK-DAG: tmlh %r2, 1
+; CHECK: stepa [[REG1:%r[0-5]]],
+; CHECK-DAG: tmhl [[REG1]], 1
+; CHECK-DAG: tmhh [[REG1]], 1
+; CHECK: stepb %r2,
+; CHECK: br %r14
+ %and1 = and i32 %old, 1
+ %and2 = and i32 %old, 65536
+ %cmp1 = icmp eq i32 %and1, 0
+ %cmp2 = icmp eq i32 %and2, 0
+ %sel1 = select i1 %cmp1, i32 100, i32 200
+ %sel2 = select i1 %cmp2, i32 100, i32 200
+ %res1 = call i32 asm "stepa $0, $1, $2",
+ "=h,r,r"(i32 %sel1, i32 %sel2)
+ %and3 = and i32 %res1, 1
+ %and4 = and i32 %res1, 65536
+ %cmp3 = icmp eq i32 %and3, 0
+ %cmp4 = icmp eq i32 %and4, 0
+ %sel3 = select i1 %cmp3, i32 100, i32 200
+ %sel4 = select i1 %cmp4, i32 100, i32 200
+ %res2 = call i32 asm "stepb $0, $1, $2",
+ "=r,h,h"(i32 %sel3, i32 %sel4)
+ ret i32 %res2
+}
+
+; Test two-operand halfword immediate addition involving high registers.
+define void @f26() {
+; CHECK-LABEL: f26:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: aih [[REG]], -32768
+; CHECK: stepb [[REG]]
+; CHECK: aih [[REG]], 1
+; CHECK: stepc [[REG]]
+; CHECK: aih [[REG]], 32767
+; CHECK: stepd [[REG]]
+; CHECK: br %r14
+ %res1 = call i32 asm "stepa $0", "=h"()
+ %add1 = add i32 %res1, -32768
+ %res2 = call i32 asm "stepb $0, $1", "=h,h"(i32 %add1)
+ %add2 = add i32 %res2, 1
+ %res3 = call i32 asm "stepc $0, $1", "=h,h"(i32 %add2)
+ %add3 = add i32 %res3, 32767
+ call void asm sideeffect "stepd $0", "h"(i32 %add3)
+ ret void
+}
+
+; Test two-operand halfword immediate addition involving low registers.
+define void @f27() {
+; CHECK-LABEL: f27:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: ahi [[REG]], -32768
+; CHECK: stepb [[REG]]
+; CHECK: ahi [[REG]], 1
+; CHECK: stepc [[REG]]
+; CHECK: ahi [[REG]], 32767
+; CHECK: stepd [[REG]]
+; CHECK: br %r14
+ %res1 = call i32 asm "stepa $0", "=r"()
+ %add1 = add i32 %res1, -32768
+ %res2 = call i32 asm "stepb $0, $1", "=r,r"(i32 %add1)
+ %add2 = add i32 %res2, 1
+ %res3 = call i32 asm "stepc $0, $1", "=r,r"(i32 %add2)
+ %add3 = add i32 %res3, 32767
+ call void asm sideeffect "stepd $0", "r"(i32 %add3)
+ ret void
+}
+
+; Test three-operand halfword immediate addition involving mixtures of low
+; and high registers. RISBHG/AIH would be OK too, instead of AHIK/RISBHG.
+define i32 @f28(i32 %old) {
+; CHECK-LABEL: f28:
+; CHECK: ahik [[REG1:%r[0-5]]], %r2, 14
+; CHECK: stepa %r2, [[REG1]]
+; CHECK: ahik [[TMP:%r[0-5]]], [[REG1]], 254
+; CHECK: risbhg [[REG2:%r[0-5]]], [[TMP]], 0, 159, 32
+; CHECK: stepb [[REG1]], [[REG2]]
+; CHECK: risbhg [[REG3:%r[0-5]]], [[REG2]], 0, 159, 0
+; CHECK: aih [[REG3]], 127
+; CHECK: stepc [[REG2]], [[REG3]]
+; CHECK: risblg %r2, [[REG3]], 0, 159, 32
+; CHECK: ahi %r2, 128
+; CHECK: stepd [[REG3]], %r2
+; CHECK: br %r14
+ %add1 = add i32 %old, 14
+ %res1 = call i32 asm "stepa $1, $2",
+ "=r,r,0"(i32 %old, i32 %add1)
+ %add2 = add i32 %res1, 254
+ %res2 = call i32 asm "stepb $1, $2",
+ "=h,r,0"(i32 %res1, i32 %add2)
+ %add3 = add i32 %res2, 127
+ %res3 = call i32 asm "stepc $1, $2",
+ "=h,h,0"(i32 %res2, i32 %add3)
+ %add4 = add i32 %res3, 128
+ %res4 = call i32 asm "stepd $1, $2",
+ "=r,h,0"(i32 %res3, i32 %add4)
+ ret i32 %res4
+}
+
+; Test large immediate addition involving high registers.
+define void @f29() {
+; CHECK-LABEL: f29:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: aih [[REG]], -32769
+; CHECK: stepb [[REG]]
+; CHECK: aih [[REG]], 32768
+; CHECK: stepc [[REG]]
+; CHECK: aih [[REG]], 1000000000
+; CHECK: stepd [[REG]]
+; CHECK: br %r14
+ %res1 = call i32 asm "stepa $0", "=h"()
+ %add1 = add i32 %res1, -32769
+ %res2 = call i32 asm "stepb $0, $1", "=h,h"(i32 %add1)
+ %add2 = add i32 %res2, 32768
+ %res3 = call i32 asm "stepc $0, $1", "=h,h"(i32 %add2)
+ %add3 = add i32 %res3, 1000000000
+ call void asm sideeffect "stepd $0", "h"(i32 %add3)
+ ret void
+}
+
+; Test large immediate addition involving low registers.
+define void @f30() {
+; CHECK-LABEL: f30:
+; CHECK: stepa [[REG:%r[0-5]]]
+; CHECK: afi [[REG]], -32769
+; CHECK: stepb [[REG]]
+; CHECK: afi [[REG]], 32768
+; CHECK: stepc [[REG]]
+; CHECK: afi [[REG]], 1000000000
+; CHECK: stepd [[REG]]
+; CHECK: br %r14
+ %res1 = call i32 asm "stepa $0", "=r"()
+ %add1 = add i32 %res1, -32769
+ %res2 = call i32 asm "stepb $0, $1", "=r,r"(i32 %add1)
+ %add2 = add i32 %res2, 32768
+ %res3 = call i32 asm "stepc $0, $1", "=r,r"(i32 %add2)
+ %add3 = add i32 %res3, 1000000000
+ call void asm sideeffect "stepd $0", "r"(i32 %add3)
+ ret void
+}
+
+; Test large immediate comparison involving high registers.
+define i32 @f31() {
+; CHECK-LABEL: f31:
+; CHECK: stepa [[REG1:%r[0-5]]]
+; CHECK: cih [[REG1]], 1000000000
+; CHECK: stepb [[REG2:%r[0-5]]]
+; CHECK: clih [[REG2]], 1000000000
+; CHECK: br %r14
+ %res1 = call i32 asm "stepa $0", "=h"()
+ %cmp1 = icmp sle i32 %res1, 1000000000
+ %sel1 = select i1 %cmp1, i32 0, i32 1
+ %res2 = call i32 asm "stepb $0, $1", "=h,r"(i32 %sel1)
+ %cmp2 = icmp ule i32 %res2, 1000000000
+ %sel2 = select i1 %cmp2, i32 0, i32 1
+ ret i32 %sel2
+}
+
+; Test large immediate comparison involving low registers.
+define i32 @f32() {
+; CHECK-LABEL: f32:
+; CHECK: stepa [[REG1:%r[0-5]]]
+; CHECK: cfi [[REG1]], 1000000000
+; CHECK: stepb [[REG2:%r[0-5]]]
+; CHECK: clfi [[REG2]], 1000000000
+; CHECK: br %r14
+ %res1 = call i32 asm "stepa $0", "=r"()
+ %cmp1 = icmp sle i32 %res1, 1000000000
+ %sel1 = select i1 %cmp1, i32 0, i32 1
+ %res2 = call i32 asm "stepb $0, $1", "=r,r"(i32 %sel1)
+ %cmp2 = icmp ule i32 %res2, 1000000000
+ %sel2 = select i1 %cmp2, i32 0, i32 1
+ ret i32 %sel2
+}
+
+; Test memory comparison involving high registers.
+define void @f33(i32 *%ptr1, i32 *%ptr2) {
+; CHECK-LABEL: f33:
+; CHECK: stepa [[REG1:%r[0-5]]]
+; CHECK: chf [[REG1]], 0(%r2)
+; CHECK: stepb [[REG2:%r[0-5]]]
+; CHECK: clhf [[REG2]], 0(%r3)
+; CHECK: br %r14
+ %res1 = call i32 asm "stepa $0", "=h"()
+ %load1 = load i32 *%ptr1
+ %cmp1 = icmp sle i32 %res1, %load1
+ %sel1 = select i1 %cmp1, i32 0, i32 1
+ %res2 = call i32 asm "stepb $0, $1", "=h,r"(i32 %sel1)
+ %load2 = load i32 *%ptr2
+ %cmp2 = icmp ule i32 %res2, %load2
+ %sel2 = select i1 %cmp2, i32 0, i32 1
+ store i32 %sel2, i32 *%ptr1
+ ret void
+}
+
+; Test memory comparison involving low registers.
+define void @f34(i32 *%ptr1, i32 *%ptr2) {
+; CHECK-LABEL: f34:
+; CHECK: stepa [[REG1:%r[0-5]]]
+; CHECK: c [[REG1]], 0(%r2)
+; CHECK: stepb [[REG2:%r[0-5]]]
+; CHECK: cl [[REG2]], 0(%r3)
+; CHECK: br %r14
+ %res1 = call i32 asm "stepa $0", "=r"()
+ %load1 = load i32 *%ptr1
+ %cmp1 = icmp sle i32 %res1, %load1
+ %sel1 = select i1 %cmp1, i32 0, i32 1
+ %res2 = call i32 asm "stepb $0, $1", "=r,r"(i32 %sel1)
+ %load2 = load i32 *%ptr2
+ %cmp2 = icmp ule i32 %res2, %load2
+ %sel2 = select i1 %cmp2, i32 0, i32 1
+ store i32 %sel2, i32 *%ptr1
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/atomicrmw-minmax-01.ll b/test/CodeGen/SystemZ/atomicrmw-minmax-01.ll
index a15fe57..2b750c4 100644
--- a/test/CodeGen/SystemZ/atomicrmw-minmax-01.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-minmax-01.ll
@@ -91,8 +91,7 @@ define i8 @f3(i8 *%src, i8 %b) {
; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
; CHECK: [[LOOP:\.[^:]*]]:
; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
-; CHECK: clr [[ROT]], %r3
-; CHECK: jle [[KEEP:\..*]]
+; CHECK: clrjle [[ROT]], %r3, [[KEEP:\..*]]
; CHECK: risbg [[ROT]], %r3, 32, 39, 0
; CHECK: [[KEEP]]:
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
@@ -112,7 +111,7 @@ define i8 @f3(i8 *%src, i8 %b) {
; CHECK-SHIFT2-LABEL: f3:
; CHECK-SHIFT2: sll %r3, 24
; CHECK-SHIFT2: rll
-; CHECK-SHIFT2: clr {{%r[0-9]+}}, %r3
+; CHECK-SHIFT2: clrjle {{%r[0-9]+}}, %r3,
; CHECK-SHIFT2: rll
; CHECK-SHIFT2: rll
; CHECK-SHIFT2: br %r14
@@ -128,8 +127,7 @@ define i8 @f4(i8 *%src, i8 %b) {
; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
; CHECK: [[LOOP:\.[^:]*]]:
; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
-; CHECK: clr [[ROT]], %r3
-; CHECK: jhe [[KEEP:\..*]]
+; CHECK: clrjhe [[ROT]], %r3, [[KEEP:\..*]]
; CHECK: risbg [[ROT]], %r3, 32, 39, 0
; CHECK: [[KEEP]]:
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
@@ -149,7 +147,7 @@ define i8 @f4(i8 *%src, i8 %b) {
; CHECK-SHIFT2-LABEL: f4:
; CHECK-SHIFT2: sll %r3, 24
; CHECK-SHIFT2: rll
-; CHECK-SHIFT2: clr {{%r[0-9]+}}, %r3
+; CHECK-SHIFT2: clrjhe {{%r[0-9]+}}, %r3,
; CHECK-SHIFT2: rll
; CHECK-SHIFT2: rll
; CHECK-SHIFT2: br %r14
@@ -196,7 +194,7 @@ define i8 @f6(i8 *%src) {
define i8 @f7(i8 *%src) {
; CHECK-LABEL: f7:
; CHECK: llilh [[SRC2:%r[0-9]+]], 256
-; CHECK: clr [[ROT:%r[0-9]+]], [[SRC2]]
+; CHECK: clrjle [[ROT:%r[0-9]+]], [[SRC2]],
; CHECK: risbg [[ROT]], [[SRC2]], 32, 39, 0
; CHECK: br %r14
;
@@ -213,7 +211,7 @@ define i8 @f7(i8 *%src) {
define i8 @f8(i8 *%src) {
; CHECK-LABEL: f8:
; CHECK: llilh [[SRC2:%r[0-9]+]], 65024
-; CHECK: clr [[ROT:%r[0-9]+]], [[SRC2]]
+; CHECK: clrjhe [[ROT:%r[0-9]+]], [[SRC2]],
; CHECK: risbg [[ROT]], [[SRC2]], 32, 39, 0
; CHECK: br %r14
;
diff --git a/test/CodeGen/SystemZ/atomicrmw-minmax-02.ll b/test/CodeGen/SystemZ/atomicrmw-minmax-02.ll
index c0ae883..98ffedf 100644
--- a/test/CodeGen/SystemZ/atomicrmw-minmax-02.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-minmax-02.ll
@@ -91,8 +91,7 @@ define i16 @f3(i16 *%src, i16 %b) {
; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
; CHECK: [[LOOP:\.[^:]*]]:
; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
-; CHECK: clr [[ROT]], %r3
-; CHECK: jle [[KEEP:\..*]]
+; CHECK: clrjle [[ROT]], %r3, [[KEEP:\..*]]
; CHECK: risbg [[ROT]], %r3, 32, 47, 0
; CHECK: [[KEEP]]:
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
@@ -112,7 +111,7 @@ define i16 @f3(i16 *%src, i16 %b) {
; CHECK-SHIFT2-LABEL: f3:
; CHECK-SHIFT2: sll %r3, 16
; CHECK-SHIFT2: rll
-; CHECK-SHIFT2: clr {{%r[0-9]+}}, %r3
+; CHECK-SHIFT2: clrjle {{%r[0-9]+}}, %r3,
; CHECK-SHIFT2: rll
; CHECK-SHIFT2: rll
; CHECK-SHIFT2: br %r14
@@ -128,8 +127,7 @@ define i16 @f4(i16 *%src, i16 %b) {
; CHECK: l [[OLD:%r[0-9]+]], 0(%r2)
; CHECK: [[LOOP:\.[^:]*]]:
; CHECK: rll [[ROT:%r[0-9]+]], [[OLD]], 0([[SHIFT]])
-; CHECK: clr [[ROT]], %r3
-; CHECK: jhe [[KEEP:\..*]]
+; CHECK: clrjhe [[ROT]], %r3, [[KEEP:\..*]]
; CHECK: risbg [[ROT]], %r3, 32, 47, 0
; CHECK: [[KEEP]]:
; CHECK: rll [[NEW:%r[0-9]+]], [[ROT]], 0({{%r[1-9]+}})
@@ -149,7 +147,7 @@ define i16 @f4(i16 *%src, i16 %b) {
; CHECK-SHIFT2-LABEL: f4:
; CHECK-SHIFT2: sll %r3, 16
; CHECK-SHIFT2: rll
-; CHECK-SHIFT2: clr {{%r[0-9]+}}, %r3
+; CHECK-SHIFT2: clrjhe {{%r[0-9]+}}, %r3,
; CHECK-SHIFT2: rll
; CHECK-SHIFT2: rll
; CHECK-SHIFT2: br %r14
@@ -196,7 +194,7 @@ define i16 @f6(i16 *%src) {
define i16 @f7(i16 *%src) {
; CHECK-LABEL: f7:
; CHECK: llilh [[SRC2:%r[0-9]+]], 1
-; CHECK: clr [[ROT:%r[0-9]+]], [[SRC2]]
+; CHECK: clrjle [[ROT:%r[0-9]+]], [[SRC2]],
; CHECK: risbg [[ROT]], [[SRC2]], 32, 47, 0
; CHECK: br %r14
;
@@ -213,7 +211,7 @@ define i16 @f7(i16 *%src) {
define i16 @f8(i16 *%src) {
; CHECK-LABEL: f8:
; CHECK: llilh [[SRC2:%r[0-9]+]], 65534
-; CHECK: clr [[ROT:%r[0-9]+]], [[SRC2]]
+; CHECK: clrjhe [[ROT:%r[0-9]+]], [[SRC2]],
; CHECK: risbg [[ROT]], [[SRC2]], 32, 47, 0
; CHECK: br %r14
;
diff --git a/test/CodeGen/SystemZ/atomicrmw-minmax-03.ll b/test/CodeGen/SystemZ/atomicrmw-minmax-03.ll
index 3a9485a..f2152c6 100644
--- a/test/CodeGen/SystemZ/atomicrmw-minmax-03.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-minmax-03.ll
@@ -1,6 +1,7 @@
-; Test 32-bit atomic minimum and maximum.
+; Test 32-bit atomic minimum and maximum. Here we match the z10 versions,
+; which can't use LOCR.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
; Check signed minimum.
define i32 @f1(i32 %dummy, i32 *%src, i32 %b) {
@@ -37,9 +38,8 @@ define i32 @f3(i32 %dummy, i32 *%src, i32 %b) {
; CHECK-LABEL: f3:
; CHECK: l %r2, 0(%r3)
; CHECK: [[LOOP:\.[^:]*]]:
-; CHECK: clr %r2, %r4
; CHECK: lr [[NEW:%r[0-9]+]], %r2
-; CHECK: jle [[KEEP:\..*]]
+; CHECK: clrjle %r2, %r4, [[KEEP:\..*]]
; CHECK: lr [[NEW]], %r4
; CHECK: cs %r2, [[NEW]], 0(%r3)
; CHECK: jl [[LOOP]]
@@ -53,9 +53,8 @@ define i32 @f4(i32 %dummy, i32 *%src, i32 %b) {
; CHECK-LABEL: f4:
; CHECK: l %r2, 0(%r3)
; CHECK: [[LOOP:\.[^:]*]]:
-; CHECK: clr %r2, %r4
; CHECK: lr [[NEW:%r[0-9]+]], %r2
-; CHECK: jhe [[KEEP:\..*]]
+; CHECK: clrjhe %r2, %r4, [[KEEP:\..*]]
; CHECK: lr [[NEW]], %r4
; CHECK: cs %r2, [[NEW]], 0(%r3)
; CHECK: jl [[LOOP]]
diff --git a/test/CodeGen/SystemZ/atomicrmw-minmax-04.ll b/test/CodeGen/SystemZ/atomicrmw-minmax-04.ll
index ebed147..037eb1a 100644
--- a/test/CodeGen/SystemZ/atomicrmw-minmax-04.ll
+++ b/test/CodeGen/SystemZ/atomicrmw-minmax-04.ll
@@ -1,6 +1,7 @@
-; Test 64-bit atomic minimum and maximum.
+; Test 64-bit atomic minimum and maximum. Here we match the z10 versions,
+; which can't use LOCGR.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
; Check signed minimum.
define i64 @f1(i64 %dummy, i64 *%src, i64 %b) {
@@ -37,9 +38,8 @@ define i64 @f3(i64 %dummy, i64 *%src, i64 %b) {
; CHECK-LABEL: f3:
; CHECK: lg %r2, 0(%r3)
; CHECK: [[LOOP:\.[^:]*]]:
-; CHECK: clgr %r2, %r4
; CHECK: lgr [[NEW:%r[0-9]+]], %r2
-; CHECK: jle [[KEEP:\..*]]
+; CHECK: clgrjle %r2, %r4, [[KEEP:\..*]]
; CHECK: lgr [[NEW]], %r4
; CHECK: csg %r2, [[NEW]], 0(%r3)
; CHECK: jl [[LOOP]]
@@ -53,9 +53,8 @@ define i64 @f4(i64 %dummy, i64 *%src, i64 %b) {
; CHECK-LABEL: f4:
; CHECK: lg %r2, 0(%r3)
; CHECK: [[LOOP:\.[^:]*]]:
-; CHECK: clgr %r2, %r4
; CHECK: lgr [[NEW:%r[0-9]+]], %r2
-; CHECK: jhe [[KEEP:\..*]]
+; CHECK: clgrjhe %r2, %r4, [[KEEP:\..*]]
; CHECK: lgr [[NEW]], %r4
; CHECK: csg %r2, [[NEW]], 0(%r3)
; CHECK: jl [[LOOP]]
diff --git a/test/CodeGen/SystemZ/branch-05.ll b/test/CodeGen/SystemZ/branch-05.ll
index d657c9b..b2157b5 100644
--- a/test/CodeGen/SystemZ/branch-05.ll
+++ b/test/CodeGen/SystemZ/branch-05.ll
@@ -5,8 +5,7 @@
define i32 @f1(i32 %x, i32 %y, i32 %op) {
; CHECK-LABEL: f1:
; CHECK: ahi %r4, -1
-; CHECK: clfi %r4, 5
-; CHECK-NEXT: jh
+; CHECK: clijh %r4, 5,
; CHECK: llgfr [[OP64:%r[0-5]]], %r4
; CHECK: sllg [[INDEX:%r[1-5]]], [[OP64]], 3
; CHECK: larl [[BASE:%r[1-5]]]
diff --git a/test/CodeGen/SystemZ/branch-06.ll b/test/CodeGen/SystemZ/branch-06.ll
index 13e5a84..2fa23b7 100644
--- a/test/CodeGen/SystemZ/branch-06.ll
+++ b/test/CodeGen/SystemZ/branch-06.ll
@@ -3,6 +3,7 @@
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
declare i32 @foo()
+@g1 = global i16 0
define void @f1(i32 %target) {
; CHECK-LABEL: f1:
@@ -87,3 +88,103 @@ loop:
exit:
ret void
}
+
+; Check that CRJ is used for checking equality with a zero-extending
+; character load.
+define void @f7(i8 *%targetptr) {
+; CHECK-LABEL: f7:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: llc [[REG:%r[0-5]]],
+; CHECK: crje %r2, [[REG]], .L[[LABEL]]
+ br label %loop
+loop:
+ %val = call i32 @foo()
+ %byte = load i8 *%targetptr
+ %target = zext i8 %byte to i32
+ %cond = icmp eq i32 %val, %target
+ br i1 %cond, label %loop, label %exit
+exit:
+ ret void
+}
+
+; ...and zero-extending i16 loads.
+define void @f8(i16 *%targetptr) {
+; CHECK-LABEL: f8:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: llh [[REG:%r[0-5]]],
+; CHECK: crje %r2, [[REG]], .L[[LABEL]]
+ br label %loop
+loop:
+ %val = call i32 @foo()
+ %half = load i16 *%targetptr
+ %target = zext i16 %half to i32
+ %cond = icmp eq i32 %val, %target
+ br i1 %cond, label %loop, label %exit
+exit:
+ ret void
+}
+
+; ...unless the address is a global.
+define void @f9(i16 *%targetptr) {
+; CHECK-LABEL: f9:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: clhrl %r2, g1
+; CHECK: je .L[[LABEL]]
+ br label %loop
+loop:
+ %val = call i32 @foo()
+ %half = load i16 *@g1
+ %target = zext i16 %half to i32
+ %cond = icmp eq i32 %val, %target
+ br i1 %cond, label %loop, label %exit
+exit:
+ ret void
+}
+
+; Check that CRJ is used for checking order between two zero-extending
+; byte loads, even if the original comparison was unsigned.
+define void @f10(i8 *%targetptr1) {
+; CHECK-LABEL: f10:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK-DAG: llc [[REG1:%r[0-5]]], 0(
+; CHECK-DAG: llc [[REG2:%r[0-5]]], 1(
+; CHECK: crjl [[REG1]], [[REG2]], .L[[LABEL]]
+ br label %loop
+loop:
+ %val = call i32 @foo()
+ %targetptr2 = getelementptr i8 *%targetptr1, i64 1
+ %byte1 = load i8 *%targetptr1
+ %byte2 = load i8 *%targetptr2
+ %ext1 = zext i8 %byte1 to i32
+ %ext2 = zext i8 %byte2 to i32
+ %cond = icmp ult i32 %ext1, %ext2
+ br i1 %cond, label %loop, label %exit
+exit:
+ ret void
+}
+
+; ...likewise halfword loads.
+define void @f11(i16 *%targetptr1) {
+; CHECK-LABEL: f11:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK-DAG: llh [[REG1:%r[0-5]]], 0(
+; CHECK-DAG: llh [[REG2:%r[0-5]]], 2(
+; CHECK: crjl [[REG1]], [[REG2]], .L[[LABEL]]
+ br label %loop
+loop:
+ %val = call i32 @foo()
+ %targetptr2 = getelementptr i16 *%targetptr1, i64 1
+ %half1 = load i16 *%targetptr1
+ %half2 = load i16 *%targetptr2
+ %ext1 = zext i16 %half1 to i32
+ %ext2 = zext i16 %half2 to i32
+ %cond = icmp ult i32 %ext1, %ext2
+ br i1 %cond, label %loop, label %exit
+exit:
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/branch-07.ll b/test/CodeGen/SystemZ/branch-07.ll
index b715a05..bac6071 100644
--- a/test/CodeGen/SystemZ/branch-07.ll
+++ b/test/CodeGen/SystemZ/branch-07.ll
@@ -97,10 +97,9 @@ exit:
; Test a vector of 0/-1 results for i32 EQ.
define i64 @f7(i64 %a, i64 %b) {
; CHECK-LABEL: f7:
-; CHECK: lhi [[REG:%r[0-5]]], -1
-; CHECK: crje {{%r[0-5]}}
-; CHECK: lhi [[REG]], 0
-; CHECK-NOT: sra
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: afi [[REG]], -268435456
+; CHECK: sra [[REG]], 31
; CHECK: br %r14
%avec = bitcast i64 %a to <2 x i32>
%bvec = bitcast i64 %b to <2 x i32>
@@ -113,10 +112,9 @@ define i64 @f7(i64 %a, i64 %b) {
; Test a vector of 0/-1 results for i32 NE.
define i64 @f8(i64 %a, i64 %b) {
; CHECK-LABEL: f8:
-; CHECK: lhi [[REG:%r[0-5]]], -1
-; CHECK: crjlh {{%r[0-5]}}
-; CHECK: lhi [[REG]], 0
-; CHECK-NOT: sra
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: afi [[REG]], 1879048192
+; CHECK: sra [[REG]], 31
; CHECK: br %r14
%avec = bitcast i64 %a to <2 x i32>
%bvec = bitcast i64 %b to <2 x i32>
@@ -129,10 +127,10 @@ define i64 @f8(i64 %a, i64 %b) {
; Test a vector of 0/-1 results for i64 EQ.
define void @f9(i64 %a, i64 %b, <2 x i64> *%dest) {
; CHECK-LABEL: f9:
-; CHECK: lghi [[REG:%r[0-5]]], -1
-; CHECK: crje {{%r[0-5]}}
-; CHECK: lghi [[REG]], 0
-; CHECK-NOT: sra
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: afi [[REG]], -268435456
+; CHECK: sllg [[REG2:%r[0-5]]], [[REG]], 32
+; CHECK: srag {{%r[0-5]}}, [[REG2]], 63
; CHECK: br %r14
%avec = bitcast i64 %a to <2 x i32>
%bvec = bitcast i64 %b to <2 x i32>
@@ -145,10 +143,10 @@ define void @f9(i64 %a, i64 %b, <2 x i64> *%dest) {
; Test a vector of 0/-1 results for i64 NE.
define void @f10(i64 %a, i64 %b, <2 x i64> *%dest) {
; CHECK-LABEL: f10:
-; CHECK: lghi [[REG:%r[0-5]]], -1
-; CHECK: crjlh {{%r[0-5]}}
-; CHECK: lghi [[REG]], 0
-; CHECK-NOT: sra
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: afi [[REG]], 1879048192
+; CHECK: sllg [[REG2:%r[0-5]]], [[REG]], 32
+; CHECK: srag {{%r[0-5]}}, [[REG2]], 63
; CHECK: br %r14
%avec = bitcast i64 %a to <2 x i32>
%bvec = bitcast i64 %b to <2 x i32>
diff --git a/test/CodeGen/SystemZ/branch-08.ll b/test/CodeGen/SystemZ/branch-08.ll
index c4dc467..6741d29 100644
--- a/test/CodeGen/SystemZ/branch-08.ll
+++ b/test/CodeGen/SystemZ/branch-08.ll
@@ -6,14 +6,15 @@ declare void @foo() noreturn
; Check a case where a separate branch is needed and where the original
; order should be reversed.
-define i32 @f1(i32 %a, i32 %b) {
+define i32 @f1(i32 %a, i32 *%bptr) {
; CHECK-LABEL: f1:
-; CHECK: clr %r2, %r3
+; CHECK: cl %r2, 0(%r3)
; CHECK: jl .L[[LABEL:.*]]
; CHECK: br %r14
; CHECK: .L[[LABEL]]:
; CHECK: brasl %r14, foo@PLT
entry:
+ %b = load i32 *%bptr
%cmp = icmp ult i32 %a, %b
br i1 %cmp, label %callit, label %return
diff --git a/test/CodeGen/SystemZ/branch-09.ll b/test/CodeGen/SystemZ/branch-09.ll
new file mode 100644
index 0000000..5591f5b
--- /dev/null
+++ b/test/CodeGen/SystemZ/branch-09.ll
@@ -0,0 +1,62 @@
+; Test all condition-code masks that are relevant for CLRJ.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i32 @foo()
+@g1 = global i16 0
+
+define void @f1(i32 %target) {
+; CHECK-LABEL: f1:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: clrjle %r2, {{%r[0-9]+}}, .L[[LABEL]]
+ br label %loop
+loop:
+ %val = call i32 @foo()
+ %cond = icmp ule i32 %val, %target
+ br i1 %cond, label %loop, label %exit
+exit:
+ ret void
+}
+
+define void @f2(i32 %target) {
+; CHECK-LABEL: f2:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: clrjl %r2, {{%r[0-9]+}}, .L[[LABEL]]
+ br label %loop
+loop:
+ %val = call i32 @foo()
+ %cond = icmp ult i32 %val, %target
+ br i1 %cond, label %loop, label %exit
+exit:
+ ret void
+}
+
+define void @f3(i32 %target) {
+; CHECK-LABEL: f3:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: clrjh %r2, {{%r[0-9]+}}, .L[[LABEL]]
+ br label %loop
+loop:
+ %val = call i32 @foo()
+ %cond = icmp ugt i32 %val, %target
+ br i1 %cond, label %loop, label %exit
+exit:
+ ret void
+}
+
+define void @f4(i32 %target) {
+; CHECK-LABEL: f4:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: clrjhe %r2, {{%r[0-9]+}}, .L[[LABEL]]
+ br label %loop
+loop:
+ %val = call i32 @foo()
+ %cond = icmp uge i32 %val, %target
+ br i1 %cond, label %loop, label %exit
+exit:
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/branch-10.ll b/test/CodeGen/SystemZ/branch-10.ll
new file mode 100644
index 0000000..ec6e759
--- /dev/null
+++ b/test/CodeGen/SystemZ/branch-10.ll
@@ -0,0 +1,62 @@
+; Test all condition-code masks that are relevant for CLGRJ.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i64 @foo()
+@g1 = global i16 0
+
+define void @f1(i64 %target) {
+; CHECK-LABEL: f1:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: clgrjle %r2, {{%r[0-9]+}}, .L[[LABEL]]
+ br label %loop
+loop:
+ %val = call i64 @foo()
+ %cond = icmp ule i64 %val, %target
+ br i1 %cond, label %loop, label %exit
+exit:
+ ret void
+}
+
+define void @f2(i64 %target) {
+; CHECK-LABEL: f2:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: clgrjl %r2, {{%r[0-9]+}}, .L[[LABEL]]
+ br label %loop
+loop:
+ %val = call i64 @foo()
+ %cond = icmp ult i64 %val, %target
+ br i1 %cond, label %loop, label %exit
+exit:
+ ret void
+}
+
+define void @f3(i64 %target) {
+; CHECK-LABEL: f3:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: clgrjh %r2, {{%r[0-9]+}}, .L[[LABEL]]
+ br label %loop
+loop:
+ %val = call i64 @foo()
+ %cond = icmp ugt i64 %val, %target
+ br i1 %cond, label %loop, label %exit
+exit:
+ ret void
+}
+
+define void @f4(i64 %target) {
+; CHECK-LABEL: f4:
+; CHECK: .cfi_def_cfa_offset
+; CHECK: .L[[LABEL:.*]]:
+; CHECK: clgrjhe %r2, {{%r[0-9]+}}, .L[[LABEL]]
+ br label %loop
+loop:
+ %val = call i64 @foo()
+ %cond = icmp uge i64 %val, %target
+ br i1 %cond, label %loop, label %exit
+exit:
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/call-03.ll b/test/CodeGen/SystemZ/call-03.ll
new file mode 100644
index 0000000..1f314ea
--- /dev/null
+++ b/test/CodeGen/SystemZ/call-03.ll
@@ -0,0 +1,125 @@
+; Test sibling calls.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare void @ok(i8 %r2, i16 %r3, i32 %r4, i64 %r5, float %f0, double %f2,
+ float %f4, double %f6)
+declare void @uses_r6(i8 %r2, i16 %r3, i32 %r4, i64 %r5, i64 %r6)
+declare void @uses_indirect(fp128 %r2)
+declare void @uses_stack(float %f0, float %f2, float %f4, float %f6,
+ float %stack)
+declare i32 @returns_i32()
+declare i64 @returns_i64()
+
+; Check the maximum number of arguments that we can pass and still use
+; a sibling call.
+define void @f1() {
+; CHECK-LABEL: f1:
+; CHECK-DAG: lzer %f0
+; CHECK-DAG: lzdr %f2
+; CHECK-DAG: lhi %r2, 1
+; CHECK-DAG: lhi %r3, 2
+; CHECK-DAG: lhi %r4, 3
+; CHECK-DAG: lghi %r5, 4
+; CHECK-DAG: {{ler %f4, %f0|lzer %f4}}
+; CHECK-DAG: {{ldr %f6, %f2|lzdr %f6}}
+; CHECK: jg ok@PLT
+ tail call void @ok(i8 1, i16 2, i32 3, i64 4, float 0.0, double 0.0,
+ float 0.0, double 0.0)
+ ret void
+}
+
+; Check a call that uses %r6 to pass an argument. At the moment we don't
+; use sibling calls in that case.
+define void @f2() {
+; CHECK-LABEL: f2:
+; CHECK: brasl %r14, uses_r6@PLT
+; CHECK: br %r14
+ tail call void @uses_r6(i8 1, i16 2, i32 3, i64 4, i64 5)
+ ret void
+}
+
+; Check a call that passes indirect arguments. We can't use sibling
+; calls in that case.
+define void @f3() {
+; CHECK-LABEL: f3:
+; CHECK: brasl %r14, uses_indirect@PLT
+; CHECK: br %r14
+ tail call void @uses_indirect(fp128 0xL00000000000000000000000000000000)
+ ret void
+}
+
+; Check a call that uses direct stack arguments, which again prevents
+; sibling calls.
+define void @f4() {
+; CHECK-LABEL: f4:
+; CHECK: brasl %r14, uses_stack@PLT
+; CHECK: br %r14
+ tail call void @uses_stack(float 0.0, float 0.0, float 0.0, float 0.0,
+ float 0.0)
+ ret void
+}
+
+; Check an indirect call. In this case the only acceptable choice for
+; the target register is %r1.
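+; (%r2-%r5 hold the outgoing arguments, %r14 and %r15 are needed for the
+; return address and stack pointer, and BR cannot branch through %r0, so
+; %r1 is the only call-clobbered GPR left.)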
+define void @f5(void(i32, i32, i32, i32) *%foo) {
+; CHECK-LABEL: f5:
+; CHECK: lgr %r1, %r2
+; CHECK-DAG: lhi %r2, 1
+; CHECK-DAG: lhi %r3, 2
+; CHECK-DAG: lhi %r4, 3
+; CHECK-DAG: lhi %r5, 4
+; CHECK: br %r1
+ tail call void %foo(i32 1, i32 2, i32 3, i32 4)
+ ret void
+}
+
+; Check an indirect call that will be forced into a call-saved GPR
+; (which should be %r13, the highest GPR not used for anything else).
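+; (Note that the registers are restored from offset 264 rather than 104,
+; presumably because the prologue drops %r15 by the 160-byte frame size
+; between the STMG and the LMG: 104 + 160 == 264.)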
+define void @f6(void(i32) *%foo) {
+; CHECK-LABEL: f6:
+; CHECK: stmg %r13, %r15, 104(%r15)
+; CHECK: lgr %r13, %r2
+; CHECK: brasl %r14, returns_i32
+; CHECK: lgr %r1, %r13
+; CHECK: lmg %r13, %r15, 264(%r15)
+; CHECK: br %r1
+ %arg = call i32 @returns_i32()
+ tail call void %foo(i32 %arg)
+ ret void
+}
+
+; Test a function that returns a value.
+define i64 @f7() {
+; CHECK-LABEL: f7:
+; CHECK: jg returns_i64@PLT
+ %res = tail call i64 @returns_i64()
+ ret i64 %res
+}
+
+; Test a function that returns a value truncated from i64 to i32.
+define i32 @f8() {
+; CHECK-LABEL: f8:
+; CHECK: jg returns_i64@PLT
+ %res = tail call i64 @returns_i64()
+ %trunc = trunc i64 %res to i32
+ ret i32 %trunc
+}
+
+; Test a function that returns a value truncated from i64 to i7.
+define i7 @f9() {
+; CHECK-LABEL: f9:
+; CHECK: jg returns_i64@PLT
+ %res = tail call i64 @returns_i64()
+ %trunc = trunc i64 %res to i7
+ ret i7 %trunc
+}
+
+; Test a function that returns a value truncated from i32 to i8.
+define i8 @f10() {
+; CHECK-LABEL: f10:
+; CHECK: jg returns_i32@PLT
+ %res = tail call i32 @returns_i32()
+ %trunc = trunc i32 %res to i8
+ ret i8 %trunc
+}
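
For readers skimming the tests above, here is a rough source-level sketch (hypothetical C, not part of the test suite) of the sibling-call pattern they exercise. When a call sits in tail position and all of its arguments fit in the normal argument registers, the SystemZ backend can branch straight to the callee with jg instead of linking through %r14:

extern long returns_i64(void);

/* Hypothetical illustration: the call is in tail position and passes nothing
   on the stack, in %r6, or indirectly, so at -O2 the s390x backend may emit
   it as a sibling call ("jg returns_i64@PLT") with no frame of its own. */
long forward_i64(void)
{
  return returns_i64();
}

Tests f2-f4 above show the three cases (an argument in %r6, an indirect fp128 argument, and stack arguments) that currently force a normal brasl call instead.
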
diff --git a/test/CodeGen/SystemZ/cond-store-01.ll b/test/CodeGen/SystemZ/cond-store-01.ll
index 80e6d91..d55ea21 100644
--- a/test/CodeGen/SystemZ/cond-store-01.ll
+++ b/test/CodeGen/SystemZ/cond-store-01.ll
@@ -1,6 +1,7 @@
-; Test 8-bit conditional stores that are presented as selects.
+; Test 8-bit conditional stores that are presented as selects. The volatile
+; tests require z10, which uses a branch instead of a LOCR.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
declare void @foo(i8 *)
@@ -13,7 +14,7 @@ define void @f1(i8 *%ptr, i8 %alt, i32 %limit) {
; CHECK: stc %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i8 *%ptr
%res = select i1 %cond, i8 %orig, i8 %alt
store i8 %res, i8 *%ptr
@@ -29,7 +30,7 @@ define void @f2(i8 *%ptr, i8 %alt, i32 %limit) {
; CHECK: stc %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i8 *%ptr
%res = select i1 %cond, i8 %alt, i8 %orig
store i8 %res, i8 *%ptr
@@ -46,7 +47,7 @@ define void @f3(i8 *%ptr, i32 %alt, i32 %limit) {
; CHECK: stc %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i8 *%ptr
%ext = sext i8 %orig to i32
%res = select i1 %cond, i32 %ext, i32 %alt
@@ -64,7 +65,7 @@ define void @f4(i8 *%ptr, i32 %alt, i32 %limit) {
; CHECK: stc %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i8 *%ptr
%ext = sext i8 %orig to i32
%res = select i1 %cond, i32 %alt, i32 %ext
@@ -83,7 +84,7 @@ define void @f5(i8 *%ptr, i32 %alt, i32 %limit) {
; CHECK: stc %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i8 *%ptr
%ext = zext i8 %orig to i32
%res = select i1 %cond, i32 %ext, i32 %alt
@@ -101,7 +102,7 @@ define void @f6(i8 *%ptr, i32 %alt, i32 %limit) {
; CHECK: stc %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i8 *%ptr
%ext = zext i8 %orig to i32
%res = select i1 %cond, i32 %alt, i32 %ext
@@ -120,7 +121,7 @@ define void @f7(i8 *%ptr, i64 %alt, i32 %limit) {
; CHECK: stc %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i8 *%ptr
%ext = sext i8 %orig to i64
%res = select i1 %cond, i64 %ext, i64 %alt
@@ -138,7 +139,7 @@ define void @f8(i8 *%ptr, i64 %alt, i32 %limit) {
; CHECK: stc %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i8 *%ptr
%ext = sext i8 %orig to i64
%res = select i1 %cond, i64 %alt, i64 %ext
@@ -157,7 +158,7 @@ define void @f9(i8 *%ptr, i64 %alt, i32 %limit) {
; CHECK: stc %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i8 *%ptr
%ext = zext i8 %orig to i64
%res = select i1 %cond, i64 %ext, i64 %alt
@@ -175,7 +176,7 @@ define void @f10(i8 *%ptr, i64 %alt, i32 %limit) {
; CHECK: stc %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i8 *%ptr
%ext = zext i8 %orig to i64
%res = select i1 %cond, i64 %alt, i64 %ext
@@ -194,7 +195,7 @@ define void @f11(i8 *%base, i8 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i8 *%base, i64 4095
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i8 *%ptr
%res = select i1 %cond, i8 %orig, i8 %alt
store i8 %res, i8 *%ptr
@@ -211,7 +212,7 @@ define void @f12(i8 *%base, i8 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i8 *%base, i64 4096
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i8 *%ptr
%res = select i1 %cond, i8 %orig, i8 %alt
store i8 %res, i8 *%ptr
@@ -228,7 +229,7 @@ define void @f13(i8 *%base, i8 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i8 *%base, i64 524287
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i8 *%ptr
%res = select i1 %cond, i8 %orig, i8 %alt
store i8 %res, i8 *%ptr
@@ -247,7 +248,7 @@ define void @f14(i8 *%base, i8 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i8 *%base, i64 524288
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i8 *%ptr
%res = select i1 %cond, i8 %orig, i8 %alt
store i8 %res, i8 *%ptr
@@ -264,7 +265,7 @@ define void @f15(i8 *%base, i8 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i8 *%base, i64 -524288
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i8 *%ptr
%res = select i1 %cond, i8 %orig, i8 %alt
store i8 %res, i8 *%ptr
@@ -283,7 +284,7 @@ define void @f16(i8 *%base, i8 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i8 *%base, i64 -524289
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i8 *%ptr
%res = select i1 %cond, i8 %orig, i8 %alt
store i8 %res, i8 *%ptr
@@ -302,7 +303,7 @@ define void @f17(i64 %base, i64 %index, i8 %alt, i32 %limit) {
%add1 = add i64 %base, %index
%add2 = add i64 %add1, 4096
%ptr = inttoptr i64 %add2 to i8 *
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i8 *%ptr
%res = select i1 %cond, i8 %orig, i8 %alt
store i8 %res, i8 *%ptr
@@ -317,7 +318,7 @@ define void @f18(i8 *%ptr, i8 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: stc {{%r[0-5]}}, 0(%r2)
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load volatile i8 *%ptr
%res = select i1 %cond, i8 %orig, i8 %alt
store i8 %res, i8 *%ptr
@@ -332,7 +333,7 @@ define void @f19(i8 *%ptr, i8 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: stc %r3, 0(%r2)
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i8 *%ptr
%res = select i1 %cond, i8 %orig, i8 %alt
store volatile i8 %res, i8 *%ptr
@@ -352,7 +353,7 @@ define void @f20(i8 *%ptr, i8 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: stc {{%r[0-9]+}},
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load atomic i8 *%ptr unordered, align 1
%res = select i1 %cond, i8 %orig, i8 %alt
store i8 %res, i8 *%ptr
@@ -368,7 +369,7 @@ define void @f21(i8 *%ptr, i8 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: cs {{%r[0-9]+}},
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i8 *%ptr
%res = select i1 %cond, i8 %orig, i8 %alt
store atomic i8 %res, i8 *%ptr unordered, align 1
@@ -388,7 +389,7 @@ define void @f22(i8 %alt, i32 %limit) {
; CHECK: br %r14
%ptr = alloca i8
call void @foo(i8 *%ptr)
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i8 *%ptr
%res = select i1 %cond, i8 %orig, i8 %alt
store i8 %res, i8 *%ptr
diff --git a/test/CodeGen/SystemZ/cond-store-02.ll b/test/CodeGen/SystemZ/cond-store-02.ll
index e01a853..91bc486 100644
--- a/test/CodeGen/SystemZ/cond-store-02.ll
+++ b/test/CodeGen/SystemZ/cond-store-02.ll
@@ -1,6 +1,7 @@
-; Test 16-bit conditional stores that are presented as selects.
+; Test 16-bit conditional stores that are presented as selects. The volatile
+; tests require z10, which uses a branch instead of a LOCR.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
declare void @foo(i16 *)
@@ -13,7 +14,7 @@ define void @f1(i16 *%ptr, i16 %alt, i32 %limit) {
; CHECK: sth %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i16 *%ptr
%res = select i1 %cond, i16 %orig, i16 %alt
store i16 %res, i16 *%ptr
@@ -29,7 +30,7 @@ define void @f2(i16 *%ptr, i16 %alt, i32 %limit) {
; CHECK: sth %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i16 *%ptr
%res = select i1 %cond, i16 %alt, i16 %orig
store i16 %res, i16 *%ptr
@@ -46,7 +47,7 @@ define void @f3(i16 *%ptr, i32 %alt, i32 %limit) {
; CHECK: sth %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i16 *%ptr
%ext = sext i16 %orig to i32
%res = select i1 %cond, i32 %ext, i32 %alt
@@ -64,7 +65,7 @@ define void @f4(i16 *%ptr, i32 %alt, i32 %limit) {
; CHECK: sth %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i16 *%ptr
%ext = sext i16 %orig to i32
%res = select i1 %cond, i32 %alt, i32 %ext
@@ -83,7 +84,7 @@ define void @f5(i16 *%ptr, i32 %alt, i32 %limit) {
; CHECK: sth %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i16 *%ptr
%ext = zext i16 %orig to i32
%res = select i1 %cond, i32 %ext, i32 %alt
@@ -101,7 +102,7 @@ define void @f6(i16 *%ptr, i32 %alt, i32 %limit) {
; CHECK: sth %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i16 *%ptr
%ext = zext i16 %orig to i32
%res = select i1 %cond, i32 %alt, i32 %ext
@@ -120,7 +121,7 @@ define void @f7(i16 *%ptr, i64 %alt, i32 %limit) {
; CHECK: sth %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i16 *%ptr
%ext = sext i16 %orig to i64
%res = select i1 %cond, i64 %ext, i64 %alt
@@ -138,7 +139,7 @@ define void @f8(i16 *%ptr, i64 %alt, i32 %limit) {
; CHECK: sth %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i16 *%ptr
%ext = sext i16 %orig to i64
%res = select i1 %cond, i64 %alt, i64 %ext
@@ -157,7 +158,7 @@ define void @f9(i16 *%ptr, i64 %alt, i32 %limit) {
; CHECK: sth %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i16 *%ptr
%ext = zext i16 %orig to i64
%res = select i1 %cond, i64 %ext, i64 %alt
@@ -175,7 +176,7 @@ define void @f10(i16 *%ptr, i64 %alt, i32 %limit) {
; CHECK: sth %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i16 *%ptr
%ext = zext i16 %orig to i64
%res = select i1 %cond, i64 %alt, i64 %ext
@@ -194,7 +195,7 @@ define void @f11(i16 *%base, i16 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i16 *%base, i64 2047
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i16 *%ptr
%res = select i1 %cond, i16 %orig, i16 %alt
store i16 %res, i16 *%ptr
@@ -211,7 +212,7 @@ define void @f12(i16 *%base, i16 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i16 *%base, i64 2048
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i16 *%ptr
%res = select i1 %cond, i16 %orig, i16 %alt
store i16 %res, i16 *%ptr
@@ -228,7 +229,7 @@ define void @f13(i16 *%base, i16 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i16 *%base, i64 262143
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i16 *%ptr
%res = select i1 %cond, i16 %orig, i16 %alt
store i16 %res, i16 *%ptr
@@ -247,7 +248,7 @@ define void @f14(i16 *%base, i16 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i16 *%base, i64 262144
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i16 *%ptr
%res = select i1 %cond, i16 %orig, i16 %alt
store i16 %res, i16 *%ptr
@@ -264,7 +265,7 @@ define void @f15(i16 *%base, i16 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i16 *%base, i64 -262144
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i16 *%ptr
%res = select i1 %cond, i16 %orig, i16 %alt
store i16 %res, i16 *%ptr
@@ -283,7 +284,7 @@ define void @f16(i16 *%base, i16 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i16 *%base, i64 -262145
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i16 *%ptr
%res = select i1 %cond, i16 %orig, i16 %alt
store i16 %res, i16 *%ptr
@@ -302,7 +303,7 @@ define void @f17(i64 %base, i64 %index, i16 %alt, i32 %limit) {
%add1 = add i64 %base, %index
%add2 = add i64 %add1, 4096
%ptr = inttoptr i64 %add2 to i16 *
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i16 *%ptr
%res = select i1 %cond, i16 %orig, i16 %alt
store i16 %res, i16 *%ptr
@@ -317,7 +318,7 @@ define void @f18(i16 *%ptr, i16 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: sth {{%r[0-5]}}, 0(%r2)
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load volatile i16 *%ptr
%res = select i1 %cond, i16 %orig, i16 %alt
store i16 %res, i16 *%ptr
@@ -332,7 +333,7 @@ define void @f19(i16 *%ptr, i16 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: sth %r3, 0(%r2)
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i16 *%ptr
%res = select i1 %cond, i16 %orig, i16 %alt
store volatile i16 %res, i16 *%ptr
@@ -352,7 +353,7 @@ define void @f20(i16 *%ptr, i16 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: sth {{%r[0-9]+}},
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load atomic i16 *%ptr unordered, align 2
%res = select i1 %cond, i16 %orig, i16 %alt
store i16 %res, i16 *%ptr
@@ -368,7 +369,7 @@ define void @f21(i16 *%ptr, i16 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: cs {{%r[0-9]+}},
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i16 *%ptr
%res = select i1 %cond, i16 %orig, i16 %alt
store atomic i16 %res, i16 *%ptr unordered, align 2
@@ -388,7 +389,7 @@ define void @f22(i16 %alt, i32 %limit) {
; CHECK: br %r14
%ptr = alloca i16
call void @foo(i16 *%ptr)
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i16 *%ptr
%res = select i1 %cond, i16 %orig, i16 %alt
store i16 %res, i16 *%ptr
diff --git a/test/CodeGen/SystemZ/cond-store-03.ll b/test/CodeGen/SystemZ/cond-store-03.ll
index e122bc2..d4fd48d 100644
--- a/test/CodeGen/SystemZ/cond-store-03.ll
+++ b/test/CodeGen/SystemZ/cond-store-03.ll
@@ -13,7 +13,7 @@ define void @f1(i32 *%ptr, i32 %alt, i32 %limit) {
; CHECK: st %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i32 *%ptr
%res = select i1 %cond, i32 %orig, i32 %alt
store i32 %res, i32 *%ptr
@@ -29,7 +29,7 @@ define void @f2(i32 *%ptr, i32 %alt, i32 %limit) {
; CHECK: st %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i32 *%ptr
%res = select i1 %cond, i32 %alt, i32 %orig
store i32 %res, i32 *%ptr
@@ -46,7 +46,7 @@ define void @f3(i32 *%ptr, i64 %alt, i32 %limit) {
; CHECK: st %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i32 *%ptr
%ext = sext i32 %orig to i64
%res = select i1 %cond, i64 %ext, i64 %alt
@@ -64,7 +64,7 @@ define void @f4(i32 *%ptr, i64 %alt, i32 %limit) {
; CHECK: st %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i32 *%ptr
%ext = sext i32 %orig to i64
%res = select i1 %cond, i64 %alt, i64 %ext
@@ -83,7 +83,7 @@ define void @f5(i32 *%ptr, i64 %alt, i32 %limit) {
; CHECK: st %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i32 *%ptr
%ext = zext i32 %orig to i64
%res = select i1 %cond, i64 %ext, i64 %alt
@@ -101,7 +101,7 @@ define void @f6(i32 *%ptr, i64 %alt, i32 %limit) {
; CHECK: st %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i32 *%ptr
%ext = zext i32 %orig to i64
%res = select i1 %cond, i64 %alt, i64 %ext
@@ -120,7 +120,7 @@ define void @f7(i32 *%base, i32 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i32 *%base, i64 1023
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i32 *%ptr
%res = select i1 %cond, i32 %orig, i32 %alt
store i32 %res, i32 *%ptr
@@ -137,7 +137,7 @@ define void @f8(i32 *%base, i32 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i32 *%base, i64 1024
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i32 *%ptr
%res = select i1 %cond, i32 %orig, i32 %alt
store i32 %res, i32 *%ptr
@@ -154,7 +154,7 @@ define void @f9(i32 *%base, i32 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i32 *%base, i64 131071
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i32 *%ptr
%res = select i1 %cond, i32 %orig, i32 %alt
store i32 %res, i32 *%ptr
@@ -173,7 +173,7 @@ define void @f10(i32 *%base, i32 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i32 *%base, i64 131072
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i32 *%ptr
%res = select i1 %cond, i32 %orig, i32 %alt
store i32 %res, i32 *%ptr
@@ -190,7 +190,7 @@ define void @f11(i32 *%base, i32 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i32 *%base, i64 -131072
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i32 *%ptr
%res = select i1 %cond, i32 %orig, i32 %alt
store i32 %res, i32 *%ptr
@@ -209,7 +209,7 @@ define void @f12(i32 *%base, i32 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i32 *%base, i64 -131073
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i32 *%ptr
%res = select i1 %cond, i32 %orig, i32 %alt
store i32 %res, i32 *%ptr
@@ -228,7 +228,7 @@ define void @f13(i64 %base, i64 %index, i32 %alt, i32 %limit) {
%add1 = add i64 %base, %index
%add2 = add i64 %add1, 4096
%ptr = inttoptr i64 %add2 to i32 *
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i32 *%ptr
%res = select i1 %cond, i32 %orig, i32 %alt
store i32 %res, i32 *%ptr
@@ -243,7 +243,7 @@ define void @f14(i32 *%ptr, i32 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: st {{%r[0-5]}}, 0(%r2)
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load volatile i32 *%ptr
%res = select i1 %cond, i32 %orig, i32 %alt
store i32 %res, i32 *%ptr
@@ -258,7 +258,7 @@ define void @f15(i32 *%ptr, i32 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: st %r3, 0(%r2)
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i32 *%ptr
%res = select i1 %cond, i32 %orig, i32 %alt
store volatile i32 %res, i32 *%ptr
@@ -277,7 +277,7 @@ define void @f16(i32 *%ptr, i32 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: st {{%r[0-5]}}, 0(%r2)
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load atomic i32 *%ptr unordered, align 4
%res = select i1 %cond, i32 %orig, i32 %alt
store i32 %res, i32 *%ptr
@@ -293,7 +293,7 @@ define void @f17(i32 *%ptr, i32 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: cs {{%r[0-5]}}, %r3, 0(%r2)
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i32 *%ptr
%res = select i1 %cond, i32 %orig, i32 %alt
store atomic i32 %res, i32 *%ptr unordered, align 4
@@ -313,7 +313,7 @@ define void @f18(i32 %alt, i32 %limit) {
; CHECK: br %r14
%ptr = alloca i32
call void @foo(i32 *%ptr)
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i32 *%ptr
%res = select i1 %cond, i32 %orig, i32 %alt
store i32 %res, i32 *%ptr
diff --git a/test/CodeGen/SystemZ/cond-store-04.ll b/test/CodeGen/SystemZ/cond-store-04.ll
index 4ed23a3..fc565c4 100644
--- a/test/CodeGen/SystemZ/cond-store-04.ll
+++ b/test/CodeGen/SystemZ/cond-store-04.ll
@@ -13,7 +13,7 @@ define void @f1(i64 *%ptr, i64 %alt, i32 %limit) {
; CHECK: stg %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i64 *%ptr
%res = select i1 %cond, i64 %orig, i64 %alt
store i64 %res, i64 *%ptr
@@ -29,7 +29,7 @@ define void @f2(i64 *%ptr, i64 %alt, i32 %limit) {
; CHECK: stg %r3, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i64 *%ptr
%res = select i1 %cond, i64 %alt, i64 %orig
store i64 %res, i64 *%ptr
@@ -46,7 +46,7 @@ define void @f3(i64 *%base, i64 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i64 *%base, i64 65535
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i64 *%ptr
%res = select i1 %cond, i64 %orig, i64 %alt
store i64 %res, i64 *%ptr
@@ -65,7 +65,7 @@ define void @f4(i64 *%base, i64 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i64 *%base, i64 65536
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i64 *%ptr
%res = select i1 %cond, i64 %orig, i64 %alt
store i64 %res, i64 *%ptr
@@ -82,7 +82,7 @@ define void @f5(i64 *%base, i64 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i64 *%base, i64 -65536
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i64 *%ptr
%res = select i1 %cond, i64 %orig, i64 %alt
store i64 %res, i64 *%ptr
@@ -101,7 +101,7 @@ define void @f6(i64 *%base, i64 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr i64 *%base, i64 -65537
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i64 *%ptr
%res = select i1 %cond, i64 %orig, i64 %alt
store i64 %res, i64 *%ptr
@@ -120,7 +120,7 @@ define void @f7(i64 %base, i64 %index, i64 %alt, i32 %limit) {
%add1 = add i64 %base, %index
%add2 = add i64 %add1, 524287
%ptr = inttoptr i64 %add2 to i64 *
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i64 *%ptr
%res = select i1 %cond, i64 %orig, i64 %alt
store i64 %res, i64 *%ptr
@@ -135,7 +135,7 @@ define void @f8(i64 *%ptr, i64 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: stg {{%r[0-5]}}, 0(%r2)
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load volatile i64 *%ptr
%res = select i1 %cond, i64 %orig, i64 %alt
store i64 %res, i64 *%ptr
@@ -150,7 +150,7 @@ define void @f9(i64 *%ptr, i64 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: stg %r3, 0(%r2)
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i64 *%ptr
%res = select i1 %cond, i64 %orig, i64 %alt
store volatile i64 %res, i64 *%ptr
@@ -169,7 +169,7 @@ define void @f10(i64 *%ptr, i64 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: stg {{%r[0-5]}}, 0(%r2)
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load atomic i64 *%ptr unordered, align 8
%res = select i1 %cond, i64 %orig, i64 %alt
store i64 %res, i64 *%ptr
@@ -185,7 +185,7 @@ define void @f11(i64 *%ptr, i64 %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: csg {{%r[0-5]}}, %r3, 0(%r2)
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i64 *%ptr
%res = select i1 %cond, i64 %orig, i64 %alt
store atomic i64 %res, i64 *%ptr unordered, align 8
@@ -205,7 +205,7 @@ define void @f12(i64 %alt, i32 %limit) {
; CHECK: br %r14
%ptr = alloca i64
call void @foo(i64 *%ptr)
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load i64 *%ptr
%res = select i1 %cond, i64 %orig, i64 %alt
store i64 %res, i64 *%ptr
diff --git a/test/CodeGen/SystemZ/cond-store-05.ll b/test/CodeGen/SystemZ/cond-store-05.ll
index e41c8fe..f8056f7 100644
--- a/test/CodeGen/SystemZ/cond-store-05.ll
+++ b/test/CodeGen/SystemZ/cond-store-05.ll
@@ -13,7 +13,7 @@ define void @f1(float *%ptr, float %alt, i32 %limit) {
; CHECK: ste %f0, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load float *%ptr
%res = select i1 %cond, float %orig, float %alt
store float %res, float *%ptr
@@ -29,7 +29,7 @@ define void @f2(float *%ptr, float %alt, i32 %limit) {
; CHECK: ste %f0, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load float *%ptr
%res = select i1 %cond, float %alt, float %orig
store float %res, float *%ptr
@@ -46,7 +46,7 @@ define void @f3(float *%base, float %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr float *%base, i64 1023
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load float *%ptr
%res = select i1 %cond, float %orig, float %alt
store float %res, float *%ptr
@@ -63,7 +63,7 @@ define void @f4(float *%base, float %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr float *%base, i64 1024
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load float *%ptr
%res = select i1 %cond, float %orig, float %alt
store float %res, float *%ptr
@@ -80,7 +80,7 @@ define void @f5(float *%base, float %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr float *%base, i64 131071
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load float *%ptr
%res = select i1 %cond, float %orig, float %alt
store float %res, float *%ptr
@@ -99,7 +99,7 @@ define void @f6(float *%base, float %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr float *%base, i64 131072
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load float *%ptr
%res = select i1 %cond, float %orig, float %alt
store float %res, float *%ptr
@@ -116,7 +116,7 @@ define void @f7(float *%base, float %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr float *%base, i64 -131072
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load float *%ptr
%res = select i1 %cond, float %orig, float %alt
store float %res, float *%ptr
@@ -135,7 +135,7 @@ define void @f8(float *%base, float %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr float *%base, i64 -131073
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load float *%ptr
%res = select i1 %cond, float %orig, float %alt
store float %res, float *%ptr
@@ -154,7 +154,7 @@ define void @f9(i64 %base, i64 %index, float %alt, i32 %limit) {
%add1 = add i64 %base, %index
%add2 = add i64 %add1, 4096
%ptr = inttoptr i64 %add2 to float *
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load float *%ptr
%res = select i1 %cond, float %orig, float %alt
store float %res, float *%ptr
@@ -169,7 +169,7 @@ define void @f10(float *%ptr, float %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: ste {{%f[0-5]}}, 0(%r2)
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load volatile float *%ptr
%res = select i1 %cond, float %orig, float %alt
store float %res, float *%ptr
@@ -184,7 +184,7 @@ define void @f11(float *%ptr, float %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: ste %f0, 0(%r2)
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load float *%ptr
%res = select i1 %cond, float %orig, float %alt
store volatile float %res, float *%ptr
@@ -204,7 +204,7 @@ define void @f12(float %alt, i32 %limit) {
; CHECK: br %r14
%ptr = alloca float
call void @foo(float *%ptr)
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load float *%ptr
%res = select i1 %cond, float %orig, float %alt
store float %res, float *%ptr
diff --git a/test/CodeGen/SystemZ/cond-store-06.ll b/test/CodeGen/SystemZ/cond-store-06.ll
index 759a3e0..6668195 100644
--- a/test/CodeGen/SystemZ/cond-store-06.ll
+++ b/test/CodeGen/SystemZ/cond-store-06.ll
@@ -13,7 +13,7 @@ define void @f1(double *%ptr, double %alt, i32 %limit) {
; CHECK: std %f0, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load double *%ptr
%res = select i1 %cond, double %orig, double %alt
store double %res, double *%ptr
@@ -29,7 +29,7 @@ define void @f2(double *%ptr, double %alt, i32 %limit) {
; CHECK: std %f0, 0(%r2)
; CHECK: [[LABEL]]:
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load double *%ptr
%res = select i1 %cond, double %alt, double %orig
store double %res, double *%ptr
@@ -46,7 +46,7 @@ define void @f3(double *%base, double %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr double *%base, i64 511
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load double *%ptr
%res = select i1 %cond, double %orig, double %alt
store double %res, double *%ptr
@@ -63,7 +63,7 @@ define void @f4(double *%base, double %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr double *%base, i64 512
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load double *%ptr
%res = select i1 %cond, double %orig, double %alt
store double %res, double *%ptr
@@ -80,7 +80,7 @@ define void @f5(double *%base, double %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr double *%base, i64 65535
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load double *%ptr
%res = select i1 %cond, double %orig, double %alt
store double %res, double *%ptr
@@ -99,7 +99,7 @@ define void @f6(double *%base, double %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr double *%base, i64 65536
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load double *%ptr
%res = select i1 %cond, double %orig, double %alt
store double %res, double *%ptr
@@ -116,7 +116,7 @@ define void @f7(double *%base, double %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr double *%base, i64 -65536
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load double *%ptr
%res = select i1 %cond, double %orig, double %alt
store double %res, double *%ptr
@@ -135,7 +135,7 @@ define void @f8(double *%base, double %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: br %r14
%ptr = getelementptr double *%base, i64 -65537
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load double *%ptr
%res = select i1 %cond, double %orig, double %alt
store double %res, double *%ptr
@@ -154,7 +154,7 @@ define void @f9(i64 %base, i64 %index, double %alt, i32 %limit) {
%add1 = add i64 %base, %index
%add2 = add i64 %add1, 524287
%ptr = inttoptr i64 %add2 to double *
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load double *%ptr
%res = select i1 %cond, double %orig, double %alt
store double %res, double *%ptr
@@ -169,7 +169,7 @@ define void @f10(double *%ptr, double %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: std {{%f[0-5]}}, 0(%r2)
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load volatile double *%ptr
%res = select i1 %cond, double %orig, double %alt
store double %res, double *%ptr
@@ -184,7 +184,7 @@ define void @f11(double *%ptr, double %alt, i32 %limit) {
; CHECK: [[LABEL]]:
; CHECK: std %f0, 0(%r2)
; CHECK: br %r14
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load double *%ptr
%res = select i1 %cond, double %orig, double %alt
store volatile double %res, double *%ptr
@@ -204,7 +204,7 @@ define void @f12(double %alt, i32 %limit) {
; CHECK: br %r14
%ptr = alloca double
call void @foo(double *%ptr)
- %cond = icmp ult i32 %limit, 42
+ %cond = icmp ult i32 %limit, 420
%orig = load double *%ptr
%res = select i1 %cond, double %orig, double %alt
store double %res, double *%ptr
diff --git a/test/CodeGen/SystemZ/fp-cmp-01.ll b/test/CodeGen/SystemZ/fp-cmp-01.ll
index 6a9598e..d7c0cce 100644
--- a/test/CodeGen/SystemZ/fp-cmp-01.ll
+++ b/test/CodeGen/SystemZ/fp-cmp-01.ll
@@ -1,6 +1,7 @@
-; Test 32-bit floating-point comparison.
+; Test 32-bit floating-point comparison. The tests assume a z10 implementation
+; of select, using conditional branches rather than LOCGR.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
declare float @foo()
@@ -159,3 +160,160 @@ define i64 @f8(i64 %a, i64 %b, float %f) {
%res = select i1 %cond, i64 %a, i64 %b
ret i64 %res
}
+
+; Check that the comparison can be reversed if that allows CEB to be used,
+; first with oeq.
+define i64 @f9(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f9:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: je {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+ %f1 = load float *%ptr
+ %cond = fcmp oeq float %f1, %f2
+ %res = select i1 %cond, i64 %a, i64 %b
+ ret i64 %res
+}
+
+; ...then one.
+define i64 @f10(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f10:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: jlh {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+ %f1 = load float *%ptr
+ %cond = fcmp one float %f1, %f2
+ %res = select i1 %cond, i64 %a, i64 %b
+ ret i64 %res
+}
+
+; ...then olt.
+define i64 @f11(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f11:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+ %f1 = load float *%ptr
+ %cond = fcmp olt float %f1, %f2
+ %res = select i1 %cond, i64 %a, i64 %b
+ ret i64 %res
+}
+
+; ...then ole.
+define i64 @f12(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f12:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: jhe {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+ %f1 = load float *%ptr
+ %cond = fcmp ole float %f1, %f2
+ %res = select i1 %cond, i64 %a, i64 %b
+ ret i64 %res
+}
+
+; ...then oge.
+define i64 @f13(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f13:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: jle {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+ %f1 = load float *%ptr
+ %cond = fcmp oge float %f1, %f2
+ %res = select i1 %cond, i64 %a, i64 %b
+ ret i64 %res
+}
+
+; ...then ogt.
+define i64 @f14(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f14:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: jl {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+ %f1 = load float *%ptr
+ %cond = fcmp ogt float %f1, %f2
+ %res = select i1 %cond, i64 %a, i64 %b
+ ret i64 %res
+}
+
+; ...then ueq.
+define i64 @f15(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f15:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: jnlh {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+ %f1 = load float *%ptr
+ %cond = fcmp ueq float %f1, %f2
+ %res = select i1 %cond, i64 %a, i64 %b
+ ret i64 %res
+}
+
+; ...then une.
+define i64 @f16(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f16:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: jne {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+ %f1 = load float *%ptr
+ %cond = fcmp une float %f1, %f2
+ %res = select i1 %cond, i64 %a, i64 %b
+ ret i64 %res
+}
+
+; ...then ult.
+define i64 @f17(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f17:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: jnle {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+ %f1 = load float *%ptr
+ %cond = fcmp ult float %f1, %f2
+ %res = select i1 %cond, i64 %a, i64 %b
+ ret i64 %res
+}
+
+; ...then ule.
+define i64 @f18(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f18:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: jnl {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+ %f1 = load float *%ptr
+ %cond = fcmp ule float %f1, %f2
+ %res = select i1 %cond, i64 %a, i64 %b
+ ret i64 %res
+}
+
+; ...then uge.
+define i64 @f19(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f19:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: jnh {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+ %f1 = load float *%ptr
+ %cond = fcmp uge float %f1, %f2
+ %res = select i1 %cond, i64 %a, i64 %b
+ ret i64 %res
+}
+
+; ...then ugt.
+define i64 @f20(i64 %a, i64 %b, float %f2, float *%ptr) {
+; CHECK-LABEL: f20:
+; CHECK: ceb %f0, 0(%r4)
+; CHECK-NEXT: jnhe {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+ %f1 = load float *%ptr
+ %cond = fcmp ugt float %f1, %f2
+ %res = select i1 %cond, i64 %a, i64 %b
+ ret i64 %res
+}
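
As a source-level sketch (hypothetical C, not taken from the suite) of what f9 through f20 check: when the loaded operand is on the left of the comparison, the backend can commute the compare so that the memory operand feeds CEB directly, and then invert the branch condition, e.g. an olt comparison branches on jh once the operands are swapped:

/* Hypothetical illustration: "*ptr < f2" can be lowered as
   "ceb %f0, 0(%rN)" (comparing f2 against the loaded value), so the
   following branch tests "high" (jh) rather than "low" (jl). */
long select_lt(long a, long b, float f2, const float *ptr)
{
  return (*ptr < f2) ? a : b;
}
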
diff --git a/test/CodeGen/SystemZ/fp-cmp-02.ll b/test/CodeGen/SystemZ/fp-cmp-02.ll
index 309d12e..c61f04e 100644
--- a/test/CodeGen/SystemZ/fp-cmp-02.ll
+++ b/test/CodeGen/SystemZ/fp-cmp-02.ll
@@ -1,6 +1,7 @@
-; Test 64-bit floating-point comparison.
+; Test 64-bit floating-point comparison. The tests assume a z10 implementation
+; of select, using conditional branches rather than LOCGR.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
declare double @foo()
@@ -159,3 +160,16 @@ define i64 @f8(i64 %a, i64 %b, double %f) {
%res = select i1 %cond, i64 %a, i64 %b
ret i64 %res
}
+
+; Check that the comparison can be reversed if that allows CDB to be used.
+define i64 @f9(i64 %a, i64 %b, double %f2, double *%ptr) {
+; CHECK-LABEL: f9:
+; CHECK: cdb %f0, 0(%r4)
+; CHECK-NEXT: jl {{\.L.*}}
+; CHECK: lgr %r2, %r3
+; CHECK: br %r14
+ %f1 = load double *%ptr
+ %cond = fcmp ogt double %f1, %f2
+ %res = select i1 %cond, i64 %a, i64 %b
+ ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/fp-cmp-03.ll b/test/CodeGen/SystemZ/fp-cmp-03.ll
index 0f71f4e..e777d00 100644
--- a/test/CodeGen/SystemZ/fp-cmp-03.ll
+++ b/test/CodeGen/SystemZ/fp-cmp-03.ll
@@ -1,6 +1,7 @@
-; Test 128-bit floating-point comparison.
+; Test 128-bit floating-point comparison. The tests assume a z10 implementation
+; of select, using conditional branches rather than LOCGR.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
; There is no memory form of 128-bit comparison.
define i64 @f1(i64 %a, i64 %b, fp128 *%ptr, float %f2) {
diff --git a/test/CodeGen/SystemZ/fp-move-02.ll b/test/CodeGen/SystemZ/fp-move-02.ll
index b4f0428..505ee8d 100644
--- a/test/CodeGen/SystemZ/fp-move-02.ll
+++ b/test/CodeGen/SystemZ/fp-move-02.ll
@@ -1,6 +1,7 @@
-; Test moves between FPRs and GPRs.
+; Test moves between FPRs and GPRs. The 32-bit cases test the z10
+; implementation, which has no high-word support.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
declare i64 @foo()
declare double @bar()
@@ -63,11 +64,11 @@ define double @f5(i64 %a) {
; Test 128-bit moves from GPRs to FPRs. i128 isn't a legitimate type,
; so this goes through memory.
-; FIXME: it would be better to use one MVC here.
define void @f6(fp128 *%a, i128 *%b) {
; CHECK-LABEL: f6:
; CHECK: lg
-; CHECK: mvc
+; CHECK: lg
+; CHECK: stg
; CHECK: stg
; CHECK: br %r14
%val = load i128 *%b
diff --git a/test/CodeGen/SystemZ/fp-move-09.ll b/test/CodeGen/SystemZ/fp-move-09.ll
new file mode 100644
index 0000000..52b2ee2
--- /dev/null
+++ b/test/CodeGen/SystemZ/fp-move-09.ll
@@ -0,0 +1,62 @@
+; Test moves between FPRs and GPRs for z196 and above.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+; Check that moves from i32s to floats can use high registers.
+define float @f1(i16 *%ptr) {
+; CHECK-LABEL: f1:
+; CHECK: llhh [[REG:%r[0-5]]], 0(%r2)
+; CHECK: oihh [[REG]], 16256
+; CHECK: ldgr %f0, [[REG]]
+; CHECK: br %r14
+ %base = load i16 *%ptr
+ %ext = zext i16 %base to i32
+ %full = or i32 %ext, 1065353216
+ %res = bitcast i32 %full to float
+ ret float %res
+}
+
+; Check that moves from floats to i32s can use high registers.
+; This "store the low byte" technique is used by llvmpipe, for example.
+define void @f2(float %val, i8 *%ptr) {
+; CHECK-LABEL: f2:
+; CHECK: lgdr [[REG:%r[0-5]]], %f0
+; CHECK: stch [[REG]], 0(%r2)
+; CHECK: br %r14
+ %res = bitcast float %val to i32
+ %trunc = trunc i32 %res to i8
+ store i8 %trunc, i8 *%ptr
+ ret void
+}
+
+; Like f2, but with a conditional store.
+define void @f3(float %val, i8 *%ptr, i32 %which) {
+; CHECK-LABEL: f3:
+; CHECK: cijlh %r3, 0,
+; CHECK: lgdr [[REG:%r[0-5]]], %f0
+; CHECK: stch [[REG]], 0(%r2)
+; CHECK: br %r14
+ %int = bitcast float %val to i32
+ %trunc = trunc i32 %int to i8
+ %old = load i8 *%ptr
+ %cmp = icmp eq i32 %which, 0
+ %res = select i1 %cmp, i8 %trunc, i8 %old
+ store i8 %res, i8 *%ptr
+ ret void
+}
+
+; ...and again with 16-bit memory.
+define void @f4(float %val, i16 *%ptr, i32 %which) {
+; CHECK-LABEL: f4:
+; CHECK: cijlh %r3, 0,
+; CHECK: lgdr [[REG:%r[0-5]]], %f0
+; CHECK: sthh [[REG]], 0(%r2)
+; CHECK: br %r14
+ %int = bitcast float %val to i32
+ %trunc = trunc i32 %int to i16
+ %old = load i16 *%ptr
+ %cmp = icmp eq i32 %which, 0
+ %res = select i1 %cmp, i16 %trunc, i16 %old
+ store i16 %res, i16 *%ptr
+ ret void
+}
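
A hedged source-level view (hypothetical C, not from the tests) of the llvmpipe-style pattern in f2 above: only the low byte of a float's bit pattern is stored, which on z196 can stay in a GPR via lgdr followed by stch, never spilling the value to memory for the conversion:

#include <string.h>

/* Hypothetical illustration: reinterpret the float's bits and store just the
   low-order byte; on z196 this maps to "lgdr" + "stch" rather than a spill. */
void store_low_byte(float val, unsigned char *ptr)
{
  unsigned int bits;
  memcpy(&bits, &val, sizeof bits);   /* bitcast float -> i32 */
  *ptr = (unsigned char) bits;        /* trunc i32 -> i8, then store */
}
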
diff --git a/test/CodeGen/SystemZ/fp-round-01.ll b/test/CodeGen/SystemZ/fp-round-01.ll
index f2530dc..565db5a 100644
--- a/test/CodeGen/SystemZ/fp-round-01.ll
+++ b/test/CodeGen/SystemZ/fp-round-01.ll
@@ -1,9 +1,8 @@
-; Test rint()-like rounding, with non-integer values triggering an
-; inexact condition.
+; Test rounding functions for z10.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
-; Test f32.
+; Test rint for f32.
declare float @llvm.rint.f32(float %f)
define float @f1(float %f) {
; CHECK-LABEL: f1:
@@ -13,7 +12,7 @@ define float @f1(float %f) {
ret float %res
}
-; Test f64.
+; Test rint for f64.
declare double @llvm.rint.f64(double %f)
define double @f2(double %f) {
; CHECK-LABEL: f2:
@@ -23,7 +22,7 @@ define double @f2(double %f) {
ret double %res
}
-; Test f128.
+; Test rint for f128.
declare fp128 @llvm.rint.f128(fp128 %f)
define void @f3(fp128 *%ptr) {
; CHECK-LABEL: f3:
@@ -34,3 +33,118 @@ define void @f3(fp128 *%ptr) {
store fp128 %res, fp128 *%ptr
ret void
}
+
+; Test nearbyint for f32.
+declare float @llvm.nearbyint.f32(float %f)
+define float @f4(float %f) {
+; CHECK-LABEL: f4:
+; CHECK: brasl %r14, nearbyintf@PLT
+; CHECK: br %r14
+ %res = call float @llvm.nearbyint.f32(float %f)
+ ret float %res
+}
+
+; Test nearbyint for f64.
+declare double @llvm.nearbyint.f64(double %f)
+define double @f5(double %f) {
+; CHECK-LABEL: f5:
+; CHECK: brasl %r14, nearbyint@PLT
+; CHECK: br %r14
+ %res = call double @llvm.nearbyint.f64(double %f)
+ ret double %res
+}
+
+; Test nearbyint for f128: omitted for now because we cannot handle
+; indirect arguments.
+
+; Test floor for f32.
+declare float @llvm.floor.f32(float %f)
+define float @f7(float %f) {
+; CHECK-LABEL: f7:
+; CHECK: brasl %r14, floorf@PLT
+; CHECK: br %r14
+ %res = call float @llvm.floor.f32(float %f)
+ ret float %res
+}
+
+; Test floor for f64.
+declare double @llvm.floor.f64(double %f)
+define double @f8(double %f) {
+; CHECK-LABEL: f8:
+; CHECK: brasl %r14, floor@PLT
+; CHECK: br %r14
+ %res = call double @llvm.floor.f64(double %f)
+ ret double %res
+}
+
+; Test floor for f128: omitted for now because we cannot handle
+; indirect arguments.
+
+; Test ceil for f32.
+declare float @llvm.ceil.f32(float %f)
+define float @f10(float %f) {
+; CHECK-LABEL: f10:
+; CHECK: brasl %r14, ceilf@PLT
+; CHECK: br %r14
+ %res = call float @llvm.ceil.f32(float %f)
+ ret float %res
+}
+
+; Test ceil for f64.
+declare double @llvm.ceil.f64(double %f)
+define double @f11(double %f) {
+; CHECK-LABEL: f11:
+; CHECK: brasl %r14, ceil@PLT
+; CHECK: br %r14
+ %res = call double @llvm.ceil.f64(double %f)
+ ret double %res
+}
+
+; Test ceil for f128: omitted for now because we cannot handle
+; indirect arguments.
+
+; Test trunc for f32.
+declare float @llvm.trunc.f32(float %f)
+define float @f13(float %f) {
+; CHECK-LABEL: f13:
+; CHECK: brasl %r14, truncf@PLT
+; CHECK: br %r14
+ %res = call float @llvm.trunc.f32(float %f)
+ ret float %res
+}
+
+; Test trunc for f64.
+declare double @llvm.trunc.f64(double %f)
+define double @f14(double %f) {
+; CHECK-LABEL: f14:
+; CHECK: brasl %r14, trunc@PLT
+; CHECK: br %r14
+ %res = call double @llvm.trunc.f64(double %f)
+ ret double %res
+}
+
+; Test trunc for f128: omitted for now because we cannot handle
+; indirect arguments.
+
+; Test round for f32.
+declare float @llvm.round.f32(float %f)
+define float @f16(float %f) {
+; CHECK-LABEL: f16:
+; CHECK: brasl %r14, roundf@PLT
+; CHECK: br %r14
+ %res = call float @llvm.round.f32(float %f)
+ ret float %res
+}
+
+; Test round for f64.
+declare double @llvm.round.f64(double %f)
+define double @f17(double %f) {
+; CHECK-LABEL: f17:
+; CHECK: brasl %r14, round@PLT
+; CHECK: br %r14
+ %res = call double @llvm.round.f64(double %f)
+ ret double %res
+}
+
+; Test round for f128: omitted for now because we cannot handle
+; indirect arguments.
diff --git a/test/CodeGen/SystemZ/fp-round-02.ll b/test/CodeGen/SystemZ/fp-round-02.ll
new file mode 100644
index 0000000..d79c9c4
--- /dev/null
+++ b/test/CodeGen/SystemZ/fp-round-02.ll
@@ -0,0 +1,195 @@
+; Test rounding functions for z196 and above.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+; Test rint for f32.
+declare float @llvm.rint.f32(float %f)
+define float @f1(float %f) {
+; CHECK-LABEL: f1:
+; CHECK: fiebr %f0, 0, %f0
+; CHECK: br %r14
+ %res = call float @llvm.rint.f32(float %f)
+ ret float %res
+}
+
+; Test rint for f64.
+declare double @llvm.rint.f64(double %f)
+define double @f2(double %f) {
+; CHECK-LABEL: f2:
+; CHECK: fidbr %f0, 0, %f0
+; CHECK: br %r14
+ %res = call double @llvm.rint.f64(double %f)
+ ret double %res
+}
+
+; Test rint for f128.
+declare fp128 @llvm.rint.f128(fp128 %f)
+define void @f3(fp128 *%ptr) {
+; CHECK-LABEL: f3:
+; CHECK: fixbr %f0, 0, %f0
+; CHECK: br %r14
+ %src = load fp128 *%ptr
+ %res = call fp128 @llvm.rint.f128(fp128 %src)
+ store fp128 %res, fp128 *%ptr
+ ret void
+}
+
+; Test nearbyint for f32.
+declare float @llvm.nearbyint.f32(float %f)
+define float @f4(float %f) {
+; CHECK-LABEL: f4:
+; CHECK: fiebra %f0, 0, %f0, 4
+; CHECK: br %r14
+ %res = call float @llvm.nearbyint.f32(float %f)
+ ret float %res
+}
+
+; Test nearbyint for f64.
+declare double @llvm.nearbyint.f64(double %f)
+define double @f5(double %f) {
+; CHECK-LABEL: f5:
+; CHECK: fidbra %f0, 0, %f0, 4
+; CHECK: br %r14
+ %res = call double @llvm.nearbyint.f64(double %f)
+ ret double %res
+}
+
+; Test nearbyint for f128.
+declare fp128 @llvm.nearbyint.f128(fp128 %f)
+define void @f6(fp128 *%ptr) {
+; CHECK-LABEL: f6:
+; CHECK: fixbra %f0, 0, %f0, 4
+; CHECK: br %r14
+ %src = load fp128 *%ptr
+ %res = call fp128 @llvm.nearbyint.f128(fp128 %src)
+ store fp128 %res, fp128 *%ptr
+ ret void
+}
+
+; Test floor for f32.
+declare float @llvm.floor.f32(float %f)
+define float @f7(float %f) {
+; CHECK-LABEL: f7:
+; CHECK: fiebra %f0, 7, %f0, 4
+; CHECK: br %r14
+ %res = call float @llvm.floor.f32(float %f)
+ ret float %res
+}
+
+; Test floor for f64.
+declare double @llvm.floor.f64(double %f)
+define double @f8(double %f) {
+; CHECK-LABEL: f8:
+; CHECK: fidbra %f0, 7, %f0, 4
+; CHECK: br %r14
+ %res = call double @llvm.floor.f64(double %f)
+ ret double %res
+}
+
+; Test floor for f128.
+declare fp128 @llvm.floor.f128(fp128 %f)
+define void @f9(fp128 *%ptr) {
+; CHECK-LABEL: f9:
+; CHECK: fixbra %f0, 7, %f0, 4
+; CHECK: br %r14
+ %src = load fp128 *%ptr
+ %res = call fp128 @llvm.floor.f128(fp128 %src)
+ store fp128 %res, fp128 *%ptr
+ ret void
+}
+
+; Test ceil for f32.
+declare float @llvm.ceil.f32(float %f)
+define float @f10(float %f) {
+; CHECK-LABEL: f10:
+; CHECK: fiebra %f0, 6, %f0, 4
+; CHECK: br %r14
+ %res = call float @llvm.ceil.f32(float %f)
+ ret float %res
+}
+
+; Test ceil for f64.
+declare double @llvm.ceil.f64(double %f)
+define double @f11(double %f) {
+; CHECK-LABEL: f11:
+; CHECK: fidbra %f0, 6, %f0, 4
+; CHECK: br %r14
+ %res = call double @llvm.ceil.f64(double %f)
+ ret double %res
+}
+
+; Test ceil for f128.
+declare fp128 @llvm.ceil.f128(fp128 %f)
+define void @f12(fp128 *%ptr) {
+; CHECK-LABEL: f12:
+; CHECK: fixbra %f0, 6, %f0, 4
+; CHECK: br %r14
+ %src = load fp128 *%ptr
+ %res = call fp128 @llvm.ceil.f128(fp128 %src)
+ store fp128 %res, fp128 *%ptr
+ ret void
+}
+
+; Test trunc for f32.
+declare float @llvm.trunc.f32(float %f)
+define float @f13(float %f) {
+; CHECK-LABEL: f13:
+; CHECK: fiebra %f0, 5, %f0, 4
+; CHECK: br %r14
+ %res = call float @llvm.trunc.f32(float %f)
+ ret float %res
+}
+
+; Test trunc for f64.
+declare double @llvm.trunc.f64(double %f)
+define double @f14(double %f) {
+; CHECK-LABEL: f14:
+; CHECK: fidbra %f0, 5, %f0, 4
+; CHECK: br %r14
+ %res = call double @llvm.trunc.f64(double %f)
+ ret double %res
+}
+
+; Test trunc for f128.
+declare fp128 @llvm.trunc.f128(fp128 %f)
+define void @f15(fp128 *%ptr) {
+; CHECK-LABEL: f15:
+; CHECK: fixbra %f0, 5, %f0, 4
+; CHECK: br %r14
+ %src = load fp128 *%ptr
+ %res = call fp128 @llvm.trunc.f128(fp128 %src)
+ store fp128 %res, fp128 *%ptr
+ ret void
+}
+
+; Test round for f32.
+declare float @llvm.round.f32(float %f)
+define float @f16(float %f) {
+; CHECK-LABEL: f16:
+; CHECK: fiebra %f0, 1, %f0, 4
+; CHECK: br %r14
+ %res = call float @llvm.round.f32(float %f)
+ ret float %res
+}
+
+; Test round for f64.
+declare double @llvm.round.f64(double %f)
+define double @f17(double %f) {
+; CHECK-LABEL: f17:
+; CHECK: fidbra %f0, 1, %f0, 4
+; CHECK: br %r14
+ %res = call double @llvm.round.f64(double %f)
+ ret double %res
+}
+
+; Test round for f128.
+declare fp128 @llvm.round.f128(fp128 %f)
+define void @f18(fp128 *%ptr) {
+; CHECK-LABEL: f18:
+; CHECK: fixbra %f0, 1, %f0, 4
+; CHECK: br %r14
+ %src = load fp128 *%ptr
+ %res = call fp128 @llvm.round.f128(fp128 %src)
+ store fp128 %res, fp128 *%ptr
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/fp-sqrt-01.ll b/test/CodeGen/SystemZ/fp-sqrt-01.ll
index b6568d6..7465af4 100644
--- a/test/CodeGen/SystemZ/fp-sqrt-01.ll
+++ b/test/CodeGen/SystemZ/fp-sqrt-01.ll
@@ -2,7 +2,8 @@
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-declare float @llvm.sqrt.f32(float %f)
+declare float @llvm.sqrt.f32(float)
+declare float @sqrtf(float)
; Check register square root.
define float @f1(float %val) {
@@ -152,3 +153,17 @@ define void @f7(float *%ptr) {
ret void
}
+
+; Check that a call to the normal sqrtf function is lowered.
+define float @f8(float %dummy, float %val) {
+; CHECK-LABEL: f8:
+; CHECK: sqebr %f0, %f2
+; CHECK: cebr %f0, %f0
+; CHECK: jo [[LABEL:\.L.*]]
+; CHECK: br %r14
+; CHECK: [[LABEL]]:
+; CHECK: ler %f0, %f2
+; CHECK: jg sqrtf@PLT
+ %res = tail call float @sqrtf(float %val)
+ ret float %res
+}
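
A brief sketch (hypothetical C, not part of the tests) of what f8 verifies: an ordinary call to sqrtf can be lowered to the SQEBR instruction, with a self-comparison of the result used to detect NaN; only in that unordered case does the code fall back to the real sqrtf library call, presumably so that errno and other libm side effects are still produced for invalid inputs:

#include <math.h>

/* Hypothetical illustration: for valid inputs this is a single "sqebr";
   the "cebr %f0, %f0" / "jo" sequence in the checked output only reaches
   the sqrtf@PLT call when the hardware result is a NaN. */
float fast_sqrt(float x)
{
  return sqrtf(x);
}
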
diff --git a/test/CodeGen/SystemZ/fp-sqrt-02.ll b/test/CodeGen/SystemZ/fp-sqrt-02.ll
index b07a2c6..66ffd19 100644
--- a/test/CodeGen/SystemZ/fp-sqrt-02.ll
+++ b/test/CodeGen/SystemZ/fp-sqrt-02.ll
@@ -3,6 +3,7 @@
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
declare double @llvm.sqrt.f64(double %f)
+declare double @sqrt(double)
; Check register square root.
define double @f1(double %val) {
@@ -152,3 +153,17 @@ define void @f7(double *%ptr) {
ret void
}
+
+; Check that a call to the normal sqrt function is lowered.
+define double @f8(double %dummy, double %val) {
+; CHECK-LABEL: f8:
+; CHECK: sqdbr %f0, %f2
+; CHECK: cdbr %f0, %f0
+; CHECK: jo [[LABEL:\.L.*]]
+; CHECK: br %r14
+; CHECK: [[LABEL]]:
+; CHECK: ldr %f0, %f2
+; CHECK: jg sqrt@PLT
+ %res = tail call double @sqrt(double %val)
+ ret double %res
+}
diff --git a/test/CodeGen/SystemZ/frame-13.ll b/test/CodeGen/SystemZ/frame-13.ll
index 1d38354..393850f 100644
--- a/test/CodeGen/SystemZ/frame-13.ll
+++ b/test/CodeGen/SystemZ/frame-13.ll
@@ -1,8 +1,11 @@
; Test the handling of base + 12-bit displacement addresses for large frames,
-; in cases where no 20-bit form exists.
+; in cases where no 20-bit form exists. The tests here assume z10 register
+; pressure, without the high words being available.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefix=CHECK-NOFP %s
-; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | \
+; RUN: FileCheck -check-prefix=CHECK-NOFP %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -disable-fp-elim | \
+; RUN: FileCheck -check-prefix=CHECK-FP %s
; This file tests what happens when a displacement is converted from
; being relative to the start of a frame object to being relative to
@@ -182,17 +185,16 @@ define void @f8() {
}
; Check a case where the original displacement is out of range. The backend
-; should force an LAY from the outset. We don't yet do any kind of anchor
-; optimization, so there should be no offset on the MVHI itself.
+; should force STY to be used instead.
define void @f9() {
; CHECK-NOFP-LABEL: f9:
-; CHECK-NOFP: lay %r1, 12296(%r15)
-; CHECK-NOFP: mvhi 0(%r1), 42
+; CHECK-NOFP: lhi [[TMP:%r[0-5]]], 42
+; CHECK-NOFP: sty [[TMP]], 12296(%r15)
; CHECK-NOFP: br %r14
;
; CHECK-FP-LABEL: f9:
-; CHECK-FP: lay %r1, 12296(%r11)
-; CHECK-FP: mvhi 0(%r1), 42
+; CHECK-FP: lhi [[TMP:%r[0-5]]], 42
+; CHECK-FP: sty [[TMP]], 12296(%r11)
; CHECK-FP: br %r14
%region1 = alloca [2006 x i32], align 8
%region2 = alloca [2006 x i32], align 8
diff --git a/test/CodeGen/SystemZ/frame-14.ll b/test/CodeGen/SystemZ/frame-14.ll
index 22a45ee..3b48179 100644
--- a/test/CodeGen/SystemZ/frame-14.ll
+++ b/test/CodeGen/SystemZ/frame-14.ll
@@ -1,9 +1,13 @@
; Test the handling of base + displacement addresses for large frames,
; in cases where both 12-bit and 20-bit displacements are allowed.
+; The tests here assume z10 register pressure, without the high words
+; being available.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | \
+; RUN: FileCheck -check-prefix=CHECK-NOFP %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -disable-fp-elim | \
+; RUN: FileCheck -check-prefix=CHECK-FP %s
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefix=CHECK-NOFP %s
-; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s
-
; This file tests what happens when a displacement is converted from
; being relative to the start of a frame object to being relative to
; the frame itself. In some cases the test is only possible if two
diff --git a/test/CodeGen/SystemZ/frame-15.ll b/test/CodeGen/SystemZ/frame-15.ll
index d8b291d..b3c95e7 100644
--- a/test/CodeGen/SystemZ/frame-15.ll
+++ b/test/CodeGen/SystemZ/frame-15.ll
@@ -1,8 +1,11 @@
; Test the handling of base + index + 12-bit displacement addresses for
-; large frames, in cases where no 20-bit form exists.
+; large frames, in cases where no 20-bit form exists. The tests here
+; assume z10 register pressure, without the high words being available.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefix=CHECK-NOFP %s
-; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | \
+; RUN: FileCheck -check-prefix=CHECK-NOFP %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -disable-fp-elim | \
+; RUN: FileCheck -check-prefix=CHECK-FP %s
declare void @foo(float *%ptr1, float *%ptr2)
diff --git a/test/CodeGen/SystemZ/frame-16.ll b/test/CodeGen/SystemZ/frame-16.ll
index 9f43b49..f7e2dfa 100644
--- a/test/CodeGen/SystemZ/frame-16.ll
+++ b/test/CodeGen/SystemZ/frame-16.ll
@@ -1,8 +1,12 @@
; Test the handling of base + index + displacement addresses for large frames,
; in cases where both 12-bit and 20-bit displacements are allowed.
+; The tests here assume z10 register pressure, without the high words
+; being available.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck -check-prefix=CHECK-NOFP %s
-; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-fp-elim | FileCheck -check-prefix=CHECK-FP %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | \
+; RUN: FileCheck -check-prefix=CHECK-NOFP %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -disable-fp-elim | \
+; RUN: FileCheck -check-prefix=CHECK-FP %s
; This file tests what happens when a displacement is converted from
; being relative to the start of a frame object to being relative to
diff --git a/test/CodeGen/SystemZ/frame-18.ll b/test/CodeGen/SystemZ/frame-18.ll
index 57d6f7d..21dfc12 100644
--- a/test/CodeGen/SystemZ/frame-18.ll
+++ b/test/CodeGen/SystemZ/frame-18.ll
@@ -1,6 +1,7 @@
-; Test spilling of GPRs.
+; Test spilling of GPRs. The tests here assume z10 register pressure,
+; without the high words being available.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
; We need to allocate a 4-byte spill slot, rounded to 8 bytes. The frame
; size should be exactly 160 + 8 = 168.
diff --git a/test/CodeGen/SystemZ/insert-06.ll b/test/CodeGen/SystemZ/insert-06.ll
index 8366b2c..edcd0c5 100644
--- a/test/CodeGen/SystemZ/insert-06.ll
+++ b/test/CodeGen/SystemZ/insert-06.ll
@@ -165,3 +165,16 @@ define i64 @f13(i64 %a, i32 %b) {
%or = or i64 %shift, %low
ret i64 %or
}
+
+; We previously wrongly removed the upper AND as dead.
+define i64 @f14(i64 %a, i64 %b) {
+; CHECK-LABEL: f14:
+; CHECK: risbg {{%r[0-5]}}, %r2, 6, 134, 0
+; CHECK: br %r14
+ %and1 = and i64 %a, 144115188075855872
+ %and2 = and i64 %b, 15
+ %or = or i64 %and1, %and2
+ %res = icmp eq i64 %or, 0
+ %ext = sext i1 %res to i64
+ ret i64 %ext
+}
diff --git a/test/CodeGen/SystemZ/int-abs-01.ll b/test/CodeGen/SystemZ/int-abs-01.ll
new file mode 100644
index 0000000..40fb611
--- /dev/null
+++ b/test/CodeGen/SystemZ/int-abs-01.ll
@@ -0,0 +1,83 @@
+; Test integer absolute.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Test i32->i32 absolute using slt.
+define i32 @f1(i32 %val) {
+; CHECK-LABEL: f1:
+; CHECK: lpr %r2, %r2
+; CHECK: br %r14
+ %cmp = icmp slt i32 %val, 0
+ %neg = sub i32 0, %val
+ %res = select i1 %cmp, i32 %neg, i32 %val
+ ret i32 %res
+}
+
+; Test i32->i32 absolute using sle.
+define i32 @f2(i32 %val) {
+; CHECK-LABEL: f2:
+; CHECK: lpr %r2, %r2
+; CHECK: br %r14
+ %cmp = icmp sle i32 %val, 0
+ %neg = sub i32 0, %val
+ %res = select i1 %cmp, i32 %neg, i32 %val
+ ret i32 %res
+}
+
+; Test i32->i32 absolute using sgt.
+define i32 @f3(i32 %val) {
+; CHECK-LABEL: f3:
+; CHECK: lpr %r2, %r2
+; CHECK: br %r14
+ %cmp = icmp sgt i32 %val, 0
+ %neg = sub i32 0, %val
+ %res = select i1 %cmp, i32 %val, i32 %neg
+ ret i32 %res
+}
+
+; Test i32->i32 absolute using sge.
+define i32 @f4(i32 %val) {
+; CHECK-LABEL: f4:
+; CHECK: lpr %r2, %r2
+; CHECK: br %r14
+ %cmp = icmp sge i32 %val, 0
+ %neg = sub i32 0, %val
+ %res = select i1 %cmp, i32 %val, i32 %neg
+ ret i32 %res
+}
+
+; Test i32->i64 absolute.
+define i64 @f5(i32 %val) {
+; CHECK-LABEL: f5:
+; CHECK: lpgfr %r2, %r2
+; CHECK: br %r14
+ %ext = sext i32 %val to i64
+ %cmp = icmp slt i64 %ext, 0
+ %neg = sub i64 0, %ext
+ %res = select i1 %cmp, i64 %neg, i64 %ext
+ ret i64 %res
+}
+
+; Test i32->i64 absolute that uses an "in-register" form of sign extension.
+define i64 @f6(i64 %val) {
+; CHECK-LABEL: f6:
+; CHECK: lpgfr %r2, %r2
+; CHECK: br %r14
+ %trunc = trunc i64 %val to i32
+ %ext = sext i32 %trunc to i64
+ %cmp = icmp slt i64 %ext, 0
+ %neg = sub i64 0, %ext
+ %res = select i1 %cmp, i64 %neg, i64 %ext
+ ret i64 %res
+}
+
+; Test i64 absolute.
+define i64 @f7(i64 %val) {
+; CHECK-LABEL: f7:
+; CHECK: lpgr %r2, %r2
+; CHECK: br %r14
+ %cmp = icmp slt i64 %val, 0
+ %neg = sub i64 0, %val
+ %res = select i1 %cmp, i64 %neg, i64 %val
+ ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/int-add-09.ll b/test/CodeGen/SystemZ/int-add-09.ll
index 717fed0..fd151a7 100644
--- a/test/CodeGen/SystemZ/int-add-09.ll
+++ b/test/CodeGen/SystemZ/int-add-09.ll
@@ -7,7 +7,7 @@
define void @f1(i128 *%aptr) {
; CHECK-LABEL: f1:
; CHECK: algfi {{%r[0-5]}}, 1
-; CHECK: alcgr
+; CHECK: alcg
; CHECK: br %r14
%a = load i128 *%aptr
%xor = xor i128 %a, 128
@@ -20,7 +20,7 @@ define void @f1(i128 *%aptr) {
define void @f2(i128 *%aptr) {
; CHECK-LABEL: f2:
; CHECK: algfi {{%r[0-5]}}, 4294967295
-; CHECK: alcgr
+; CHECK: alcg
; CHECK: br %r14
%a = load i128 *%aptr
%xor = xor i128 %a, 128
@@ -33,7 +33,7 @@ define void @f2(i128 *%aptr) {
define void @f3(i128 *%aptr) {
; CHECK-LABEL: f3:
; CHECK: algr
-; CHECK: alcgr
+; CHECK: alcg
; CHECK: br %r14
%a = load i128 *%aptr
%xor = xor i128 %a, 128
@@ -46,7 +46,7 @@ define void @f3(i128 *%aptr) {
define void @f4(i128 *%aptr) {
; CHECK-LABEL: f4:
; CHECK: algr
-; CHECK: alcgr
+; CHECK: alcg
; CHECK: br %r14
%a = load i128 *%aptr
%xor = xor i128 %a, 128
diff --git a/test/CodeGen/SystemZ/int-add-10.ll b/test/CodeGen/SystemZ/int-add-10.ll
index 66a275b..01d0a66 100644
--- a/test/CodeGen/SystemZ/int-add-10.ll
+++ b/test/CodeGen/SystemZ/int-add-10.ll
@@ -7,7 +7,7 @@
define void @f1(i128 *%aptr, i32 %b) {
; CHECK-LABEL: f1:
; CHECK: algfr {{%r[0-5]}}, %r3
-; CHECK: alcgr
+; CHECK: alcg
; CHECK: br %r14
%a = load i128 *%aptr
%xor = xor i128 %a, 127
@@ -21,7 +21,7 @@ define void @f1(i128 *%aptr, i32 %b) {
define void @f2(i128 *%aptr, i64 %b) {
; CHECK-LABEL: f2:
; CHECK: algfr {{%r[0-5]}}, %r3
-; CHECK: alcgr
+; CHECK: alcg
; CHECK: br %r14
%a = load i128 *%aptr
%xor = xor i128 %a, 127
@@ -37,7 +37,7 @@ define void @f2(i128 *%aptr, i64 %b) {
define void @f3(i128 *%aptr, i64 %b) {
; CHECK-LABEL: f3:
; CHECK: algfr {{%r[0-5]}}, %r3
-; CHECK: alcgr
+; CHECK: alcg
; CHECK: br %r14
%a = load i128 *%aptr
%xor = xor i128 %a, 127
@@ -52,7 +52,7 @@ define void @f3(i128 *%aptr, i64 %b) {
define void @f4(i128 *%aptr, i32 *%bsrc) {
; CHECK-LABEL: f4:
; CHECK: algf {{%r[0-5]}}, 0(%r3)
-; CHECK: alcgr
+; CHECK: alcg
; CHECK: br %r14
%a = load i128 *%aptr
%xor = xor i128 %a, 127
@@ -67,7 +67,7 @@ define void @f4(i128 *%aptr, i32 *%bsrc) {
define void @f5(i128 *%aptr, i32 *%bsrc) {
; CHECK-LABEL: f5:
; CHECK: algf {{%r[0-5]}}, 524284(%r3)
-; CHECK: alcgr
+; CHECK: alcg
; CHECK: br %r14
%a = load i128 *%aptr
%xor = xor i128 %a, 127
@@ -85,7 +85,7 @@ define void @f6(i128 *%aptr, i32 *%bsrc) {
; CHECK-LABEL: f6:
; CHECK: agfi %r3, 524288
; CHECK: algf {{%r[0-5]}}, 0(%r3)
-; CHECK: alcgr
+; CHECK: alcg
; CHECK: br %r14
%a = load i128 *%aptr
%xor = xor i128 %a, 127
@@ -101,7 +101,7 @@ define void @f6(i128 *%aptr, i32 *%bsrc) {
define void @f7(i128 *%aptr, i32 *%bsrc) {
; CHECK-LABEL: f7:
; CHECK: algf {{%r[0-5]}}, -4(%r3)
-; CHECK: alcgr
+; CHECK: alcg
; CHECK: br %r14
%a = load i128 *%aptr
%xor = xor i128 %a, 127
@@ -117,7 +117,7 @@ define void @f7(i128 *%aptr, i32 *%bsrc) {
define void @f8(i128 *%aptr, i32 *%bsrc) {
; CHECK-LABEL: f8:
; CHECK: algf {{%r[0-5]}}, -524288(%r3)
-; CHECK: alcgr
+; CHECK: alcg
; CHECK: br %r14
%a = load i128 *%aptr
%xor = xor i128 %a, 127
@@ -135,7 +135,7 @@ define void @f9(i128 *%aptr, i32 *%bsrc) {
; CHECK-LABEL: f9:
; CHECK: agfi %r3, -524292
; CHECK: algf {{%r[0-5]}}, 0(%r3)
-; CHECK: alcgr
+; CHECK: alcg
; CHECK: br %r14
%a = load i128 *%aptr
%xor = xor i128 %a, 127
diff --git a/test/CodeGen/SystemZ/int-add-11.ll b/test/CodeGen/SystemZ/int-add-11.ll
index 6c617ba..679c206 100644
--- a/test/CodeGen/SystemZ/int-add-11.ll
+++ b/test/CodeGen/SystemZ/int-add-11.ll
@@ -1,6 +1,7 @@
-; Test 32-bit additions of constants to memory.
+; Test 32-bit additions of constants to memory. The tests here
+; assume z10 register pressure, without the high words being available.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
; Check additions of 1.
define void @f1(i32 *%ptr) {
@@ -126,3 +127,169 @@ define void @f10(i64 %base, i64 %index) {
store i32 %add, i32 *%ptr
ret void
}
+
+; Check that adding 127 to a spilled value can use ASI.
+define void @f11(i32 *%ptr, i32 %sel) {
+; CHECK-LABEL: f11:
+; CHECK: asi {{[0-9]+}}(%r15), 127
+; CHECK: br %r14
+entry:
+ %val0 = load volatile i32 *%ptr
+ %val1 = load volatile i32 *%ptr
+ %val2 = load volatile i32 *%ptr
+ %val3 = load volatile i32 *%ptr
+ %val4 = load volatile i32 *%ptr
+ %val5 = load volatile i32 *%ptr
+ %val6 = load volatile i32 *%ptr
+ %val7 = load volatile i32 *%ptr
+ %val8 = load volatile i32 *%ptr
+ %val9 = load volatile i32 *%ptr
+ %val10 = load volatile i32 *%ptr
+ %val11 = load volatile i32 *%ptr
+ %val12 = load volatile i32 *%ptr
+ %val13 = load volatile i32 *%ptr
+ %val14 = load volatile i32 *%ptr
+ %val15 = load volatile i32 *%ptr
+
+ %test = icmp ne i32 %sel, 0
+ br i1 %test, label %add, label %store
+
+add:
+ %add0 = add i32 %val0, 127
+ %add1 = add i32 %val1, 127
+ %add2 = add i32 %val2, 127
+ %add3 = add i32 %val3, 127
+ %add4 = add i32 %val4, 127
+ %add5 = add i32 %val5, 127
+ %add6 = add i32 %val6, 127
+ %add7 = add i32 %val7, 127
+ %add8 = add i32 %val8, 127
+ %add9 = add i32 %val9, 127
+ %add10 = add i32 %val10, 127
+ %add11 = add i32 %val11, 127
+ %add12 = add i32 %val12, 127
+ %add13 = add i32 %val13, 127
+ %add14 = add i32 %val14, 127
+ %add15 = add i32 %val15, 127
+ br label %store
+
+store:
+ %new0 = phi i32 [ %val0, %entry ], [ %add0, %add ]
+ %new1 = phi i32 [ %val1, %entry ], [ %add1, %add ]
+ %new2 = phi i32 [ %val2, %entry ], [ %add2, %add ]
+ %new3 = phi i32 [ %val3, %entry ], [ %add3, %add ]
+ %new4 = phi i32 [ %val4, %entry ], [ %add4, %add ]
+ %new5 = phi i32 [ %val5, %entry ], [ %add5, %add ]
+ %new6 = phi i32 [ %val6, %entry ], [ %add6, %add ]
+ %new7 = phi i32 [ %val7, %entry ], [ %add7, %add ]
+ %new8 = phi i32 [ %val8, %entry ], [ %add8, %add ]
+ %new9 = phi i32 [ %val9, %entry ], [ %add9, %add ]
+ %new10 = phi i32 [ %val10, %entry ], [ %add10, %add ]
+ %new11 = phi i32 [ %val11, %entry ], [ %add11, %add ]
+ %new12 = phi i32 [ %val12, %entry ], [ %add12, %add ]
+ %new13 = phi i32 [ %val13, %entry ], [ %add13, %add ]
+ %new14 = phi i32 [ %val14, %entry ], [ %add14, %add ]
+ %new15 = phi i32 [ %val15, %entry ], [ %add15, %add ]
+
+ store volatile i32 %new0, i32 *%ptr
+ store volatile i32 %new1, i32 *%ptr
+ store volatile i32 %new2, i32 *%ptr
+ store volatile i32 %new3, i32 *%ptr
+ store volatile i32 %new4, i32 *%ptr
+ store volatile i32 %new5, i32 *%ptr
+ store volatile i32 %new6, i32 *%ptr
+ store volatile i32 %new7, i32 *%ptr
+ store volatile i32 %new8, i32 *%ptr
+ store volatile i32 %new9, i32 *%ptr
+ store volatile i32 %new10, i32 *%ptr
+ store volatile i32 %new11, i32 *%ptr
+ store volatile i32 %new12, i32 *%ptr
+ store volatile i32 %new13, i32 *%ptr
+ store volatile i32 %new14, i32 *%ptr
+ store volatile i32 %new15, i32 *%ptr
+
+ ret void
+}
+
+; Check that adding -128 to a spilled value can use ASI.
+define void @f12(i32 *%ptr, i32 %sel) {
+; CHECK-LABEL: f12:
+; CHECK: asi {{[0-9]+}}(%r15), -128
+; CHECK: br %r14
+entry:
+ %val0 = load volatile i32 *%ptr
+ %val1 = load volatile i32 *%ptr
+ %val2 = load volatile i32 *%ptr
+ %val3 = load volatile i32 *%ptr
+ %val4 = load volatile i32 *%ptr
+ %val5 = load volatile i32 *%ptr
+ %val6 = load volatile i32 *%ptr
+ %val7 = load volatile i32 *%ptr
+ %val8 = load volatile i32 *%ptr
+ %val9 = load volatile i32 *%ptr
+ %val10 = load volatile i32 *%ptr
+ %val11 = load volatile i32 *%ptr
+ %val12 = load volatile i32 *%ptr
+ %val13 = load volatile i32 *%ptr
+ %val14 = load volatile i32 *%ptr
+ %val15 = load volatile i32 *%ptr
+
+ %test = icmp ne i32 %sel, 0
+ br i1 %test, label %add, label %store
+
+add:
+ %add0 = add i32 %val0, -128
+ %add1 = add i32 %val1, -128
+ %add2 = add i32 %val2, -128
+ %add3 = add i32 %val3, -128
+ %add4 = add i32 %val4, -128
+ %add5 = add i32 %val5, -128
+ %add6 = add i32 %val6, -128
+ %add7 = add i32 %val7, -128
+ %add8 = add i32 %val8, -128
+ %add9 = add i32 %val9, -128
+ %add10 = add i32 %val10, -128
+ %add11 = add i32 %val11, -128
+ %add12 = add i32 %val12, -128
+ %add13 = add i32 %val13, -128
+ %add14 = add i32 %val14, -128
+ %add15 = add i32 %val15, -128
+ br label %store
+
+store:
+ %new0 = phi i32 [ %val0, %entry ], [ %add0, %add ]
+ %new1 = phi i32 [ %val1, %entry ], [ %add1, %add ]
+ %new2 = phi i32 [ %val2, %entry ], [ %add2, %add ]
+ %new3 = phi i32 [ %val3, %entry ], [ %add3, %add ]
+ %new4 = phi i32 [ %val4, %entry ], [ %add4, %add ]
+ %new5 = phi i32 [ %val5, %entry ], [ %add5, %add ]
+ %new6 = phi i32 [ %val6, %entry ], [ %add6, %add ]
+ %new7 = phi i32 [ %val7, %entry ], [ %add7, %add ]
+ %new8 = phi i32 [ %val8, %entry ], [ %add8, %add ]
+ %new9 = phi i32 [ %val9, %entry ], [ %add9, %add ]
+ %new10 = phi i32 [ %val10, %entry ], [ %add10, %add ]
+ %new11 = phi i32 [ %val11, %entry ], [ %add11, %add ]
+ %new12 = phi i32 [ %val12, %entry ], [ %add12, %add ]
+ %new13 = phi i32 [ %val13, %entry ], [ %add13, %add ]
+ %new14 = phi i32 [ %val14, %entry ], [ %add14, %add ]
+ %new15 = phi i32 [ %val15, %entry ], [ %add15, %add ]
+
+ store volatile i32 %new0, i32 *%ptr
+ store volatile i32 %new1, i32 *%ptr
+ store volatile i32 %new2, i32 *%ptr
+ store volatile i32 %new3, i32 *%ptr
+ store volatile i32 %new4, i32 *%ptr
+ store volatile i32 %new5, i32 *%ptr
+ store volatile i32 %new6, i32 *%ptr
+ store volatile i32 %new7, i32 *%ptr
+ store volatile i32 %new8, i32 *%ptr
+ store volatile i32 %new9, i32 *%ptr
+ store volatile i32 %new10, i32 *%ptr
+ store volatile i32 %new11, i32 *%ptr
+ store volatile i32 %new12, i32 *%ptr
+ store volatile i32 %new13, i32 *%ptr
+ store volatile i32 %new14, i32 *%ptr
+ store volatile i32 %new15, i32 *%ptr
+
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/int-add-12.ll b/test/CodeGen/SystemZ/int-add-12.ll
index ef4dc39..741cce1 100644
--- a/test/CodeGen/SystemZ/int-add-12.ll
+++ b/test/CodeGen/SystemZ/int-add-12.ll
@@ -126,3 +126,169 @@ define void @f10(i64 %base, i64 %index) {
store i64 %add, i64 *%ptr
ret void
}
+
+; Check that adding 127 to a spilled value can use AGSI.
+define void @f11(i64 *%ptr, i32 %sel) {
+; CHECK-LABEL: f11:
+; CHECK: agsi {{[0-9]+}}(%r15), 127
+; CHECK: br %r14
+entry:
+ %val0 = load volatile i64 *%ptr
+ %val1 = load volatile i64 *%ptr
+ %val2 = load volatile i64 *%ptr
+ %val3 = load volatile i64 *%ptr
+ %val4 = load volatile i64 *%ptr
+ %val5 = load volatile i64 *%ptr
+ %val6 = load volatile i64 *%ptr
+ %val7 = load volatile i64 *%ptr
+ %val8 = load volatile i64 *%ptr
+ %val9 = load volatile i64 *%ptr
+ %val10 = load volatile i64 *%ptr
+ %val11 = load volatile i64 *%ptr
+ %val12 = load volatile i64 *%ptr
+ %val13 = load volatile i64 *%ptr
+ %val14 = load volatile i64 *%ptr
+ %val15 = load volatile i64 *%ptr
+
+ %test = icmp ne i32 %sel, 0
+ br i1 %test, label %add, label %store
+
+add:
+ %add0 = add i64 %val0, 127
+ %add1 = add i64 %val1, 127
+ %add2 = add i64 %val2, 127
+ %add3 = add i64 %val3, 127
+ %add4 = add i64 %val4, 127
+ %add5 = add i64 %val5, 127
+ %add6 = add i64 %val6, 127
+ %add7 = add i64 %val7, 127
+ %add8 = add i64 %val8, 127
+ %add9 = add i64 %val9, 127
+ %add10 = add i64 %val10, 127
+ %add11 = add i64 %val11, 127
+ %add12 = add i64 %val12, 127
+ %add13 = add i64 %val13, 127
+ %add14 = add i64 %val14, 127
+ %add15 = add i64 %val15, 127
+ br label %store
+
+store:
+ %new0 = phi i64 [ %val0, %entry ], [ %add0, %add ]
+ %new1 = phi i64 [ %val1, %entry ], [ %add1, %add ]
+ %new2 = phi i64 [ %val2, %entry ], [ %add2, %add ]
+ %new3 = phi i64 [ %val3, %entry ], [ %add3, %add ]
+ %new4 = phi i64 [ %val4, %entry ], [ %add4, %add ]
+ %new5 = phi i64 [ %val5, %entry ], [ %add5, %add ]
+ %new6 = phi i64 [ %val6, %entry ], [ %add6, %add ]
+ %new7 = phi i64 [ %val7, %entry ], [ %add7, %add ]
+ %new8 = phi i64 [ %val8, %entry ], [ %add8, %add ]
+ %new9 = phi i64 [ %val9, %entry ], [ %add9, %add ]
+ %new10 = phi i64 [ %val10, %entry ], [ %add10, %add ]
+ %new11 = phi i64 [ %val11, %entry ], [ %add11, %add ]
+ %new12 = phi i64 [ %val12, %entry ], [ %add12, %add ]
+ %new13 = phi i64 [ %val13, %entry ], [ %add13, %add ]
+ %new14 = phi i64 [ %val14, %entry ], [ %add14, %add ]
+ %new15 = phi i64 [ %val15, %entry ], [ %add15, %add ]
+
+ store volatile i64 %new0, i64 *%ptr
+ store volatile i64 %new1, i64 *%ptr
+ store volatile i64 %new2, i64 *%ptr
+ store volatile i64 %new3, i64 *%ptr
+ store volatile i64 %new4, i64 *%ptr
+ store volatile i64 %new5, i64 *%ptr
+ store volatile i64 %new6, i64 *%ptr
+ store volatile i64 %new7, i64 *%ptr
+ store volatile i64 %new8, i64 *%ptr
+ store volatile i64 %new9, i64 *%ptr
+ store volatile i64 %new10, i64 *%ptr
+ store volatile i64 %new11, i64 *%ptr
+ store volatile i64 %new12, i64 *%ptr
+ store volatile i64 %new13, i64 *%ptr
+ store volatile i64 %new14, i64 *%ptr
+ store volatile i64 %new15, i64 *%ptr
+
+ ret void
+}
+
+; Check that adding -128 to a spilled value can use AGSI.
+define void @f12(i64 *%ptr, i32 %sel) {
+; CHECK-LABEL: f12:
+; CHECK: agsi {{[0-9]+}}(%r15), -128
+; CHECK: br %r14
+entry:
+ %val0 = load volatile i64 *%ptr
+ %val1 = load volatile i64 *%ptr
+ %val2 = load volatile i64 *%ptr
+ %val3 = load volatile i64 *%ptr
+ %val4 = load volatile i64 *%ptr
+ %val5 = load volatile i64 *%ptr
+ %val6 = load volatile i64 *%ptr
+ %val7 = load volatile i64 *%ptr
+ %val8 = load volatile i64 *%ptr
+ %val9 = load volatile i64 *%ptr
+ %val10 = load volatile i64 *%ptr
+ %val11 = load volatile i64 *%ptr
+ %val12 = load volatile i64 *%ptr
+ %val13 = load volatile i64 *%ptr
+ %val14 = load volatile i64 *%ptr
+ %val15 = load volatile i64 *%ptr
+
+ %test = icmp ne i32 %sel, 0
+ br i1 %test, label %add, label %store
+
+add:
+ %add0 = add i64 %val0, -128
+ %add1 = add i64 %val1, -128
+ %add2 = add i64 %val2, -128
+ %add3 = add i64 %val3, -128
+ %add4 = add i64 %val4, -128
+ %add5 = add i64 %val5, -128
+ %add6 = add i64 %val6, -128
+ %add7 = add i64 %val7, -128
+ %add8 = add i64 %val8, -128
+ %add9 = add i64 %val9, -128
+ %add10 = add i64 %val10, -128
+ %add11 = add i64 %val11, -128
+ %add12 = add i64 %val12, -128
+ %add13 = add i64 %val13, -128
+ %add14 = add i64 %val14, -128
+ %add15 = add i64 %val15, -128
+ br label %store
+
+store:
+ %new0 = phi i64 [ %val0, %entry ], [ %add0, %add ]
+ %new1 = phi i64 [ %val1, %entry ], [ %add1, %add ]
+ %new2 = phi i64 [ %val2, %entry ], [ %add2, %add ]
+ %new3 = phi i64 [ %val3, %entry ], [ %add3, %add ]
+ %new4 = phi i64 [ %val4, %entry ], [ %add4, %add ]
+ %new5 = phi i64 [ %val5, %entry ], [ %add5, %add ]
+ %new6 = phi i64 [ %val6, %entry ], [ %add6, %add ]
+ %new7 = phi i64 [ %val7, %entry ], [ %add7, %add ]
+ %new8 = phi i64 [ %val8, %entry ], [ %add8, %add ]
+ %new9 = phi i64 [ %val9, %entry ], [ %add9, %add ]
+ %new10 = phi i64 [ %val10, %entry ], [ %add10, %add ]
+ %new11 = phi i64 [ %val11, %entry ], [ %add11, %add ]
+ %new12 = phi i64 [ %val12, %entry ], [ %add12, %add ]
+ %new13 = phi i64 [ %val13, %entry ], [ %add13, %add ]
+ %new14 = phi i64 [ %val14, %entry ], [ %add14, %add ]
+ %new15 = phi i64 [ %val15, %entry ], [ %add15, %add ]
+
+ store volatile i64 %new0, i64 *%ptr
+ store volatile i64 %new1, i64 *%ptr
+ store volatile i64 %new2, i64 *%ptr
+ store volatile i64 %new3, i64 *%ptr
+ store volatile i64 %new4, i64 *%ptr
+ store volatile i64 %new5, i64 *%ptr
+ store volatile i64 %new6, i64 *%ptr
+ store volatile i64 %new7, i64 *%ptr
+ store volatile i64 %new8, i64 *%ptr
+ store volatile i64 %new9, i64 *%ptr
+ store volatile i64 %new10, i64 *%ptr
+ store volatile i64 %new11, i64 *%ptr
+ store volatile i64 %new12, i64 *%ptr
+ store volatile i64 %new13, i64 *%ptr
+ store volatile i64 %new14, i64 *%ptr
+ store volatile i64 %new15, i64 *%ptr
+
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-01.ll b/test/CodeGen/SystemZ/int-cmp-01.ll
index dbfe0df..6653b6f 100644
--- a/test/CodeGen/SystemZ/int-cmp-01.ll
+++ b/test/CodeGen/SystemZ/int-cmp-01.ll
@@ -149,3 +149,17 @@ define void @f10(i32 %lhs, i64 %base, i64 %index, i32 *%dst) {
store i32 %res, i32 *%dst
ret void
}
+
+; Check the comparison can be reversed if that allows CH to be used.
+define double @f11(double %a, double %b, i32 %rhs, i16 *%src) {
+; CHECK-LABEL: f11:
+; CHECK: ch %r2, 0(%r3)
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+ %half = load i16 *%src
+ %lhs = sext i16 %half to i32
+ %cond = icmp slt i32 %lhs, %rhs
+ %res = select i1 %cond, double %a, double %b
+ ret double %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-02.ll b/test/CodeGen/SystemZ/int-cmp-02.ll
index 26e1391..4a8a1a9 100644
--- a/test/CodeGen/SystemZ/int-cmp-02.ll
+++ b/test/CodeGen/SystemZ/int-cmp-02.ll
@@ -181,3 +181,16 @@ while.body:
while.end:
ret void
}
+
+; Check the comparison can be reversed if that allows C to be used.
+define double @f13(double %a, double %b, i32 %i2, i32 *%ptr) {
+; CHECK-LABEL: f13:
+; CHECK: c %r2, 0(%r3)
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+ %i1 = load i32 *%ptr
+ %cond = icmp slt i32 %i1, %i2
+ %res = select i1 %cond, double %a, double %b
+ ret double %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-03.ll b/test/CodeGen/SystemZ/int-cmp-03.ll
index 2d679cf..aa654e0 100644
--- a/test/CodeGen/SystemZ/int-cmp-03.ll
+++ b/test/CodeGen/SystemZ/int-cmp-03.ll
@@ -5,8 +5,7 @@
; Check register comparison.
define double @f1(double %a, double %b, i32 %i1, i32 %i2) {
; CHECK-LABEL: f1:
-; CHECK: clr %r2, %r3
-; CHECK-NEXT: jl
+; CHECK: clrjl %r2, %r3
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ult i32 %i1, %i2
@@ -160,3 +159,16 @@ define double @f11(double %a, double %b, i32 %i1, i64 %base, i64 %index) {
%res = select i1 %cond, double %a, double %b
ret double %res
}
+
+; Check the comparison can be reversed if that allows CL to be used.
+define double @f12(double %a, double %b, i32 %i2, i32 *%ptr) {
+; CHECK-LABEL: f12:
+; CHECK: cl %r2, 0(%r3)
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+ %i1 = load i32 *%ptr
+ %cond = icmp ult i32 %i1, %i2
+ %res = select i1 %cond, double %a, double %b
+ ret double %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-04.ll b/test/CodeGen/SystemZ/int-cmp-04.ll
index 54c4b5b..a6606f3 100644
--- a/test/CodeGen/SystemZ/int-cmp-04.ll
+++ b/test/CodeGen/SystemZ/int-cmp-04.ll
@@ -105,3 +105,17 @@ define void @f7(i64 %lhs, i64 %base, i64 %index, i64 *%dst) {
store i64 %res, i64 *%dst
ret void
}
+
+; Check the comparison can be reversed if that allows CGH to be used.
+define double @f8(double %a, double %b, i64 %rhs, i16 *%src) {
+; CHECK-LABEL: f8:
+; CHECK: cgh %r2, 0(%r3)
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+ %half = load i16 *%src
+ %lhs = sext i16 %half to i64
+ %cond = icmp slt i64 %lhs, %rhs
+ %res = select i1 %cond, double %a, double %b
+ ret double %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-05.ll b/test/CodeGen/SystemZ/int-cmp-05.ll
index 36d12a5..f15b76b 100644
--- a/test/CodeGen/SystemZ/int-cmp-05.ll
+++ b/test/CodeGen/SystemZ/int-cmp-05.ll
@@ -54,7 +54,7 @@ define double @f4(double %a, double %b, i64 %i1, i32 %unext) {
ret double %res
}
-; Check signed comparisonn with memory.
+; Check signed comparison with memory.
define double @f5(double %a, double %b, i64 %i1, i32 *%ptr) {
; CHECK-LABEL: f5:
; CHECK: cgf %r2, 0(%r3)
@@ -290,3 +290,17 @@ define i64 @f15(i32 *%ptr0) {
ret i64 %sel9
}
+
+; Check the comparison can be reversed if that allows CGF to be used.
+define double @f16(double %a, double %b, i64 %i2, i32 *%ptr) {
+; CHECK-LABEL: f16:
+; CHECK: cgf %r2, 0(%r3)
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+ %unext = load i32 *%ptr
+ %i1 = sext i32 %unext to i64
+ %cond = icmp slt i64 %i1, %i2
+ %res = select i1 %cond, double %a, double %b
+ ret double %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-06.ll b/test/CodeGen/SystemZ/int-cmp-06.ll
index cdd6114..8ab62e8 100644
--- a/test/CodeGen/SystemZ/int-cmp-06.ll
+++ b/test/CodeGen/SystemZ/int-cmp-06.ll
@@ -104,7 +104,7 @@ define double @f8(double %a, double %b, i64 %i1, i64 %unext) {
ret double %res
}
-; Check unsigned comparisonn with memory.
+; Check unsigned comparison with memory.
define double @f9(double %a, double %b, i64 %i1, i32 *%ptr) {
; CHECK-LABEL: f9:
; CHECK: clgf %r2, 0(%r3)
@@ -340,3 +340,17 @@ define i64 @f19(i32 *%ptr0) {
ret i64 %sel9
}
+
+; Check the comparison can be reversed if that allows CLGF to be used.
+define double @f20(double %a, double %b, i64 %i2, i32 *%ptr) {
+; CHECK-LABEL: f20:
+; CHECK: clgf %r2, 0(%r3)
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+ %unext = load i32 *%ptr
+ %i1 = zext i32 %unext to i64
+ %cond = icmp ult i64 %i1, %i2
+ %res = select i1 %cond, double %a, double %b
+ ret double %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-07.ll b/test/CodeGen/SystemZ/int-cmp-07.ll
index 3308cb0..530d178 100644
--- a/test/CodeGen/SystemZ/int-cmp-07.ll
+++ b/test/CodeGen/SystemZ/int-cmp-07.ll
@@ -115,3 +115,16 @@ define double @f8(double %a, double %b, i64 %i1, i64 %base, i64 %index) {
%res = select i1 %cond, double %a, double %b
ret double %res
}
+
+; Check the comparison can be reversed if that allows CG to be used.
+define double @f9(double %a, double %b, i64 %i2, i64 *%ptr) {
+; CHECK-LABEL: f9:
+; CHECK: cg %r2, 0(%r3)
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+ %i1 = load i64 *%ptr
+ %cond = icmp slt i64 %i1, %i2
+ %res = select i1 %cond, double %a, double %b
+ ret double %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-08.ll b/test/CodeGen/SystemZ/int-cmp-08.ll
index e68a0fe..ebf158a 100644
--- a/test/CodeGen/SystemZ/int-cmp-08.ll
+++ b/test/CodeGen/SystemZ/int-cmp-08.ll
@@ -5,8 +5,7 @@
; Check CLGR.
define double @f1(double %a, double %b, i64 %i1, i64 %i2) {
; CHECK-LABEL: f1:
-; CHECK: clgr %r2, %r3
-; CHECK-NEXT: jl
+; CHECK: clgrjl %r2, %r3
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ult i64 %i1, %i2
@@ -116,3 +115,16 @@ define double @f8(double %a, double %b, i64 %i1, i64 %base, i64 %index) {
%res = select i1 %cond, double %a, double %b
ret double %res
}
+
+; Check the comparison can be reversed if that allows CLG to be used.
+define double @f9(double %a, double %b, i64 %i2, i64 *%ptr) {
+; CHECK-LABEL: f9:
+; CHECK: clg %r2, 0(%r3)
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+ %i1 = load i64 *%ptr
+ %cond = icmp ult i64 %i1, %i2
+ %res = select i1 %cond, double %a, double %b
+ ret double %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-10.ll b/test/CodeGen/SystemZ/int-cmp-10.ll
index e30e014..4d4c4bb 100644
--- a/test/CodeGen/SystemZ/int-cmp-10.ll
+++ b/test/CodeGen/SystemZ/int-cmp-10.ll
@@ -2,12 +2,11 @@
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-; Check a value near the low end of the range. We use CFI for comparisons
-; with zero, or things that are equivalent to them.
+; Check a value near the low end of the range. We use signed forms for
+; comparisons with zero, or things that are equivalent to them.
define double @f1(double %a, double %b, i32 %i1) {
; CHECK-LABEL: f1:
-; CHECK: clfi %r2, 1
-; CHECK-NEXT: jh
+; CHECK: clijh %r2, 1
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ugt i32 %i1, 1
@@ -15,9 +14,32 @@ define double @f1(double %a, double %b, i32 %i1) {
ret double %res
}
-; Check a value near the high end of the range.
+; Check the top of the CLIJ range.
define double @f2(double %a, double %b, i32 %i1) {
; CHECK-LABEL: f2:
+; CHECK: clijl %r2, 255
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+ %cond = icmp ult i32 %i1, 255
+ %res = select i1 %cond, double %a, double %b
+ ret double %res
+}
+
+; Check the next value up, which needs a separate comparison.
+define double @f3(double %a, double %b, i32 %i1) {
+; CHECK-LABEL: f3:
+; CHECK: clfi %r2, 256
+; CHECK: jl
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+ %cond = icmp ult i32 %i1, 256
+ %res = select i1 %cond, double %a, double %b
+ ret double %res
+}
+
+; Check a value near the high end of the range.
+define double @f4(double %a, double %b, i32 %i1) {
+; CHECK-LABEL: f4:
; CHECK: clfi %r2, 4294967280
; CHECK-NEXT: jl
; CHECK: ldr %f0, %f2
diff --git a/test/CodeGen/SystemZ/int-cmp-12.ll b/test/CodeGen/SystemZ/int-cmp-12.ll
index f57f6ec..077b224 100644
--- a/test/CodeGen/SystemZ/int-cmp-12.ll
+++ b/test/CodeGen/SystemZ/int-cmp-12.ll
@@ -2,12 +2,11 @@
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-; Check a value near the low end of the range. We use CGFI for comparisons
-; with zero, or things that are equivalent to them.
+; Check a value near the low end of the range. We use signed forms for
+; comparisons with zero, or things that are equivalent to them.
define double @f1(double %a, double %b, i64 %i1) {
; CHECK-LABEL: f1:
-; CHECK: clgfi %r2, 1
-; CHECK-NEXT: jh
+; CHECK: clgijh %r2, 1
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ugt i64 %i1, 1
@@ -15,9 +14,32 @@ define double @f1(double %a, double %b, i64 %i1) {
ret double %res
}
-; Check the high end of the CLGFI range.
+; Check the top of the CLGIJ range.
define double @f2(double %a, double %b, i64 %i1) {
; CHECK-LABEL: f2:
+; CHECK: clgijl %r2, 255
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+ %cond = icmp ult i64 %i1, 255
+ %res = select i1 %cond, double %a, double %b
+ ret double %res
+}
+
+; Check the next value up, which needs a separate comparison.
+define double @f3(double %a, double %b, i64 %i1) {
+; CHECK-LABEL: f3:
+; CHECK: clgfi %r2, 256
+; CHECK: jl
+; CHECK: ldr %f0, %f2
+; CHECK: br %r14
+ %cond = icmp ult i64 %i1, 256
+ %res = select i1 %cond, double %a, double %b
+ ret double %res
+}
+
+; Check the high end of the CLGFI range.
+define double @f4(double %a, double %b, i64 %i1) {
+; CHECK-LABEL: f4:
; CHECK: clgfi %r2, 4294967295
; CHECK-NEXT: jl
; CHECK: ldr %f0, %f2
@@ -28,10 +50,9 @@ define double @f2(double %a, double %b, i64 %i1) {
}
; Check the next value up, which must use a register comparison.
-define double @f3(double %a, double %b, i64 %i1) {
-; CHECK-LABEL: f3:
-; CHECK: clgr %r2,
-; CHECK-NEXT: jl
+define double @f5(double %a, double %b, i64 %i1) {
+; CHECK-LABEL: f5:
+; CHECK: clgrjl %r2,
; CHECK: ldr %f0, %f2
; CHECK: br %r14
%cond = icmp ult i64 %i1, 4294967296
diff --git a/test/CodeGen/SystemZ/int-cmp-20.ll b/test/CodeGen/SystemZ/int-cmp-20.ll
index 7ecde77..98c41cd 100644
--- a/test/CodeGen/SystemZ/int-cmp-20.ll
+++ b/test/CodeGen/SystemZ/int-cmp-20.ll
@@ -63,7 +63,7 @@ define double @f4(double %a, double %b, i8 *%ptr) {
; extension. The condition is always true.
define double @f5(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f5:
-; CHECK-NOT: cli
+; CHECK-NOT: cli {{.*}}
; CHECK: br %r14
%val = load i8 *%ptr
%ext = zext i8 %val to i32
@@ -79,7 +79,7 @@ define double @f5(double %a, double %b, i8 *%ptr) {
; and simply ignore CLI for this range. First check the low end of the range.
define double @f6(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f6:
-; CHECK-NOT: cli
+; CHECK-NOT: cli {{.*}}
; CHECK: br %r14
%val = load i8 *%ptr
%ext = sext i8 %val to i32
@@ -91,7 +91,7 @@ define double @f6(double %a, double %b, i8 *%ptr) {
; ...and then the high end.
define double @f7(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f7:
-; CHECK-NOT: cli
+; CHECK-NOT: cli {{.*}}
; CHECK: br %r14
%val = load i8 *%ptr
%ext = sext i8 %val to i32
@@ -118,7 +118,7 @@ define double @f8(double %a, double %b, i8 *%ptr) {
; extension. This cannot use CLI.
define double @f9(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f9:
-; CHECK-NOT: cli
+; CHECK-NOT: cli {{.*}}
; CHECK: br %r14
%val = load i8 *%ptr
%ext = sext i8 %val to i32
@@ -145,7 +145,7 @@ define double @f10(double %a, double %b, i8 *%ptr) {
; extension. This cannot use CLI.
define double @f11(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f11:
-; CHECK-NOT: cli
+; CHECK-NOT: cli {{.*}}
; CHECK: br %r14
%val = load i8 *%ptr
%ext = sext i8 %val to i32
@@ -158,7 +158,7 @@ define double @f11(double %a, double %b, i8 *%ptr) {
; extension. The condition is always true.
define double @f12(double %a, double %b, i8 *%ptr) {
; CHECK-LABEL: f12:
-; CHECK-NOT: cli
+; CHECK-NOT: cli {{.*}}
; CHECK: br %r14
%val = load i8 *%ptr
%ext = zext i8 %val to i32
diff --git a/test/CodeGen/SystemZ/int-cmp-36.ll b/test/CodeGen/SystemZ/int-cmp-36.ll
index 831b05f..fa2d4bf 100644
--- a/test/CodeGen/SystemZ/int-cmp-36.ll
+++ b/test/CodeGen/SystemZ/int-cmp-36.ll
@@ -100,3 +100,22 @@ exit:
%res = phi i32 [ %src1, %entry ], [ %mul, %mulb ]
ret i32 %res
}
+
+; Check the comparison can be reversed if that allows CHRL to be used.
+define i32 @f6(i32 %src2) {
+; CHECK-LABEL: f6:
+; CHECK: chrl %r2, g
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %val = load i16 *@g
+ %src1 = sext i16 %val to i32
+ %cond = icmp slt i32 %src1, %src2
+ br i1 %cond, label %exit, label %mulb
+mulb:
+ %mul = mul i32 %src2, %src2
+ br label %exit
+exit:
+ %res = phi i32 [ %src2, %entry ], [ %mul, %mulb ]
+ ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-37.ll b/test/CodeGen/SystemZ/int-cmp-37.ll
index 97d210e..8095ed1 100644
--- a/test/CodeGen/SystemZ/int-cmp-37.ll
+++ b/test/CodeGen/SystemZ/int-cmp-37.ll
@@ -86,8 +86,7 @@ define i32 @f5(i32 %src1) {
; CHECK-LABEL: f5:
; CHECK: lgrl [[REG:%r[0-5]]], h@GOT
; CHECK: llh [[VAL:%r[0-5]]], 0([[REG]])
-; CHECK: clr %r2, [[VAL]]
-; CHECK-NEXT: jl
+; CHECK: clrjl %r2, [[VAL]],
; CHECK: br %r14
entry:
%val = load i16 *@h, align 1
@@ -101,3 +100,22 @@ exit:
%res = phi i32 [ %src1, %entry ], [ %mul, %mulb ]
ret i32 %res
}
+
+; Check the comparison can be reversed if that allows CLHRL to be used.
+define i32 @f6(i32 %src2) {
+; CHECK-LABEL: f6:
+; CHECK: clhrl %r2, g
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %val = load i16 *@g
+ %src1 = zext i16 %val to i32
+ %cond = icmp ult i32 %src1, %src2
+ br i1 %cond, label %exit, label %mulb
+mulb:
+ %mul = mul i32 %src2, %src2
+ br label %exit
+exit:
+ %res = phi i32 [ %src2, %entry ], [ %mul, %mulb ]
+ ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-38.ll b/test/CodeGen/SystemZ/int-cmp-38.ll
index d5a852c..9017583 100644
--- a/test/CodeGen/SystemZ/int-cmp-38.ll
+++ b/test/CodeGen/SystemZ/int-cmp-38.ll
@@ -115,3 +115,21 @@ exit:
%res = phi i32 [ %src1, %entry ], [ %mul, %mulb ]
ret i32 %res
}
+
+; Check the comparison can be reversed if that allows CRL to be used.
+define i32 @f7(i32 %src2) {
+; CHECK-LABEL: f7:
+; CHECK: crl %r2, g
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %src1 = load i32 *@g
+ %cond = icmp slt i32 %src1, %src2
+ br i1 %cond, label %exit, label %mulb
+mulb:
+ %mul = mul i32 %src2, %src2
+ br label %exit
+exit:
+ %res = phi i32 [ %src2, %entry ], [ %mul, %mulb ]
+ ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-39.ll b/test/CodeGen/SystemZ/int-cmp-39.ll
index d442058..fc9547d 100644
--- a/test/CodeGen/SystemZ/int-cmp-39.ll
+++ b/test/CodeGen/SystemZ/int-cmp-39.ll
@@ -100,3 +100,22 @@ exit:
%res = phi i64 [ %src1, %entry ], [ %mul, %mulb ]
ret i64 %res
}
+
+; Check the comparison can be reversed if that allows CGHRL to be used.
+define i64 @f6(i64 %src2) {
+; CHECK-LABEL: f6:
+; CHECK: cghrl %r2, g
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %val = load i16 *@g
+ %src1 = sext i16 %val to i64
+ %cond = icmp slt i64 %src1, %src2
+ br i1 %cond, label %exit, label %mulb
+mulb:
+ %mul = mul i64 %src2, %src2
+ br label %exit
+exit:
+ %res = phi i64 [ %src2, %entry ], [ %mul, %mulb ]
+ ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-40.ll b/test/CodeGen/SystemZ/int-cmp-40.ll
index 6dab2db..9c532f1 100644
--- a/test/CodeGen/SystemZ/int-cmp-40.ll
+++ b/test/CodeGen/SystemZ/int-cmp-40.ll
@@ -86,8 +86,7 @@ define i64 @f5(i64 %src1) {
; CHECK-LABEL: f5:
; CHECK: lgrl [[REG:%r[0-5]]], h@GOT
; CHECK: llgh [[VAL:%r[0-5]]], 0([[REG]])
-; CHECK: clgr %r2, [[VAL]]
-; CHECK-NEXT: jl
+; CHECK: clgrjl %r2, [[VAL]],
; CHECK: br %r14
entry:
%val = load i16 *@h, align 1
@@ -101,3 +100,22 @@ exit:
%res = phi i64 [ %src1, %entry ], [ %mul, %mulb ]
ret i64 %res
}
+
+; Check the comparison can be reversed if that allows CLGHRL to be used.
+define i64 @f6(i64 %src2) {
+; CHECK-LABEL: f6:
+; CHECK: clghrl %r2, g
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %val = load i16 *@g
+ %src1 = zext i16 %val to i64
+ %cond = icmp ult i64 %src1, %src2
+ br i1 %cond, label %exit, label %mulb
+mulb:
+ %mul = mul i64 %src2, %src2
+ br label %exit
+exit:
+ %res = phi i64 [ %src2, %entry ], [ %mul, %mulb ]
+ ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-41.ll b/test/CodeGen/SystemZ/int-cmp-41.ll
index 099681d..77f6e7d 100644
--- a/test/CodeGen/SystemZ/int-cmp-41.ll
+++ b/test/CodeGen/SystemZ/int-cmp-41.ll
@@ -100,3 +100,22 @@ exit:
%res = phi i64 [ %src1, %entry ], [ %mul, %mulb ]
ret i64 %res
}
+
+; Check the comparison can be reversed if that allows CGFRL to be used.
+define i64 @f6(i64 %src2) {
+; CHECK-LABEL: f6:
+; CHECK: cgfrl %r2, g
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %val = load i32 *@g
+ %src1 = sext i32 %val to i64
+ %cond = icmp slt i64 %src1, %src2
+ br i1 %cond, label %exit, label %mulb
+mulb:
+ %mul = mul i64 %src2, %src2
+ br label %exit
+exit:
+ %res = phi i64 [ %src2, %entry ], [ %mul, %mulb ]
+ ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-42.ll b/test/CodeGen/SystemZ/int-cmp-42.ll
index 26a268d..94ef008 100644
--- a/test/CodeGen/SystemZ/int-cmp-42.ll
+++ b/test/CodeGen/SystemZ/int-cmp-42.ll
@@ -100,3 +100,22 @@ exit:
%res = phi i64 [ %src1, %entry ], [ %mul, %mulb ]
ret i64 %res
}
+
+; Check the comparison can be reversed if that allows CLGFRL to be used.
+define i64 @f6(i64 %src2) {
+; CHECK-LABEL: f6:
+; CHECK: clgfrl %r2, g
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %val = load i32 *@g
+ %src1 = zext i32 %val to i64
+ %cond = icmp ult i64 %src1, %src2
+ br i1 %cond, label %exit, label %mulb
+mulb:
+ %mul = mul i64 %src2, %src2
+ br label %exit
+exit:
+ %res = phi i64 [ %src2, %entry ], [ %mul, %mulb ]
+ ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-43.ll b/test/CodeGen/SystemZ/int-cmp-43.ll
index e5e1390..1a62588 100644
--- a/test/CodeGen/SystemZ/int-cmp-43.ll
+++ b/test/CodeGen/SystemZ/int-cmp-43.ll
@@ -96,3 +96,21 @@ exit:
%res = phi i64 [ %src1, %entry ], [ %mul, %mulb ]
ret i64 %res
}
+
+; Check the comparison can be reversed if that allows CGRL to be used.
+define i64 @f6(i64 %src2) {
+; CHECK-LABEL: f6:
+; CHECK: cgrl %r2, g
+; CHECK-NEXT: jh {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %src1 = load i64 *@g
+ %cond = icmp slt i64 %src1, %src2
+ br i1 %cond, label %exit, label %mulb
+mulb:
+ %mul = mul i64 %src2, %src2
+ br label %exit
+exit:
+ %res = phi i64 [ %src2, %entry ], [ %mul, %mulb ]
+ ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-44.ll b/test/CodeGen/SystemZ/int-cmp-44.ll
index b94f482..ae0133f 100644
--- a/test/CodeGen/SystemZ/int-cmp-44.ll
+++ b/test/CodeGen/SystemZ/int-cmp-44.ll
@@ -203,11 +203,11 @@ exit:
; comparisons with zero if the immediate covers the whole register.
define i32 @f11(i32 %a, i32 %b, i32 *%dest) {
; CHECK-LABEL: f11:
-; CHECK: nilf %r2, 100
+; CHECK: nilf %r2, 100000001
; CHECK-NEXT: jl .L{{.*}}
; CHECK: br %r14
entry:
- %res = and i32 %a, 100
+ %res = and i32 %a, 100000001
%cmp = icmp ne i32 %res, 0
br i1 %cmp, label %exit, label %store
diff --git a/test/CodeGen/SystemZ/int-cmp-46.ll b/test/CodeGen/SystemZ/int-cmp-46.ll
new file mode 100644
index 0000000..f311942
--- /dev/null
+++ b/test/CodeGen/SystemZ/int-cmp-46.ll
@@ -0,0 +1,491 @@
+; Test the use of TEST UNDER MASK for 32-bit operations.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+@g = global i32 0
+
+; Check the lowest useful TMLL value.
+define void @f1(i32 %a) {
+; CHECK-LABEL: f1:
+; CHECK: tmll %r2, 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i32 %a, 1
+ %cmp = icmp eq i32 %and, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check the high end of the TMLL range.
+define void @f2(i32 %a) {
+; CHECK-LABEL: f2:
+; CHECK: tmll %r2, 65535
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i32 %a, 65535
+ %cmp = icmp ne i32 %and, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check the lowest useful TMLH value, which is the next value up.
+define void @f3(i32 %a) {
+; CHECK-LABEL: f3:
+; CHECK: tmlh %r2, 1
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i32 %a, 65536
+ %cmp = icmp ne i32 %and, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check the next value up again, which cannot use TM.
+define void @f4(i32 %a) {
+; CHECK-LABEL: f4:
+; CHECK-NOT: {{tm[lh].}}
+; CHECK: br %r14
+entry:
+ %and = and i32 %a, 4294901759
+ %cmp = icmp eq i32 %and, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check the high end of the TMLH range.
+define void @f5(i32 %a) {
+; CHECK-LABEL: f5:
+; CHECK: tmlh %r2, 65535
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i32 %a, 4294901760
+ %cmp = icmp eq i32 %and, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check that we can use TMLL for LT comparisons that are equivalent to
+; an equality comparison with zero.
+define void @f6(i32 %a) {
+; CHECK-LABEL: f6:
+; CHECK: tmll %r2, 240
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i32 %a, 240
+ %cmp = icmp slt i32 %and, 16
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; ...same again with LE.
+define void @f7(i32 %a) {
+; CHECK-LABEL: f7:
+; CHECK: tmll %r2, 240
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i32 %a, 240
+ %cmp = icmp sle i32 %and, 15
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check that we can use TMLL for GE comparisons that are equivalent to
+; an inequality comparison with zero.
+define void @f8(i32 %a) {
+; CHECK-LABEL: f8:
+; CHECK: tmll %r2, 240
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i32 %a, 240
+ %cmp = icmp uge i32 %and, 16
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; ...same again with GT.
+define void @f9(i32 %a) {
+; CHECK-LABEL: f9:
+; CHECK: tmll %r2, 240
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i32 %a, 240
+ %cmp = icmp ugt i32 %and, 15
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check that we can use TMLL for LT comparisons that effectively
+; test whether the top bit is clear.
+define void @f10(i32 %a) {
+; CHECK-LABEL: f10:
+; CHECK: tmll %r2, 35
+; CHECK: jle {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i32 %a, 35
+ %cmp = icmp ult i32 %and, 8
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; ...same again with LE.
+define void @f11(i32 %a) {
+; CHECK-LABEL: f11:
+; CHECK: tmll %r2, 35
+; CHECK: jle {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i32 %a, 35
+ %cmp = icmp ule i32 %and, 31
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check that we can use TMLL for GE comparisons that effectively test
+; whether the top bit is set.
+define void @f12(i32 %a) {
+; CHECK-LABEL: f12:
+; CHECK: tmll %r2, 140
+; CHECK: jnle {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i32 %a, 140
+ %cmp = icmp uge i32 %and, 128
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; ...same again for GT.
+define void @f13(i32 %a) {
+; CHECK-LABEL: f13:
+; CHECK: tmll %r2, 140
+; CHECK: jnle {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i32 %a, 140
+ %cmp = icmp ugt i32 %and, 126
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check that we can use TMLL for equality comparisons with the mask.
+define void @f14(i32 %a) {
+; CHECK-LABEL: f14:
+; CHECK: tmll %r2, 101
+; CHECK: jo {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i32 %a, 101
+ %cmp = icmp eq i32 %and, 101
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check that we can use TMLL for inequality comparisons with the mask.
+define void @f15(i32 %a) {
+; CHECK-LABEL: f15:
+; CHECK: tmll %r2, 65519
+; CHECK: jno {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i32 %a, 65519
+ %cmp = icmp ne i32 %and, 65519
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check that we can use TMLL for LT comparisons that are equivalent
+; to inequality comparisons with the mask.
+define void @f16(i32 %a) {
+; CHECK-LABEL: f16:
+; CHECK: tmll %r2, 130
+; CHECK: jno {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i32 %a, 130
+ %cmp = icmp ult i32 %and, 129
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; ...same again with LE.
+define void @f17(i32 %a) {
+; CHECK-LABEL: f17:
+; CHECK: tmll %r2, 130
+; CHECK: jno {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i32 %a, 130
+ %cmp = icmp ule i32 %and, 128
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check that we can use TMLL for GE comparisons that are equivalent
+; to equality comparisons with the mask.
+define void @f18(i32 %a) {
+; CHECK-LABEL: f18:
+; CHECK: tmll %r2, 194
+; CHECK: jo {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i32 %a, 194
+ %cmp = icmp uge i32 %and, 193
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; ...same again for GT.
+define void @f19(i32 %a) {
+; CHECK-LABEL: f19:
+; CHECK: tmll %r2, 194
+; CHECK: jo {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i32 %a, 194
+ %cmp = icmp ugt i32 %and, 192
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check that we can use TMLL for equality comparisons for the low bit
+; when the mask has two bits.
+define void @f20(i32 %a) {
+; CHECK-LABEL: f20:
+; CHECK: tmll %r2, 20
+; CHECK: jl {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i32 %a, 20
+ %cmp = icmp eq i32 %and, 4
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check that we can use TMLL for inequality comparisons for the low bit
+; when the mask has two bits.
+define void @f21(i32 %a) {
+; CHECK-LABEL: f21:
+; CHECK: tmll %r2, 20
+; CHECK: jnl {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i32 %a, 20
+ %cmp = icmp ne i32 %and, 4
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check that we can use TMLL for equality comparisons for the high bit
+; when the mask has two bits.
+define void @f22(i32 %a) {
+; CHECK-LABEL: f22:
+; CHECK: tmll %r2, 20
+; CHECK: jh {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i32 %a, 20
+ %cmp = icmp eq i32 %and, 16
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check that we can use TMLL for inequality comparisons for the high bit
+; when the mask has two bits.
+define void @f23(i32 %a) {
+; CHECK-LABEL: f23:
+; CHECK: tmll %r2, 20
+; CHECK: jnh {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i32 %a, 20
+ %cmp = icmp ne i32 %and, 16
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check that we can fold an SHL into a TMxx mask.
+define void @f24(i32 %a) {
+; CHECK-LABEL: f24:
+; CHECK: tmll %r2, 255
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %shl = shl i32 %a, 12
+ %and = and i32 %shl, 1044480
+ %cmp = icmp ne i32 %and, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check that we can fold an SHR into a TMxx mask.
+define void @f25(i32 %a) {
+; CHECK-LABEL: f25:
+; CHECK: tmlh %r2, 512
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %shr = lshr i32 %a, 25
+ %and = and i32 %shr, 1
+ %cmp = icmp ne i32 %and, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-47.ll b/test/CodeGen/SystemZ/int-cmp-47.ll
new file mode 100644
index 0000000..9ebcbfe
--- /dev/null
+++ b/test/CodeGen/SystemZ/int-cmp-47.ll
@@ -0,0 +1,234 @@
+; Test the use of TEST UNDER MASK for 64-bit operations.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+@g = global i32 0
+
+; Check the lowest useful TMLL value.
+define void @f1(i64 %a) {
+; CHECK-LABEL: f1:
+; CHECK: tmll %r2, 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i64 %a, 1
+ %cmp = icmp eq i64 %and, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check the high end of the TMLL range.
+define void @f2(i64 %a) {
+; CHECK-LABEL: f2:
+; CHECK: tmll %r2, 65535
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i64 %a, 65535
+ %cmp = icmp ne i64 %and, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check the lowest useful TMLH value, which is the next value up.
+define void @f3(i64 %a) {
+; CHECK-LABEL: f3:
+; CHECK: tmlh %r2, 1
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i64 %a, 65536
+ %cmp = icmp ne i64 %and, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check the next value up again, which cannot use TM.
+define void @f4(i64 %a) {
+; CHECK-LABEL: f4:
+; CHECK-NOT: {{tm[lh].}}
+; CHECK: br %r14
+entry:
+ %and = and i64 %a, 4294901759
+ %cmp = icmp eq i64 %and, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check the high end of the TMLH range.
+define void @f5(i64 %a) {
+; CHECK-LABEL: f5:
+; CHECK: tmlh %r2, 65535
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i64 %a, 4294901760
+ %cmp = icmp eq i64 %and, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check the lowest useful TMHL value.
+define void @f6(i64 %a) {
+; CHECK-LABEL: f6:
+; CHECK: tmhl %r2, 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i64 %a, 4294967296
+ %cmp = icmp eq i64 %and, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check the next value up again, which cannot use TM.
+define void @f7(i64 %a) {
+; CHECK-LABEL: f7:
+; CHECK-NOT: {{tm[lh].}}
+; CHECK: br %r14
+entry:
+ %and = and i64 %a, 4294967297
+ %cmp = icmp ne i64 %and, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check the high end of the TMHL range.
+define void @f8(i64 %a) {
+; CHECK-LABEL: f8:
+; CHECK: tmhl %r2, 65535
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i64 %a, 281470681743360
+ %cmp = icmp ne i64 %and, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check the lowest useful TMHH value.
+define void @f9(i64 %a) {
+; CHECK-LABEL: f9:
+; CHECK: tmhh %r2, 1
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i64 %a, 281474976710656
+ %cmp = icmp ne i64 %and, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check the high end of the TMHH range.
+define void @f10(i64 %a) {
+; CHECK-LABEL: f10:
+; CHECK: tmhh %r2, 65535
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %and = and i64 %a, 18446462598732840960
+ %cmp = icmp eq i64 %and, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check that we can fold an SHL into a TMxx mask.
+define void @f11(i64 %a) {
+; CHECK-LABEL: f11:
+; CHECK: tmhl %r2, 32768
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %shl = shl i64 %a, 1
+ %and = and i64 %shl, 281474976710656
+ %cmp = icmp ne i64 %and, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check that we can fold an SHR into a TMxx mask.
+define void @f12(i64 %a) {
+; CHECK-LABEL: f12:
+; CHECK: tmhh %r2, 256
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %shr = lshr i64 %a, 56
+ %and = and i64 %shr, 1
+ %cmp = icmp ne i64 %and, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-48.ll b/test/CodeGen/SystemZ/int-cmp-48.ll
new file mode 100644
index 0000000..d7c6370
--- /dev/null
+++ b/test/CodeGen/SystemZ/int-cmp-48.ll
@@ -0,0 +1,245 @@
+; Test the use of TM and TMY.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+@g = global i32 0
+
+; Check a simple branching use of TM.
+define void @f1(i8 *%src) {
+; CHECK-LABEL: f1:
+; CHECK: tm 0(%r2), 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %byte = load i8 *%src
+ %and = and i8 %byte, 1
+ %cmp = icmp eq i8 %and, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+
+; Check that we do not fold across an aliasing store.
+define void @f2(i8 *%src) {
+; CHECK-LABEL: f2:
+; CHECK: llc [[REG:%r[0-5]]], 0(%r2)
+; CHECK: mvi 0(%r2), 0
+; CHECK: tmll [[REG]], 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+entry:
+ %byte = load i8 *%src
+ store i8 0, i8 *%src
+ %and = and i8 %byte, 1
+ %cmp = icmp eq i8 %and, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check a simple select-based use of TM.
+define double @f3(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f3:
+; CHECK: tm 0(%r2), 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+ %byte = load i8 *%src
+ %and = and i8 %byte, 1
+ %cmp = icmp eq i8 %and, 0
+ %res = select i1 %cmp, double %b, double %a
+ ret double %res
+}
+
+; Check that we do not fold across an aliasing store.
+define double @f4(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f4:
+; CHECK: tm 0(%r2), 1
+; CHECK: je {{\.L.*}}
+; CHECK: mvi 0(%r2), 0
+; CHECK: br %r14
+ %byte = load i8 *%src
+ %and = and i8 %byte, 1
+ %cmp = icmp eq i8 %and, 0
+ %res = select i1 %cmp, double %b, double %a
+ store i8 0, i8 *%src
+ ret double %res
+}
+
+; Check an inequality check.
+define double @f5(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f5:
+; CHECK: tm 0(%r2), 1
+; CHECK: jne {{\.L.*}}
+; CHECK: br %r14
+ %byte = load i8 *%src
+ %and = and i8 %byte, 1
+ %cmp = icmp ne i8 %and, 0
+ %res = select i1 %cmp, double %b, double %a
+ ret double %res
+}
+
+; Check that we can also use TM for equality comparisons with the mask.
+define double @f6(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f6:
+; CHECK: tm 0(%r2), 254
+; CHECK: jo {{\.L.*}}
+; CHECK: br %r14
+ %byte = load i8 *%src
+ %and = and i8 %byte, 254
+ %cmp = icmp eq i8 %and, 254
+ %res = select i1 %cmp, double %b, double %a
+ ret double %res
+}
+
+; Check inequality comparisons with the mask.
+define double @f7(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f7:
+; CHECK: tm 0(%r2), 254
+; CHECK: jno {{\.L.*}}
+; CHECK: br %r14
+ %byte = load i8 *%src
+ %and = and i8 %byte, 254
+ %cmp = icmp ne i8 %and, 254
+ %res = select i1 %cmp, double %b, double %a
+ ret double %res
+}
+
+; Check that we do not use the memory TM instruction when CC is being tested
+; for 2.
+define double @f8(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f8:
+; CHECK: llc [[REG:%r[0-5]]], 0(%r2)
+; CHECK: tmll [[REG]], 3
+; CHECK: jh {{\.L.*}}
+; CHECK: br %r14
+ %byte = load i8 *%src
+ %and = and i8 %byte, 3
+ %cmp = icmp eq i8 %and, 2
+ %res = select i1 %cmp, double %b, double %a
+ ret double %res
+}
+
+; ...likewise 1.
+define double @f9(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f9:
+; CHECK: llc [[REG:%r[0-5]]], 0(%r2)
+; CHECK: tmll [[REG]], 3
+; CHECK: jl {{\.L.*}}
+; CHECK: br %r14
+ %byte = load i8 *%src
+ %and = and i8 %byte, 3
+ %cmp = icmp eq i8 %and, 1
+ %res = select i1 %cmp, double %b, double %a
+ ret double %res
+}
+
+; Check the high end of the TM range.
+define double @f10(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f10:
+; CHECK: tm 4095(%r2), 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+ %ptr = getelementptr i8 *%src, i64 4095
+ %byte = load i8 *%ptr
+ %and = and i8 %byte, 1
+ %cmp = icmp eq i8 %and, 0
+ %res = select i1 %cmp, double %b, double %a
+ ret double %res
+}
+
+; Check the low end of the positive TMY range.
+define double @f11(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f11:
+; CHECK: tmy 4096(%r2), 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+ %ptr = getelementptr i8 *%src, i64 4096
+ %byte = load i8 *%ptr
+ %and = and i8 %byte, 1
+ %cmp = icmp eq i8 %and, 0
+ %res = select i1 %cmp, double %b, double %a
+ ret double %res
+}
+
+; Check the high end of the TMY range.
+define double @f12(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f12:
+; CHECK: tmy 524287(%r2), 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+ %ptr = getelementptr i8 *%src, i64 524287
+ %byte = load i8 *%ptr
+ %and = and i8 %byte, 1
+ %cmp = icmp eq i8 %and, 0
+ %res = select i1 %cmp, double %b, double %a
+ ret double %res
+}
+
+; Check the next byte up, which needs separate address logic.
+define double @f13(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f13:
+; CHECK: agfi %r2, 524288
+; CHECK: tm 0(%r2), 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+ %ptr = getelementptr i8 *%src, i64 524288
+ %byte = load i8 *%ptr
+ %and = and i8 %byte, 1
+ %cmp = icmp eq i8 %and, 0
+ %res = select i1 %cmp, double %b, double %a
+ ret double %res
+}
+
+; Check the low end of the TMY range.
+define double @f14(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f14:
+; CHECK: tmy -524288(%r2), 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+ %ptr = getelementptr i8 *%src, i64 -524288
+ %byte = load i8 *%ptr
+ %and = and i8 %byte, 1
+ %cmp = icmp eq i8 %and, 0
+ %res = select i1 %cmp, double %b, double %a
+ ret double %res
+}
+
+; Check the next byte down, which needs separate address logic.
+define double @f15(i8 *%src, double %a, double %b) {
+; CHECK-LABEL: f15:
+; CHECK: agfi %r2, -524289
+; CHECK: tm 0(%r2), 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+ %ptr = getelementptr i8 *%src, i64 -524289
+ %byte = load i8 *%ptr
+ %and = and i8 %byte, 1
+ %cmp = icmp eq i8 %and, 0
+ %res = select i1 %cmp, double %b, double %a
+ ret double %res
+}
+
+; Check that TM(Y) does not allow an index.
+define double @f16(i8 *%src, i64 %index, double %a, double %b) {
+; CHECK-LABEL: f16:
+; CHECK: tm 0({{%r[1-5]}}), 1
+; CHECK: je {{\.L.*}}
+; CHECK: br %r14
+ %ptr = getelementptr i8 *%src, i64 %index
+ %byte = load i8 *%ptr
+ %and = and i8 %byte, 1
+ %cmp = icmp eq i8 %and, 0
+ %res = select i1 %cmp, double %b, double %a
+ ret double %res
+}
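
The functions above exercise two source-level shapes that both lower to TM: a single-bit test that branches when every selected bit is zero (the je cases) and an equality comparison against the full mask that branches when every selected bit is one (jo in f6, jno in f7). A minimal C sketch of those two shapes, with invented function names and no claim about the exact instructions a given compiler emits:

    /* Illustrative C equivalents of the f1/f3 and f6 patterns above;
       a SystemZ compiler may lower both to TM plus a CC branch. */
    #include <stdint.h>

    int low_bit_clear(const uint8_t *p)        /* f1/f3: (x & 1) == 0  */
    {
        return (*p & 1) == 0;
    }

    int mask_bits_all_set(const uint8_t *p)    /* f6: (x & 254) == 254 */
    {
        return (*p & 254) == 254;
    }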
diff --git a/test/CodeGen/SystemZ/int-cmp-49.ll b/test/CodeGen/SystemZ/int-cmp-49.ll
new file mode 100644
index 0000000..83f18a2
--- /dev/null
+++ b/test/CodeGen/SystemZ/int-cmp-49.ll
@@ -0,0 +1,49 @@
+; Test that we don't try to use z196 instructions on z10 for TMHH and TMHL.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -O0 | FileCheck %s
+
+@g = global i32 0
+
+; Check the lowest useful TMHL value.
+define void @f1(i64 %a) {
+; CHECK-LABEL: f1:
+; CHECK-NOT: risblg
+; CHECK-NOT: risbhg
+; CHECK: tmhl {{%r[0-5]}}, 1
+; CHECK-NOT: risblg
+; CHECK-NOT: risbhg
+; CHECK: br %r14
+entry:
+ %and = and i64 %a, 4294967296
+ %cmp = icmp eq i64 %and, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check the lowest useful TMHH value.
+define void @f2(i64 %a) {
+; CHECK-LABEL: f2:
+; CHECK-NOT: risblg
+; CHECK-NOT: risbhg
+; CHECK: tmhh {{%r[0-5]}}, 1
+; CHECK-NOT: risblg
+; CHECK-NOT: risbhg
+; CHECK: br %r14
+entry:
+ %and = and i64 %a, 281474976710656
+ %cmp = icmp ne i64 %and, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 1, i32 *@g
+ br label %exit
+
+exit:
+ ret void
+}
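
Both masks are single bits in the upper 32 bits of the i64 operand: 4294967296 is 1 << 32, the lowest bit position TMHL can test, and 281474976710656 is 1 << 48, the lowest bit position TMHH can test, which is why each check expects an immediate of 1. A trivial self-checking C note on those constants (purely illustrative):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        /* The masks used by f1 and f2 above, written as shifts. */
        assert(4294967296ULL == (uint64_t)1 << 32);        /* tmhl ..., 1 */
        assert(281474976710656ULL == (uint64_t)1 << 48);   /* tmhh ..., 1 */
        return 0;
    }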
diff --git a/test/CodeGen/SystemZ/int-const-03.ll b/test/CodeGen/SystemZ/int-const-03.ll
index 78db963..af1cef2 100644
--- a/test/CodeGen/SystemZ/int-const-03.ll
+++ b/test/CodeGen/SystemZ/int-const-03.ll
@@ -139,11 +139,11 @@ define void @f14(i8 *%src) {
ret void
}
-; Check that MVI does not allow an index
+; Check that MVI does not allow an index. We prefer STC in that case.
define void @f15(i64 %src, i64 %index) {
; CHECK-LABEL: f15:
-; CHECK: agr %r2, %r3
-; CHECK: mvi 4095(%r2), 42
+; CHECK: lhi [[TMP:%r[0-5]]], 42
+; CHECK: stc [[TMP]], 4095({{%r2,%r3|%r3,%r2}})
; CHECK: br %r14
%add1 = add i64 %src, %index
%add2 = add i64 %add1, 4095
@@ -152,11 +152,11 @@ define void @f15(i64 %src, i64 %index) {
ret void
}
-; Check that MVIY does not allow an index
+; Check that MVIY does not allow an index. We prefer STCY in that case.
define void @f16(i64 %src, i64 %index) {
; CHECK-LABEL: f16:
-; CHECK: agr %r2, %r3
-; CHECK: mviy 4096(%r2), 42
+; CHECK: lhi [[TMP:%r[0-5]]], 42
+; CHECK: stcy [[TMP]], 4096({{%r2,%r3|%r3,%r2}})
; CHECK: br %r14
%add1 = add i64 %src, %index
%add2 = add i64 %add1, 4096
diff --git a/test/CodeGen/SystemZ/int-const-04.ll b/test/CodeGen/SystemZ/int-const-04.ll
index c109faa..aced50b 100644
--- a/test/CodeGen/SystemZ/int-const-04.ll
+++ b/test/CodeGen/SystemZ/int-const-04.ll
@@ -75,34 +75,34 @@ define void @f8(i16 *%a) {
ret void
}
-; Check the next halfword up, which needs separate address logic.
-; Other sequences besides this one would be OK.
+; Check the next halfword up, which is out of range. We prefer STHY
+; in that case.
define void @f9(i16 *%a) {
; CHECK-LABEL: f9:
-; CHECK: aghi %r2, 4096
-; CHECK: mvhhi 0(%r2), 42
+; CHECK: lhi [[TMP:%r[0-5]]], 42
+; CHECK: sthy [[TMP]], 4096(%r2)
; CHECK: br %r14
%ptr = getelementptr i16 *%a, i64 2048
store i16 42, i16 *%ptr
ret void
}
-; Check negative displacements, which also need separate address logic.
+; Check negative displacements, for which we again prefer STHY.
define void @f10(i16 *%a) {
; CHECK-LABEL: f10:
-; CHECK: aghi %r2, -2
-; CHECK: mvhhi 0(%r2), 42
+; CHECK: lhi [[TMP:%r[0-5]]], 42
+; CHECK: sthy [[TMP]], -2(%r2)
; CHECK: br %r14
%ptr = getelementptr i16 *%a, i64 -1
store i16 42, i16 *%ptr
ret void
}
-; Check that MVHHI does not allow an index
+; Check that MVHHI does not allow an index.
define void @f11(i64 %src, i64 %index) {
; CHECK-LABEL: f11:
-; CHECK: agr %r2, %r3
-; CHECK: mvhhi 0(%r2), 42
+; CHECK: lhi [[TMP:%r[0-5]]], 42
+; CHECK: sth [[TMP]], 0({{%r2,%r3|%r3,%r2}})
; CHECK: br %r14
%add = add i64 %src, %index
%ptr = inttoptr i64 %add to i16 *
diff --git a/test/CodeGen/SystemZ/int-const-05.ll b/test/CodeGen/SystemZ/int-const-05.ll
index d0c8569..98d6851 100644
--- a/test/CodeGen/SystemZ/int-const-05.ll
+++ b/test/CodeGen/SystemZ/int-const-05.ll
@@ -66,34 +66,33 @@ define void @f7(i32 *%a) {
ret void
}
-; Check the next word up, which needs separate address logic.
-; Other sequences besides this one would be OK.
+; Check the next word up, which is out of range. We prefer STY in that case.
define void @f8(i32 *%a) {
; CHECK-LABEL: f8:
-; CHECK: aghi %r2, 4096
-; CHECK: mvhi 0(%r2), 42
+; CHECK: lhi [[TMP:%r[0-5]]], 42
+; CHECK: sty [[TMP]], 4096(%r2)
; CHECK: br %r14
%ptr = getelementptr i32 *%a, i64 1024
store i32 42, i32 *%ptr
ret void
}
-; Check negative displacements, which also need separate address logic.
+; Check negative displacements, for which we again prefer STY.
define void @f9(i32 *%a) {
; CHECK-LABEL: f9:
-; CHECK: aghi %r2, -4
-; CHECK: mvhi 0(%r2), 42
+; CHECK: lhi [[TMP:%r[0-5]]], 42
+; CHECK: sty [[TMP]], -4(%r2)
; CHECK: br %r14
%ptr = getelementptr i32 *%a, i64 -1
store i32 42, i32 *%ptr
ret void
}
-; Check that MVHI does not allow an index
+; Check that MVHI does not allow an index.
define void @f10(i64 %src, i64 %index) {
; CHECK-LABEL: f10:
-; CHECK: agr %r2, %r3
-; CHECK: mvhi 0(%r2), 42
+; CHECK: lhi [[TMP:%r[0-5]]], 42
+; CHECK: st [[TMP]], 0({{%r2,%r3|%r3,%r2}})
; CHECK: br %r14
%add = add i64 %src, %index
%ptr = inttoptr i64 %add to i32 *
diff --git a/test/CodeGen/SystemZ/int-const-06.ll b/test/CodeGen/SystemZ/int-const-06.ll
index 12a555c..cf07c66 100644
--- a/test/CodeGen/SystemZ/int-const-06.ll
+++ b/test/CodeGen/SystemZ/int-const-06.ll
@@ -66,34 +66,34 @@ define void @f7(i64 *%a) {
ret void
}
-; Check the next doubleword up, which needs separate address logic.
-; Other sequences besides this one would be OK.
+; Check the next doubleword up, which is out of range. We prefer STG
+; in that case.
define void @f8(i64 *%a) {
; CHECK-LABEL: f8:
-; CHECK: aghi %r2, 4096
-; CHECK: mvghi 0(%r2), 42
+; CHECK: lghi [[TMP:%r[0-5]]], 42
+; CHECK: stg [[TMP]], 4096(%r2)
; CHECK: br %r14
%ptr = getelementptr i64 *%a, i64 512
store i64 42, i64 *%ptr
ret void
}
-; Check negative displacements, which also need separate address logic.
+; Check negative displacements, for which we again prefer STG.
define void @f9(i64 *%a) {
; CHECK-LABEL: f9:
-; CHECK: aghi %r2, -8
-; CHECK: mvghi 0(%r2), 42
+; CHECK: lghi [[TMP:%r[0-5]]], 42
+; CHECK: stg [[TMP]], -8(%r2)
; CHECK: br %r14
%ptr = getelementptr i64 *%a, i64 -1
store i64 42, i64 *%ptr
ret void
}
-; Check that MVGHI does not allow an index
+; Check that MVGHI does not allow an index.
define void @f10(i64 %src, i64 %index) {
; CHECK-LABEL: f10:
-; CHECK: agr %r2, %r3
-; CHECK: mvghi 0(%r2), 42
+; CHECK: lghi [[TMP:%r[0-5]]], 42
+; CHECK: stg [[TMP]], 0({{%r2,%r3|%r3,%r2}})
; CHECK: br %r14
%add = add i64 %src, %index
%ptr = inttoptr i64 %add to i64 *
diff --git a/test/CodeGen/SystemZ/int-conv-02.ll b/test/CodeGen/SystemZ/int-conv-02.ll
index 18cfd4a..dd7760d 100644
--- a/test/CodeGen/SystemZ/int-conv-02.ll
+++ b/test/CodeGen/SystemZ/int-conv-02.ll
@@ -1,6 +1,7 @@
-; Test zero extensions from a byte to an i32.
+; Test zero extensions from a byte to an i32. The tests here
+; assume z10 register pressure, without the high words being available.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
; Test register extension, starting with an i32.
define i32 @f1(i32 %a) {
diff --git a/test/CodeGen/SystemZ/int-conv-06.ll b/test/CodeGen/SystemZ/int-conv-06.ll
index 9c95bad..33860d1 100644
--- a/test/CodeGen/SystemZ/int-conv-06.ll
+++ b/test/CodeGen/SystemZ/int-conv-06.ll
@@ -1,6 +1,7 @@
-; Test zero extensions from a halfword to an i32.
+; Test zero extensions from a halfword to an i32. The tests here
+; assume z10 register pressure, without the high words being available.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
; Test register extension, starting with an i32.
define i32 @f1(i32 %a) {
diff --git a/test/CodeGen/SystemZ/int-conv-09.ll b/test/CodeGen/SystemZ/int-conv-09.ll
index db4c333..b9c5089 100644
--- a/test/CodeGen/SystemZ/int-conv-09.ll
+++ b/test/CodeGen/SystemZ/int-conv-09.ll
@@ -102,80 +102,3 @@ define i64 @f9(i64 %src, i64 %index) {
%ext = sext i32 %word to i64
ret i64 %ext
}
-
-; Test a case where we spill the source of at least one LGFR. We want
-; to use LGF if possible.
-define void @f10(i64 *%ptr1, i32 *%ptr2) {
-; CHECK-LABEL: f10:
-; CHECK: lgf {{%r[0-9]+}}, 16{{[04]}}(%r15)
-; CHECK: br %r14
- %val0 = load volatile i32 *%ptr2
- %val1 = load volatile i32 *%ptr2
- %val2 = load volatile i32 *%ptr2
- %val3 = load volatile i32 *%ptr2
- %val4 = load volatile i32 *%ptr2
- %val5 = load volatile i32 *%ptr2
- %val6 = load volatile i32 *%ptr2
- %val7 = load volatile i32 *%ptr2
- %val8 = load volatile i32 *%ptr2
- %val9 = load volatile i32 *%ptr2
- %val10 = load volatile i32 *%ptr2
- %val11 = load volatile i32 *%ptr2
- %val12 = load volatile i32 *%ptr2
- %val13 = load volatile i32 *%ptr2
- %val14 = load volatile i32 *%ptr2
- %val15 = load volatile i32 *%ptr2
-
- %ext0 = sext i32 %val0 to i64
- %ext1 = sext i32 %val1 to i64
- %ext2 = sext i32 %val2 to i64
- %ext3 = sext i32 %val3 to i64
- %ext4 = sext i32 %val4 to i64
- %ext5 = sext i32 %val5 to i64
- %ext6 = sext i32 %val6 to i64
- %ext7 = sext i32 %val7 to i64
- %ext8 = sext i32 %val8 to i64
- %ext9 = sext i32 %val9 to i64
- %ext10 = sext i32 %val10 to i64
- %ext11 = sext i32 %val11 to i64
- %ext12 = sext i32 %val12 to i64
- %ext13 = sext i32 %val13 to i64
- %ext14 = sext i32 %val14 to i64
- %ext15 = sext i32 %val15 to i64
-
- store volatile i32 %val0, i32 *%ptr2
- store volatile i32 %val1, i32 *%ptr2
- store volatile i32 %val2, i32 *%ptr2
- store volatile i32 %val3, i32 *%ptr2
- store volatile i32 %val4, i32 *%ptr2
- store volatile i32 %val5, i32 *%ptr2
- store volatile i32 %val6, i32 *%ptr2
- store volatile i32 %val7, i32 *%ptr2
- store volatile i32 %val8, i32 *%ptr2
- store volatile i32 %val9, i32 *%ptr2
- store volatile i32 %val10, i32 *%ptr2
- store volatile i32 %val11, i32 *%ptr2
- store volatile i32 %val12, i32 *%ptr2
- store volatile i32 %val13, i32 *%ptr2
- store volatile i32 %val14, i32 *%ptr2
- store volatile i32 %val15, i32 *%ptr2
-
- store volatile i64 %ext0, i64 *%ptr1
- store volatile i64 %ext1, i64 *%ptr1
- store volatile i64 %ext2, i64 *%ptr1
- store volatile i64 %ext3, i64 *%ptr1
- store volatile i64 %ext4, i64 *%ptr1
- store volatile i64 %ext5, i64 *%ptr1
- store volatile i64 %ext6, i64 *%ptr1
- store volatile i64 %ext7, i64 *%ptr1
- store volatile i64 %ext8, i64 *%ptr1
- store volatile i64 %ext9, i64 *%ptr1
- store volatile i64 %ext10, i64 *%ptr1
- store volatile i64 %ext11, i64 *%ptr1
- store volatile i64 %ext12, i64 *%ptr1
- store volatile i64 %ext13, i64 *%ptr1
- store volatile i64 %ext14, i64 *%ptr1
- store volatile i64 %ext15, i64 *%ptr1
-
- ret void
-}
diff --git a/test/CodeGen/SystemZ/int-conv-10.ll b/test/CodeGen/SystemZ/int-conv-10.ll
index f2f71d9..781c74c 100644
--- a/test/CodeGen/SystemZ/int-conv-10.ll
+++ b/test/CodeGen/SystemZ/int-conv-10.ll
@@ -111,80 +111,3 @@ define i64 @f10(i64 %src, i64 %index) {
%ext = zext i32 %word to i64
ret i64 %ext
}
-
-; Test a case where we spill the source of at least one LLGFR. We want
-; to use LLGF if possible.
-define void @f11(i64 *%ptr1, i32 *%ptr2) {
-; CHECK-LABEL: f11:
-; CHECK: llgf {{%r[0-9]+}}, 16{{[04]}}(%r15)
-; CHECK: br %r14
- %val0 = load volatile i32 *%ptr2
- %val1 = load volatile i32 *%ptr2
- %val2 = load volatile i32 *%ptr2
- %val3 = load volatile i32 *%ptr2
- %val4 = load volatile i32 *%ptr2
- %val5 = load volatile i32 *%ptr2
- %val6 = load volatile i32 *%ptr2
- %val7 = load volatile i32 *%ptr2
- %val8 = load volatile i32 *%ptr2
- %val9 = load volatile i32 *%ptr2
- %val10 = load volatile i32 *%ptr2
- %val11 = load volatile i32 *%ptr2
- %val12 = load volatile i32 *%ptr2
- %val13 = load volatile i32 *%ptr2
- %val14 = load volatile i32 *%ptr2
- %val15 = load volatile i32 *%ptr2
-
- %ext0 = zext i32 %val0 to i64
- %ext1 = zext i32 %val1 to i64
- %ext2 = zext i32 %val2 to i64
- %ext3 = zext i32 %val3 to i64
- %ext4 = zext i32 %val4 to i64
- %ext5 = zext i32 %val5 to i64
- %ext6 = zext i32 %val6 to i64
- %ext7 = zext i32 %val7 to i64
- %ext8 = zext i32 %val8 to i64
- %ext9 = zext i32 %val9 to i64
- %ext10 = zext i32 %val10 to i64
- %ext11 = zext i32 %val11 to i64
- %ext12 = zext i32 %val12 to i64
- %ext13 = zext i32 %val13 to i64
- %ext14 = zext i32 %val14 to i64
- %ext15 = zext i32 %val15 to i64
-
- store volatile i32 %val0, i32 *%ptr2
- store volatile i32 %val1, i32 *%ptr2
- store volatile i32 %val2, i32 *%ptr2
- store volatile i32 %val3, i32 *%ptr2
- store volatile i32 %val4, i32 *%ptr2
- store volatile i32 %val5, i32 *%ptr2
- store volatile i32 %val6, i32 *%ptr2
- store volatile i32 %val7, i32 *%ptr2
- store volatile i32 %val8, i32 *%ptr2
- store volatile i32 %val9, i32 *%ptr2
- store volatile i32 %val10, i32 *%ptr2
- store volatile i32 %val11, i32 *%ptr2
- store volatile i32 %val12, i32 *%ptr2
- store volatile i32 %val13, i32 *%ptr2
- store volatile i32 %val14, i32 *%ptr2
- store volatile i32 %val15, i32 *%ptr2
-
- store volatile i64 %ext0, i64 *%ptr1
- store volatile i64 %ext1, i64 *%ptr1
- store volatile i64 %ext2, i64 *%ptr1
- store volatile i64 %ext3, i64 *%ptr1
- store volatile i64 %ext4, i64 *%ptr1
- store volatile i64 %ext5, i64 *%ptr1
- store volatile i64 %ext6, i64 *%ptr1
- store volatile i64 %ext7, i64 *%ptr1
- store volatile i64 %ext8, i64 *%ptr1
- store volatile i64 %ext9, i64 *%ptr1
- store volatile i64 %ext10, i64 *%ptr1
- store volatile i64 %ext11, i64 *%ptr1
- store volatile i64 %ext12, i64 *%ptr1
- store volatile i64 %ext13, i64 *%ptr1
- store volatile i64 %ext14, i64 *%ptr1
- store volatile i64 %ext15, i64 *%ptr1
-
- ret void
-}
diff --git a/test/CodeGen/SystemZ/int-conv-11.ll b/test/CodeGen/SystemZ/int-conv-11.ll
new file mode 100644
index 0000000..3076962
--- /dev/null
+++ b/test/CodeGen/SystemZ/int-conv-11.ll
@@ -0,0 +1,350 @@
+; Test spills of zero extensions when high GR32s are available.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | FileCheck %s
+
+; Test a case where we spill the source of at least one LLCRMux. We want
+; to use LLC(H) if possible.
+define void @f1(i32 *%ptr) {
+; CHECK-LABEL: f1:
+; CHECK: llc{{h?}} {{%r[0-9]+}}, 16{{[37]}}(%r15)
+; CHECK: br %r14
+ %val0 = load volatile i32 *%ptr
+ %val1 = load volatile i32 *%ptr
+ %val2 = load volatile i32 *%ptr
+ %val3 = load volatile i32 *%ptr
+ %val4 = load volatile i32 *%ptr
+ %val5 = load volatile i32 *%ptr
+ %val6 = load volatile i32 *%ptr
+ %val7 = load volatile i32 *%ptr
+ %val8 = load volatile i32 *%ptr
+ %val9 = load volatile i32 *%ptr
+ %val10 = load volatile i32 *%ptr
+ %val11 = load volatile i32 *%ptr
+ %val12 = load volatile i32 *%ptr
+ %val13 = load volatile i32 *%ptr
+ %val14 = load volatile i32 *%ptr
+ %val15 = load volatile i32 *%ptr
+ %val16 = load volatile i32 *%ptr
+ %val17 = load volatile i32 *%ptr
+ %val18 = load volatile i32 *%ptr
+ %val19 = load volatile i32 *%ptr
+ %val20 = load volatile i32 *%ptr
+ %val21 = load volatile i32 *%ptr
+ %val22 = load volatile i32 *%ptr
+ %val23 = load volatile i32 *%ptr
+ %val24 = load volatile i32 *%ptr
+ %val25 = load volatile i32 *%ptr
+ %val26 = load volatile i32 *%ptr
+ %val27 = load volatile i32 *%ptr
+ %val28 = load volatile i32 *%ptr
+ %val29 = load volatile i32 *%ptr
+ %val30 = load volatile i32 *%ptr
+ %val31 = load volatile i32 *%ptr
+
+ %trunc0 = trunc i32 %val0 to i8
+ %trunc1 = trunc i32 %val1 to i8
+ %trunc2 = trunc i32 %val2 to i8
+ %trunc3 = trunc i32 %val3 to i8
+ %trunc4 = trunc i32 %val4 to i8
+ %trunc5 = trunc i32 %val5 to i8
+ %trunc6 = trunc i32 %val6 to i8
+ %trunc7 = trunc i32 %val7 to i8
+ %trunc8 = trunc i32 %val8 to i8
+ %trunc9 = trunc i32 %val9 to i8
+ %trunc10 = trunc i32 %val10 to i8
+ %trunc11 = trunc i32 %val11 to i8
+ %trunc12 = trunc i32 %val12 to i8
+ %trunc13 = trunc i32 %val13 to i8
+ %trunc14 = trunc i32 %val14 to i8
+ %trunc15 = trunc i32 %val15 to i8
+ %trunc16 = trunc i32 %val16 to i8
+ %trunc17 = trunc i32 %val17 to i8
+ %trunc18 = trunc i32 %val18 to i8
+ %trunc19 = trunc i32 %val19 to i8
+ %trunc20 = trunc i32 %val20 to i8
+ %trunc21 = trunc i32 %val21 to i8
+ %trunc22 = trunc i32 %val22 to i8
+ %trunc23 = trunc i32 %val23 to i8
+ %trunc24 = trunc i32 %val24 to i8
+ %trunc25 = trunc i32 %val25 to i8
+ %trunc26 = trunc i32 %val26 to i8
+ %trunc27 = trunc i32 %val27 to i8
+ %trunc28 = trunc i32 %val28 to i8
+ %trunc29 = trunc i32 %val29 to i8
+ %trunc30 = trunc i32 %val30 to i8
+ %trunc31 = trunc i32 %val31 to i8
+
+ %ext0 = zext i8 %trunc0 to i32
+ %ext1 = zext i8 %trunc1 to i32
+ %ext2 = zext i8 %trunc2 to i32
+ %ext3 = zext i8 %trunc3 to i32
+ %ext4 = zext i8 %trunc4 to i32
+ %ext5 = zext i8 %trunc5 to i32
+ %ext6 = zext i8 %trunc6 to i32
+ %ext7 = zext i8 %trunc7 to i32
+ %ext8 = zext i8 %trunc8 to i32
+ %ext9 = zext i8 %trunc9 to i32
+ %ext10 = zext i8 %trunc10 to i32
+ %ext11 = zext i8 %trunc11 to i32
+ %ext12 = zext i8 %trunc12 to i32
+ %ext13 = zext i8 %trunc13 to i32
+ %ext14 = zext i8 %trunc14 to i32
+ %ext15 = zext i8 %trunc15 to i32
+ %ext16 = zext i8 %trunc16 to i32
+ %ext17 = zext i8 %trunc17 to i32
+ %ext18 = zext i8 %trunc18 to i32
+ %ext19 = zext i8 %trunc19 to i32
+ %ext20 = zext i8 %trunc20 to i32
+ %ext21 = zext i8 %trunc21 to i32
+ %ext22 = zext i8 %trunc22 to i32
+ %ext23 = zext i8 %trunc23 to i32
+ %ext24 = zext i8 %trunc24 to i32
+ %ext25 = zext i8 %trunc25 to i32
+ %ext26 = zext i8 %trunc26 to i32
+ %ext27 = zext i8 %trunc27 to i32
+ %ext28 = zext i8 %trunc28 to i32
+ %ext29 = zext i8 %trunc29 to i32
+ %ext30 = zext i8 %trunc30 to i32
+ %ext31 = zext i8 %trunc31 to i32
+
+ store volatile i32 %val0, i32 *%ptr
+ store volatile i32 %val1, i32 *%ptr
+ store volatile i32 %val2, i32 *%ptr
+ store volatile i32 %val3, i32 *%ptr
+ store volatile i32 %val4, i32 *%ptr
+ store volatile i32 %val5, i32 *%ptr
+ store volatile i32 %val6, i32 *%ptr
+ store volatile i32 %val7, i32 *%ptr
+ store volatile i32 %val8, i32 *%ptr
+ store volatile i32 %val9, i32 *%ptr
+ store volatile i32 %val10, i32 *%ptr
+ store volatile i32 %val11, i32 *%ptr
+ store volatile i32 %val12, i32 *%ptr
+ store volatile i32 %val13, i32 *%ptr
+ store volatile i32 %val14, i32 *%ptr
+ store volatile i32 %val15, i32 *%ptr
+ store volatile i32 %val16, i32 *%ptr
+ store volatile i32 %val17, i32 *%ptr
+ store volatile i32 %val18, i32 *%ptr
+ store volatile i32 %val19, i32 *%ptr
+ store volatile i32 %val20, i32 *%ptr
+ store volatile i32 %val21, i32 *%ptr
+ store volatile i32 %val22, i32 *%ptr
+ store volatile i32 %val23, i32 *%ptr
+ store volatile i32 %val24, i32 *%ptr
+ store volatile i32 %val25, i32 *%ptr
+ store volatile i32 %val26, i32 *%ptr
+ store volatile i32 %val27, i32 *%ptr
+ store volatile i32 %val28, i32 *%ptr
+ store volatile i32 %val29, i32 *%ptr
+ store volatile i32 %val30, i32 *%ptr
+ store volatile i32 %val31, i32 *%ptr
+
+ store volatile i32 %ext0, i32 *%ptr
+ store volatile i32 %ext1, i32 *%ptr
+ store volatile i32 %ext2, i32 *%ptr
+ store volatile i32 %ext3, i32 *%ptr
+ store volatile i32 %ext4, i32 *%ptr
+ store volatile i32 %ext5, i32 *%ptr
+ store volatile i32 %ext6, i32 *%ptr
+ store volatile i32 %ext7, i32 *%ptr
+ store volatile i32 %ext8, i32 *%ptr
+ store volatile i32 %ext9, i32 *%ptr
+ store volatile i32 %ext10, i32 *%ptr
+ store volatile i32 %ext11, i32 *%ptr
+ store volatile i32 %ext12, i32 *%ptr
+ store volatile i32 %ext13, i32 *%ptr
+ store volatile i32 %ext14, i32 *%ptr
+ store volatile i32 %ext15, i32 *%ptr
+ store volatile i32 %ext16, i32 *%ptr
+ store volatile i32 %ext17, i32 *%ptr
+ store volatile i32 %ext18, i32 *%ptr
+ store volatile i32 %ext19, i32 *%ptr
+ store volatile i32 %ext20, i32 *%ptr
+ store volatile i32 %ext21, i32 *%ptr
+ store volatile i32 %ext22, i32 *%ptr
+ store volatile i32 %ext23, i32 *%ptr
+ store volatile i32 %ext24, i32 *%ptr
+ store volatile i32 %ext25, i32 *%ptr
+ store volatile i32 %ext26, i32 *%ptr
+ store volatile i32 %ext27, i32 *%ptr
+ store volatile i32 %ext28, i32 *%ptr
+ store volatile i32 %ext29, i32 *%ptr
+ store volatile i32 %ext30, i32 *%ptr
+ store volatile i32 %ext31, i32 *%ptr
+
+ ret void
+}
+
+; Same again with i16, which should use LLH(H).
+define void @f2(i32 *%ptr) {
+; CHECK-LABEL: f2:
+; CHECK: llh{{h?}} {{%r[0-9]+}}, 16{{[26]}}(%r15)
+; CHECK: br %r14
+ %val0 = load volatile i32 *%ptr
+ %val1 = load volatile i32 *%ptr
+ %val2 = load volatile i32 *%ptr
+ %val3 = load volatile i32 *%ptr
+ %val4 = load volatile i32 *%ptr
+ %val5 = load volatile i32 *%ptr
+ %val6 = load volatile i32 *%ptr
+ %val7 = load volatile i32 *%ptr
+ %val8 = load volatile i32 *%ptr
+ %val9 = load volatile i32 *%ptr
+ %val10 = load volatile i32 *%ptr
+ %val11 = load volatile i32 *%ptr
+ %val12 = load volatile i32 *%ptr
+ %val13 = load volatile i32 *%ptr
+ %val14 = load volatile i32 *%ptr
+ %val15 = load volatile i32 *%ptr
+ %val16 = load volatile i32 *%ptr
+ %val17 = load volatile i32 *%ptr
+ %val18 = load volatile i32 *%ptr
+ %val19 = load volatile i32 *%ptr
+ %val20 = load volatile i32 *%ptr
+ %val21 = load volatile i32 *%ptr
+ %val22 = load volatile i32 *%ptr
+ %val23 = load volatile i32 *%ptr
+ %val24 = load volatile i32 *%ptr
+ %val25 = load volatile i32 *%ptr
+ %val26 = load volatile i32 *%ptr
+ %val27 = load volatile i32 *%ptr
+ %val28 = load volatile i32 *%ptr
+ %val29 = load volatile i32 *%ptr
+ %val30 = load volatile i32 *%ptr
+ %val31 = load volatile i32 *%ptr
+
+ %trunc0 = trunc i32 %val0 to i16
+ %trunc1 = trunc i32 %val1 to i16
+ %trunc2 = trunc i32 %val2 to i16
+ %trunc3 = trunc i32 %val3 to i16
+ %trunc4 = trunc i32 %val4 to i16
+ %trunc5 = trunc i32 %val5 to i16
+ %trunc6 = trunc i32 %val6 to i16
+ %trunc7 = trunc i32 %val7 to i16
+ %trunc8 = trunc i32 %val8 to i16
+ %trunc9 = trunc i32 %val9 to i16
+ %trunc10 = trunc i32 %val10 to i16
+ %trunc11 = trunc i32 %val11 to i16
+ %trunc12 = trunc i32 %val12 to i16
+ %trunc13 = trunc i32 %val13 to i16
+ %trunc14 = trunc i32 %val14 to i16
+ %trunc15 = trunc i32 %val15 to i16
+ %trunc16 = trunc i32 %val16 to i16
+ %trunc17 = trunc i32 %val17 to i16
+ %trunc18 = trunc i32 %val18 to i16
+ %trunc19 = trunc i32 %val19 to i16
+ %trunc20 = trunc i32 %val20 to i16
+ %trunc21 = trunc i32 %val21 to i16
+ %trunc22 = trunc i32 %val22 to i16
+ %trunc23 = trunc i32 %val23 to i16
+ %trunc24 = trunc i32 %val24 to i16
+ %trunc25 = trunc i32 %val25 to i16
+ %trunc26 = trunc i32 %val26 to i16
+ %trunc27 = trunc i32 %val27 to i16
+ %trunc28 = trunc i32 %val28 to i16
+ %trunc29 = trunc i32 %val29 to i16
+ %trunc30 = trunc i32 %val30 to i16
+ %trunc31 = trunc i32 %val31 to i16
+
+ %ext0 = zext i16 %trunc0 to i32
+ %ext1 = zext i16 %trunc1 to i32
+ %ext2 = zext i16 %trunc2 to i32
+ %ext3 = zext i16 %trunc3 to i32
+ %ext4 = zext i16 %trunc4 to i32
+ %ext5 = zext i16 %trunc5 to i32
+ %ext6 = zext i16 %trunc6 to i32
+ %ext7 = zext i16 %trunc7 to i32
+ %ext8 = zext i16 %trunc8 to i32
+ %ext9 = zext i16 %trunc9 to i32
+ %ext10 = zext i16 %trunc10 to i32
+ %ext11 = zext i16 %trunc11 to i32
+ %ext12 = zext i16 %trunc12 to i32
+ %ext13 = zext i16 %trunc13 to i32
+ %ext14 = zext i16 %trunc14 to i32
+ %ext15 = zext i16 %trunc15 to i32
+ %ext16 = zext i16 %trunc16 to i32
+ %ext17 = zext i16 %trunc17 to i32
+ %ext18 = zext i16 %trunc18 to i32
+ %ext19 = zext i16 %trunc19 to i32
+ %ext20 = zext i16 %trunc20 to i32
+ %ext21 = zext i16 %trunc21 to i32
+ %ext22 = zext i16 %trunc22 to i32
+ %ext23 = zext i16 %trunc23 to i32
+ %ext24 = zext i16 %trunc24 to i32
+ %ext25 = zext i16 %trunc25 to i32
+ %ext26 = zext i16 %trunc26 to i32
+ %ext27 = zext i16 %trunc27 to i32
+ %ext28 = zext i16 %trunc28 to i32
+ %ext29 = zext i16 %trunc29 to i32
+ %ext30 = zext i16 %trunc30 to i32
+ %ext31 = zext i16 %trunc31 to i32
+
+ store volatile i32 %val0, i32 *%ptr
+ store volatile i32 %val1, i32 *%ptr
+ store volatile i32 %val2, i32 *%ptr
+ store volatile i32 %val3, i32 *%ptr
+ store volatile i32 %val4, i32 *%ptr
+ store volatile i32 %val5, i32 *%ptr
+ store volatile i32 %val6, i32 *%ptr
+ store volatile i32 %val7, i32 *%ptr
+ store volatile i32 %val8, i32 *%ptr
+ store volatile i32 %val9, i32 *%ptr
+ store volatile i32 %val10, i32 *%ptr
+ store volatile i32 %val11, i32 *%ptr
+ store volatile i32 %val12, i32 *%ptr
+ store volatile i32 %val13, i32 *%ptr
+ store volatile i32 %val14, i32 *%ptr
+ store volatile i32 %val15, i32 *%ptr
+ store volatile i32 %val16, i32 *%ptr
+ store volatile i32 %val17, i32 *%ptr
+ store volatile i32 %val18, i32 *%ptr
+ store volatile i32 %val19, i32 *%ptr
+ store volatile i32 %val20, i32 *%ptr
+ store volatile i32 %val21, i32 *%ptr
+ store volatile i32 %val22, i32 *%ptr
+ store volatile i32 %val23, i32 *%ptr
+ store volatile i32 %val24, i32 *%ptr
+ store volatile i32 %val25, i32 *%ptr
+ store volatile i32 %val26, i32 *%ptr
+ store volatile i32 %val27, i32 *%ptr
+ store volatile i32 %val28, i32 *%ptr
+ store volatile i32 %val29, i32 *%ptr
+ store volatile i32 %val30, i32 *%ptr
+ store volatile i32 %val31, i32 *%ptr
+
+ store volatile i32 %ext0, i32 *%ptr
+ store volatile i32 %ext1, i32 *%ptr
+ store volatile i32 %ext2, i32 *%ptr
+ store volatile i32 %ext3, i32 *%ptr
+ store volatile i32 %ext4, i32 *%ptr
+ store volatile i32 %ext5, i32 *%ptr
+ store volatile i32 %ext6, i32 *%ptr
+ store volatile i32 %ext7, i32 *%ptr
+ store volatile i32 %ext8, i32 *%ptr
+ store volatile i32 %ext9, i32 *%ptr
+ store volatile i32 %ext10, i32 *%ptr
+ store volatile i32 %ext11, i32 *%ptr
+ store volatile i32 %ext12, i32 *%ptr
+ store volatile i32 %ext13, i32 *%ptr
+ store volatile i32 %ext14, i32 *%ptr
+ store volatile i32 %ext15, i32 *%ptr
+ store volatile i32 %ext16, i32 *%ptr
+ store volatile i32 %ext17, i32 *%ptr
+ store volatile i32 %ext18, i32 *%ptr
+ store volatile i32 %ext19, i32 *%ptr
+ store volatile i32 %ext20, i32 *%ptr
+ store volatile i32 %ext21, i32 *%ptr
+ store volatile i32 %ext22, i32 *%ptr
+ store volatile i32 %ext23, i32 *%ptr
+ store volatile i32 %ext24, i32 *%ptr
+ store volatile i32 %ext25, i32 *%ptr
+ store volatile i32 %ext26, i32 *%ptr
+ store volatile i32 %ext27, i32 *%ptr
+ store volatile i32 %ext28, i32 *%ptr
+ store volatile i32 %ext29, i32 *%ptr
+ store volatile i32 %ext30, i32 *%ptr
+ store volatile i32 %ext31, i32 *%ptr
+
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/int-div-06.ll b/test/CodeGen/SystemZ/int-div-06.ll
new file mode 100644
index 0000000..8576b1b
--- /dev/null
+++ b/test/CodeGen/SystemZ/int-div-06.ll
@@ -0,0 +1,56 @@
+; Test that divisions by constants are implemented as multiplications.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Check signed 32-bit division.
+define i32 @f1(i32 %a) {
+; CHECK-LABEL: f1:
+; CHECK: lgfr [[REG:%r[0-5]]], %r2
+; CHECK: msgfi [[REG]], 502748801
+; CHECK-DAG: srlg [[RES1:%r[0-5]]], [[REG]], 63
+; CHECK-DAG: srag %r2, [[REG]], 46
+; CHECK: ar %r2, [[RES1]]
+; CHECK: br %r14
+ %b = sdiv i32 %a, 139968
+ ret i32 %b
+}
+
+; Check unsigned 32-bit division.
+define i32 @f2(i32 %a) {
+; CHECK-LABEL: f2:
+; CHECK: llgfr [[REG:%r[0-5]]], %r2
+; CHECK: msgfi [[REG]], 502748801
+; CHECK: srlg %r2, [[REG]], 46
+; CHECK: br %r14
+ %b = udiv i32 %a, 139968
+ ret i32 %b
+}
+
+; Check signed 64-bit division.
+define i64 @f3(i64 %dummy, i64 %a) {
+; CHECK-LABEL: f3:
+; CHECK-DAG: llihf [[CONST:%r[0-5]]], 1005497601
+; CHECK-DAG: oilf [[CONST]], 4251762321
+; CHECK-DAG: srag [[REG:%r[0-5]]], %r3, 63
+; CHECK-DAG: ngr [[REG]], [[CONST]]
+; CHECK-DAG: mlgr %r2, [[CONST]]
+; CHECK: sgr %r2, [[REG]]
+; CHECK: srlg [[RES1:%r[0-5]]], %r2, 63
+; CHECK: srag %r2, %r2, 15
+; CHECK: agr %r2, [[RES1]]
+; CHECK: br %r14
+ %b = sdiv i64 %a, 139968
+ ret i64 %b
+}
+
+; Check unsigned 64-bit division.
+define i64 @f4(i64 %dummy, i64 %a) {
+; CHECK-LABEL: f4:
+; CHECK: llihf [[CONST:%r[0-5]]], 1005497601
+; CHECK: oilf [[CONST]], 4251762321
+; CHECK: mlgr %r2, [[CONST]]
+; CHECK: srlg %r2, %r2, 15
+; CHECK: br %r14
+ %b = udiv i64 %a, 139968
+ ret i64 %b
+}
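
The magic constant follows the usual rounded-up reciprocal construction: 502748801 = ceil(2^46 / 139968), and 502748801 * 139968 overshoots 2^46 by only 704, which is small enough that a multiply followed by a 46-bit right shift yields the exact quotient for every 32-bit input. A short self-checking C sketch of the unsigned case f2 tests (constants taken from the checks above, function name invented):

    #include <assert.h>
    #include <stdint.h>

    /* Unsigned division by 139968 as multiply-and-shift, mirroring the
       msgfi/srlg pair in f2.  502748801 == ceil(2^46 / 139968). */
    static uint32_t div_by_139968(uint32_t x)
    {
        return (uint32_t)(((uint64_t)x * 502748801u) >> 46);
    }

    int main(void)
    {
        assert(div_by_139968(0) == 0);
        assert(div_by_139968(139967) == 0);
        assert(div_by_139968(139968) == 1);
        assert(div_by_139968(4294967295u) == 4294967295u / 139968);
        return 0;
    }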
diff --git a/test/CodeGen/SystemZ/int-move-08.ll b/test/CodeGen/SystemZ/int-move-08.ll
index f16dd8e..56fcbc6 100644
--- a/test/CodeGen/SystemZ/int-move-08.ll
+++ b/test/CodeGen/SystemZ/int-move-08.ll
@@ -10,6 +10,8 @@
@gsrc32u = global i32 1, align 2, section "foo"
@gdst16u = global i16 2, align 1, section "foo"
@gdst32u = global i32 2, align 2, section "foo"
+@garray8 = global [2 x i8] [i8 100, i8 101]
+@garray16 = global [2 x i16] [i16 102, i16 103]
; Check sign-extending loads from i16.
define i32 @f1() {
@@ -97,3 +99,36 @@ define void @f8() {
store i32 %val, i32 *@gdst32u, align 2
ret void
}
+
+; Test a case where we want to use one LARL for accesses to two different
+; parts of a variable.
+define void @f9() {
+; CHECK-LABEL: f9:
+; CHECK: larl [[REG:%r[0-5]]], garray8
+; CHECK: llc [[VAL:%r[0-5]]], 0([[REG]])
+; CHECK: srl [[VAL]], 1
+; CHECK: stc [[VAL]], 1([[REG]])
+; CHECK: br %r14
+ %ptr1 = getelementptr [2 x i8] *@garray8, i64 0, i64 0
+ %ptr2 = getelementptr [2 x i8] *@garray8, i64 0, i64 1
+ %val = load i8 *%ptr1
+ %shr = lshr i8 %val, 1
+ store i8 %shr, i8 *%ptr2
+ ret void
+}
+
+; Test a case where we want to use separate relative-long addresses for
+; two different parts of a variable.
+define void @f10() {
+; CHECK-LABEL: f10:
+; CHECK: llhrl [[VAL:%r[0-5]]], garray16
+; CHECK: srl [[VAL]], 1
+; CHECK: sthrl [[VAL]], garray16+2
+; CHECK: br %r14
+ %ptr1 = getelementptr [2 x i16] *@garray16, i64 0, i64 0
+ %ptr2 = getelementptr [2 x i16] *@garray16, i64 0, i64 1
+ %val = load i16 *%ptr1
+ %shr = lshr i16 %val, 1
+ store i16 %shr, i16 *%ptr2
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/int-mul-08.ll b/test/CodeGen/SystemZ/int-mul-08.ll
index a245760..90b26a4 100644
--- a/test/CodeGen/SystemZ/int-mul-08.ll
+++ b/test/CodeGen/SystemZ/int-mul-08.ll
@@ -22,9 +22,13 @@ define i64 @f1(i64 %dummy, i64 %a, i64 %b) {
; This needs a rather convoluted sequence.
define i64 @f2(i64 %dummy, i64 %a, i64 %b) {
; CHECK-LABEL: f2:
-; CHECK: mlgr
-; CHECK: agr
-; CHECK: agr
+; CHECK-DAG: srag [[RES1:%r[0-5]]], %r3, 63
+; CHECK-DAG: srag [[RES2:%r[0-5]]], %r4, 63
+; CHECK-DAG: ngr [[RES1]], %r4
+; CHECK-DAG: ngr [[RES2]], %r3
+; CHECK-DAG: agr [[RES2]], [[RES1]]
+; CHECK-DAG: mlgr %r2, %r4
+; CHECK: sgr %r2, [[RES2]]
; CHECK: br %r14
%ax = sext i64 %a to i128
%bx = sext i64 %b to i128
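
The new checks for f2 describe the standard way of deriving the signed high half of a 64x64 multiplication from the unsigned MLGR result: subtract b when a is negative and a when b is negative, which is exactly what the srag/ngr pairs feed into the final SGR. A small self-checking C sketch of that identity, using the compiler's __int128 extension only to obtain the reference value:

    #include <assert.h>
    #include <stdint.h>

    /* Signed high 64 bits of a*b from an unsigned high multiply, the
       identity behind f2's expected sequence (a sketch, not backend code). */
    static int64_t mulh_s64(int64_t a, int64_t b)
    {
        uint64_t hi = (uint64_t)(((unsigned __int128)(uint64_t)a *
                                  (uint64_t)b) >> 64);
        hi -= (a < 0) ? (uint64_t)b : 0;   /* srag/ngr on one operand */
        hi -= (b < 0) ? (uint64_t)a : 0;   /* ...and on the other     */
        return (int64_t)hi;
    }

    int main(void)
    {
        assert(mulh_s64(-1, 1) == -1);
        assert(mulh_s64(INT64_MIN, 2) == -1);
        assert(mulh_s64(3, 5) == 0);
        return 0;
    }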
diff --git a/test/CodeGen/SystemZ/int-neg-02.ll b/test/CodeGen/SystemZ/int-neg-02.ll
new file mode 100644
index 0000000..e26194c
--- /dev/null
+++ b/test/CodeGen/SystemZ/int-neg-02.ll
@@ -0,0 +1,91 @@
+; Test negative integer absolute.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Test i32->i32 negative absolute using slt.
+define i32 @f1(i32 %val) {
+; CHECK-LABEL: f1:
+; CHECK: lnr %r2, %r2
+; CHECK: br %r14
+ %cmp = icmp slt i32 %val, 0
+ %neg = sub i32 0, %val
+ %abs = select i1 %cmp, i32 %neg, i32 %val
+ %res = sub i32 0, %abs
+ ret i32 %res
+}
+
+; Test i32->i32 negative absolute using sle.
+define i32 @f2(i32 %val) {
+; CHECK-LABEL: f2:
+; CHECK: lnr %r2, %r2
+; CHECK: br %r14
+ %cmp = icmp sle i32 %val, 0
+ %neg = sub i32 0, %val
+ %abs = select i1 %cmp, i32 %neg, i32 %val
+ %res = sub i32 0, %abs
+ ret i32 %res
+}
+
+; Test i32->i32 negative absolute using sgt.
+define i32 @f3(i32 %val) {
+; CHECK-LABEL: f3:
+; CHECK: lnr %r2, %r2
+; CHECK: br %r14
+ %cmp = icmp sgt i32 %val, 0
+ %neg = sub i32 0, %val
+ %abs = select i1 %cmp, i32 %val, i32 %neg
+ %res = sub i32 0, %abs
+ ret i32 %res
+}
+
+; Test i32->i32 negative absolute using sge.
+define i32 @f4(i32 %val) {
+; CHECK-LABEL: f4:
+; CHECK: lnr %r2, %r2
+; CHECK: br %r14
+ %cmp = icmp sge i32 %val, 0
+ %neg = sub i32 0, %val
+ %abs = select i1 %cmp, i32 %val, i32 %neg
+ %res = sub i32 0, %abs
+ ret i32 %res
+}
+
+; Test i32->i64 negative absolute.
+define i64 @f5(i32 %val) {
+; CHECK-LABEL: f5:
+; CHECK: lngfr %r2, %r2
+; CHECK: br %r14
+ %ext = sext i32 %val to i64
+ %cmp = icmp slt i64 %ext, 0
+ %neg = sub i64 0, %ext
+ %abs = select i1 %cmp, i64 %neg, i64 %ext
+ %res = sub i64 0, %abs
+ ret i64 %res
+}
+
+; Test i32->i64 negative absolute that uses an "in-register" form of
+; sign extension.
+define i64 @f6(i64 %val) {
+; CHECK-LABEL: f6:
+; CHECK: lngfr %r2, %r2
+; CHECK: br %r14
+ %trunc = trunc i64 %val to i32
+ %ext = sext i32 %trunc to i64
+ %cmp = icmp slt i64 %ext, 0
+ %neg = sub i64 0, %ext
+ %abs = select i1 %cmp, i64 %neg, i64 %ext
+ %res = sub i64 0, %abs
+ ret i64 %res
+}
+
+; Test i64 negative absolute.
+define i64 @f7(i64 %val) {
+; CHECK-LABEL: f7:
+; CHECK: lngr %r2, %r2
+; CHECK: br %r14
+ %cmp = icmp slt i64 %val, 0
+ %neg = sub i64 0, %val
+ %abs = select i1 %cmp, i64 %neg, i64 %val
+ %res = sub i64 0, %abs
+ ret i64 %res
+}
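
Every function here computes the negated absolute value, which the checks expect to collapse to a single load-negative instruction (lnr, lngfr or lngr). A minimal C sketch of the i32 pattern; unsigned arithmetic is used so that INT32_MIN wraps the same way the IR's wrapping "sub i32 0, %val" does:

    #include <stdint.h>

    /* Negated absolute value, the f1-f4 pattern above (lnr). */
    static int32_t neg_abs(int32_t x)
    {
        uint32_t ux = (uint32_t)x;
        uint32_t abs = (x < 0) ? (uint32_t)0 - ux : ux;
        return (int32_t)((uint32_t)0 - abs);   /* INT32_MIN maps to itself */
    }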
diff --git a/test/CodeGen/SystemZ/lit.local.cfg b/test/CodeGen/SystemZ/lit.local.cfg
index 79528d1..b12af09 100644
--- a/test/CodeGen/SystemZ/lit.local.cfg
+++ b/test/CodeGen/SystemZ/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
targets = set(config.root.targets_to_build.split())
if not 'SystemZ' in targets:
config.unsupported = True
diff --git a/test/CodeGen/SystemZ/memchr-01.ll b/test/CodeGen/SystemZ/memchr-01.ll
new file mode 100644
index 0000000..c51690b
--- /dev/null
+++ b/test/CodeGen/SystemZ/memchr-01.ll
@@ -0,0 +1,21 @@
+; Test memchr using SRST, with a weird but usable prototype.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i8 *@memchr(i8 *%src, i16 %char, i32 %len)
+
+; Test a simple forwarded call.
+define i8 *@f1(i8 *%src, i16 %char, i32 %len) {
+; CHECK-LABEL: f1:
+; CHECK-DAG: lgr [[REG:%r[1-5]]], %r2
+; CHECK-DAG: algfr %r2, %r4
+; CHECK-DAG: llcr %r0, %r3
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK: srst %r2, [[REG]]
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK: jl {{\.L.*}}
+; CHECK: lghi %r2, 0
+; CHECK: br %r14
+ %res = call i8 *@memchr(i8 *%src, i16 %char, i32 %len)
+ ret i8 *%res
+}
diff --git a/test/CodeGen/SystemZ/memchr-02.ll b/test/CodeGen/SystemZ/memchr-02.ll
new file mode 100644
index 0000000..982b396
--- /dev/null
+++ b/test/CodeGen/SystemZ/memchr-02.ll
@@ -0,0 +1,57 @@
+; Test memchr using SRST, with the correct prototype.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i8 *@memchr(i8 *%src, i32 %char, i64 %len)
+
+; Test a simple forwarded call.
+define i8 *@f1(i64 %len, i8 *%src, i32 %char) {
+; CHECK-LABEL: f1:
+; CHECK-DAG: agr %r2, %r3
+; CHECK-DAG: llcr %r0, %r4
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK: srst %r2, %r3
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK: jl {{\.L.*}}
+; CHECK: lghi %r2, 0
+; CHECK: br %r14
+ %res = call i8 *@memchr(i8 *%src, i32 %char, i64 %len)
+ ret i8 *%res
+}
+
+; Test a doubled call with no use of %r0 in between. There should be a
+; single load of %r0.
+define i8 *@f2(i8 *%src, i8 *%charptr, i64 %len) {
+; CHECK-LABEL: f2:
+; CHECK: llc %r0, 0(%r3)
+; CHECK-NOT: %r0
+; CHECK: srst [[RES1:%r[1-5]]], %r2
+; CHECK-NOT: %r0
+; CHECK: srst %r2, [[RES1]]
+; CHECK: br %r14
+ %char = load volatile i8 *%charptr
+ %charext = zext i8 %char to i32
+ %res1 = call i8 *@memchr(i8 *%src, i32 %charext, i64 %len)
+ %res2 = call i8 *@memchr(i8 *%res1, i32 %charext, i64 %len)
+ ret i8 *%res2
+}
+
+; Test a doubled call with a use of %r0 in between. %r0 must be loaded
+; for each loop.
+define i8 *@f3(i8 *%src, i8 *%charptr, i64 %len) {
+; CHECK-LABEL: f3:
+; CHECK: llc [[CHAR:%r[1-5]]], 0(%r3)
+; CHECK: lr %r0, [[CHAR]]
+; CHECK: srst [[RES1:%r[1-5]]], %r2
+; CHECK: lhi %r0, 0
+; CHECK: blah %r0
+; CHECK: lr %r0, [[CHAR]]
+; CHECK: srst %r2, [[RES1]]
+; CHECK: br %r14
+ %char = load volatile i8 *%charptr
+ %charext = zext i8 %char to i32
+ %res1 = call i8 *@memchr(i8 *%src, i32 %charext, i64 %len)
+ call void asm sideeffect "blah $0", "{r0}" (i32 0)
+ %res2 = call i8 *@memchr(i8 *%res1, i32 %charext, i64 %len)
+ ret i8 *%res2
+}
diff --git a/test/CodeGen/SystemZ/memcmp-01.ll b/test/CodeGen/SystemZ/memcmp-01.ll
new file mode 100644
index 0000000..a014419
--- /dev/null
+++ b/test/CodeGen/SystemZ/memcmp-01.ll
@@ -0,0 +1,221 @@
+; Test memcmp using CLC, with i32 results.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare signext i32 @memcmp(i8 *%src1, i8 *%src2, i64 %size)
+
+; Zero-length comparisons should be optimized away.
+define i32 @f1(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f1:
+; CHECK: lhi %r2, 0
+; CHECK: br %r14
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 0)
+ ret i32 %res
+}
+
+; Check a case where the result is used as an integer.
+define i32 @f2(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f2:
+; CHECK: clc 0(2,%r2), 0(%r3)
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: srl [[REG]], 28
+; CHECK: rll %r2, [[REG]], 31
+; CHECK: br %r14
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 2)
+ ret i32 %res
+}
+
+; Check a case where the result is tested for equality.
+define void @f3(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f3:
+; CHECK: clc 0(3,%r2), 0(%r3)
+; CHECK-NEXT: je {{\..*}}
+; CHECK: br %r14
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 3)
+ %cmp = icmp eq i32 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 0, i32 *%dest
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check a case where the result is tested for inequality.
+define void @f4(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f4:
+; CHECK: clc 0(4,%r2), 0(%r3)
+; CHECK-NEXT: jlh {{\..*}}
+; CHECK: br %r14
+entry:
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 4)
+ %cmp = icmp ne i32 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 0, i32 *%dest
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check a case where the result is tested via slt.
+define void @f5(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f5:
+; CHECK: clc 0(5,%r2), 0(%r3)
+; CHECK-NEXT: jl {{\..*}}
+; CHECK: br %r14
+entry:
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 5)
+ %cmp = icmp slt i32 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 0, i32 *%dest
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check a case where the result is tested for sgt.
+define void @f6(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f6:
+; CHECK: clc 0(6,%r2), 0(%r3)
+; CHECK-NEXT: jh {{\..*}}
+; CHECK: br %r14
+entry:
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 6)
+ %cmp = icmp sgt i32 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 0, i32 *%dest
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check the upper end of the CLC range. Here the result is used both as
+; an integer and for branching.
+define i32 @f7(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f7:
+; CHECK: clc 0(256,%r2), 0(%r3)
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: srl [[REG]], 28
+; CHECK: rll %r2, [[REG]], 31
+; CHECK: jl {{.L*}}
+; CHECK: br %r14
+entry:
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 256)
+ %cmp = icmp slt i32 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 0, i32 *%dest
+ br label %exit
+
+exit:
+ ret i32 %res
+}
+
+; 257 bytes needs two CLCs.
+define i32 @f8(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f8:
+; CHECK: clc 0(256,%r2), 0(%r3)
+; CHECK: jlh [[LABEL:\..*]]
+; CHECK: clc 256(1,%r2), 256(%r3)
+; CHECK: [[LABEL]]:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: br %r14
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 257)
+ ret i32 %res
+}
+
+; Test a comparison of 257 bytes in which the CC result can be used directly.
+define void @f9(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f9:
+; CHECK: clc 0(256,%r2), 0(%r3)
+; CHECK: jlh [[LABEL:\..*]]
+; CHECK: clc 256(1,%r2), 256(%r3)
+; CHECK: [[LABEL]]:
+; CHECK-NEXT: jl .L
+; CHECK: br %r14
+entry:
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 257)
+ %cmp = icmp slt i32 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 0, i32 *%dest
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Test the largest size that can use two CLCs.
+define i32 @f10(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f10:
+; CHECK: clc 0(256,%r2), 0(%r3)
+; CHECK: jlh [[LABEL:\..*]]
+; CHECK: clc 256(256,%r2), 256(%r3)
+; CHECK: [[LABEL]]:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: br %r14
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 512)
+ ret i32 %res
+}
+
+; Test the smallest size that needs 3 CLCs.
+define i32 @f11(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f11:
+; CHECK: clc 0(256,%r2), 0(%r3)
+; CHECK: jlh [[LABEL:\..*]]
+; CHECK: clc 256(256,%r2), 256(%r3)
+; CHECK: jlh [[LABEL]]
+; CHECK: clc 512(1,%r2), 512(%r3)
+; CHECK: [[LABEL]]:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: br %r14
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 513)
+ ret i32 %res
+}
+
+; Test the largest size that can use 3 CLCs.
+define i32 @f12(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f12:
+; CHECK: clc 0(256,%r2), 0(%r3)
+; CHECK: jlh [[LABEL:\..*]]
+; CHECK: clc 256(256,%r2), 256(%r3)
+; CHECK: jlh [[LABEL]]
+; CHECK: clc 512(256,%r2), 512(%r3)
+; CHECK: [[LABEL]]:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: br %r14
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 768)
+ ret i32 %res
+}
+
+; The next size up uses a loop instead. We leave the more complicated
+; loop tests to memcpy-01.ll, which shares the same form.
+define i32 @f13(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f13:
+; CHECK: lghi [[COUNT:%r[0-5]]], 3
+; CHECK: [[LOOP:.L[^:]*]]:
+; CHECK: clc 0(256,%r2), 0(%r3)
+; CHECK: jlh [[LABEL:\..*]]
+; CHECK-DAG: la %r2, 256(%r2)
+; CHECK-DAG: la %r3, 256(%r3)
+; CHECK: brctg [[COUNT]], [[LOOP]]
+; CHECK: clc 0(1,%r2), 0(%r3)
+; CHECK: [[LABEL]]:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: br %r14
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 769)
+ ret i32 %res
+}
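
The ipm/srl/rll sequence in f2 and f7 converts the condition code set by CLC into a memcmp-style integer: after IPM and the 28-bit shift the register holds the raw condition code (0 operands equal, 1 first operand low, 2 first operand high), and rotating that right by one bit yields 0, a negative value or 1 respectively. A small C model of that last step, offered as a reading of the checked sequence rather than a description of what the backend must emit:

    #include <assert.h>
    #include <stdint.h>

    /* Model of "rll %r2, [[REG]], 31" applied to the 2-bit CC value. */
    static int32_t cc_to_memcmp_result(uint32_t cc)
    {
        return (int32_t)((cc >> 1) | (cc << 31));   /* rotate right by 1 */
    }

    int main(void)
    {
        assert(cc_to_memcmp_result(0) == 0);   /* CC0: equal              */
        assert(cc_to_memcmp_result(1) < 0);    /* CC1: first operand low  */
        assert(cc_to_memcmp_result(2) > 0);    /* CC2: first operand high */
        return 0;
    }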
diff --git a/test/CodeGen/SystemZ/memcmp-02.ll b/test/CodeGen/SystemZ/memcmp-02.ll
new file mode 100644
index 0000000..74b090d
--- /dev/null
+++ b/test/CodeGen/SystemZ/memcmp-02.ll
@@ -0,0 +1,139 @@
+; Test memcmp using CLC, with i64 results.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i64 @memcmp(i8 *%src1, i8 *%src2, i64 %size)
+
+; Zero-length comparisons should be optimized away.
+define i64 @f1(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f1:
+; CHECK: lghi %r2, 0
+; CHECK: br %r14
+ %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 0)
+ ret i64 %res
+}
+
+; Check a case where the result is used as an integer.
+define i64 @f2(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f2:
+; CHECK: clc 0(2,%r2), 0(%r3)
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: srl [[REG]], 28
+; CHECK: rll [[REG]], [[REG]], 31
+; CHECK: lgfr %r2, [[REG]]
+; CHECK: br %r14
+ %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 2)
+ ret i64 %res
+}
+
+; Check a case where the result is tested for equality.
+define void @f3(i8 *%src1, i8 *%src2, i64 *%dest) {
+; CHECK-LABEL: f3:
+; CHECK: clc 0(3,%r2), 0(%r3)
+; CHECK-NEXT: je {{\..*}}
+; CHECK: br %r14
+ %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 3)
+ %cmp = icmp eq i64 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i64 0, i64 *%dest
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check a case where the result is tested for inequality.
+define void @f4(i8 *%src1, i8 *%src2, i64 *%dest) {
+; CHECK-LABEL: f4:
+; CHECK: clc 0(4,%r2), 0(%r3)
+; CHECK-NEXT: jlh {{\..*}}
+; CHECK: br %r14
+entry:
+ %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 4)
+ %cmp = icmp ne i64 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i64 0, i64 *%dest
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check a case where the result is tested via slt.
+define void @f5(i8 *%src1, i8 *%src2, i64 *%dest) {
+; CHECK-LABEL: f5:
+; CHECK: clc 0(5,%r2), 0(%r3)
+; CHECK-NEXT: jl {{\..*}}
+; CHECK: br %r14
+entry:
+ %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 5)
+ %cmp = icmp slt i64 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i64 0, i64 *%dest
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check a case where the result is tested for sgt.
+define void @f6(i8 *%src1, i8 *%src2, i64 *%dest) {
+; CHECK-LABEL: f6:
+; CHECK: clc 0(6,%r2), 0(%r3)
+; CHECK-NEXT: jh {{\..*}}
+; CHECK: br %r14
+entry:
+ %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 6)
+ %cmp = icmp sgt i64 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i64 0, i64 *%dest
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check the upper end of the CLC range. Here the result is used both as
+; an integer and for branching.
+define i64 @f7(i8 *%src1, i8 *%src2, i64 *%dest) {
+; CHECK-LABEL: f7:
+; CHECK: clc 0(256,%r2), 0(%r3)
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: srl [[REG]], 28
+; CHECK: rll [[REG]], [[REG]], 31
+; CHECK: lgfr %r2, [[REG]]
+; CHECK: jl {{.L*}}
+; CHECK: br %r14
+entry:
+ %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 256)
+ %cmp = icmp slt i64 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i64 0, i64 *%dest
+ br label %exit
+
+exit:
+ ret i64 %res
+}
+
+; 257 bytes needs two CLCs.
+define i64 @f8(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f8:
+; CHECK: clc 0(256,%r2), 0(%r3)
+; CHECK: jlh [[LABEL:\..*]]
+; CHECK: clc 256(1,%r2), 256(%r3)
+; CHECK: [[LABEL]]:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK: br %r14
+ %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 257)
+ ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/memcpy-01.ll b/test/CodeGen/SystemZ/memcpy-01.ll
index 7cb58b3..b53ec54 100644
--- a/test/CodeGen/SystemZ/memcpy-01.ll
+++ b/test/CodeGen/SystemZ/memcpy-01.ll
@@ -4,7 +4,9 @@
declare void @llvm.memcpy.p0i8.p0i8.i32(i8 *nocapture, i8 *nocapture, i32, i32, i1) nounwind
declare void @llvm.memcpy.p0i8.p0i8.i64(i8 *nocapture, i8 *nocapture, i64, i32, i1) nounwind
+declare void @foo(i8 *, i8 *)
+; Test a no-op move, i32 version.
define void @f1(i8 *%dest, i8 *%src) {
; CHECK-LABEL: f1:
; CHECK-NOT: %r2
@@ -15,6 +17,7 @@ define void @f1(i8 *%dest, i8 *%src) {
ret void
}
+; Test a no-op move, i64 version.
define void @f2(i8 *%dest, i8 *%src) {
; CHECK-LABEL: f2:
; CHECK-NOT: %r2
@@ -25,6 +28,7 @@ define void @f2(i8 *%dest, i8 *%src) {
ret void
}
+; Test a 1-byte move, i32 version.
define void @f3(i8 *%dest, i8 *%src) {
; CHECK-LABEL: f3:
; CHECK: mvc 0(1,%r2), 0(%r3)
@@ -34,6 +38,7 @@ define void @f3(i8 *%dest, i8 *%src) {
ret void
}
+; Test a 1-byte move, i64 version.
define void @f4(i8 *%dest, i8 *%src) {
; CHECK-LABEL: f4:
; CHECK: mvc 0(1,%r2), 0(%r3)
@@ -43,6 +48,7 @@ define void @f4(i8 *%dest, i8 *%src) {
ret void
}
+; Test the upper range of a single MVC, i32 version.
define void @f5(i8 *%dest, i8 *%src) {
; CHECK-LABEL: f5:
; CHECK: mvc 0(256,%r2), 0(%r3)
@@ -52,6 +58,7 @@ define void @f5(i8 *%dest, i8 *%src) {
ret void
}
+; Test the upper range of a single MVC, i64 version.
define void @f6(i8 *%dest, i8 *%src) {
; CHECK-LABEL: f6:
; CHECK: mvc 0(256,%r2), 0(%r3)
@@ -61,22 +68,168 @@ define void @f6(i8 *%dest, i8 *%src) {
ret void
}
-; 257 bytes is too big for a single MVC. For now expect none, so that
-; the test fails and gets updated when large copies are implemented.
+; Test the first case that needs two MVCs.
define void @f7(i8 *%dest, i8 *%src) {
; CHECK-LABEL: f7:
-; CHECK-NOT: mvc
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: mvc 256(1,%r2), 256(%r3)
; CHECK: br %r14
call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%dest, i8 *%src, i32 257, i32 1,
i1 false)
ret void
}
+; Test the last-but-one case that needs two MVCs.
define void @f8(i8 *%dest, i8 *%src) {
; CHECK-LABEL: f8:
-; CHECK-NOT: mvc
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: mvc 256(255,%r2), 256(%r3)
+; CHECK: br %r14
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 511, i32 1,
+ i1 false)
+ ret void
+}
+
+; Test the last case that needs two MVCs.
+define void @f9(i8 *%dest, i8 *%src) {
+; CHECK-LABEL: f9:
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: mvc 256(256,%r2), 256(%r3)
+; CHECK: br %r14
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 512, i32 1,
+ i1 false)
+ ret void
+}
+
+; Test an arbitrary value that uses straight-line code.
+define void @f10(i8 *%dest, i8 *%src) {
+; CHECK-LABEL: f10:
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: mvc 256(256,%r2), 256(%r3)
+; CHECK: mvc 512(256,%r2), 512(%r3)
+; CHECK: mvc 768(256,%r2), 768(%r3)
+; CHECK: mvc 1024(255,%r2), 1024(%r3)
+; CHECK: br %r14
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1279, i32 1,
+ i1 false)
+ ret void
+}
+
+; ...and again in cases where not all parts are in range of MVC.
+define void @f11(i8 *%srcbase, i8 *%destbase) {
+; CHECK-LABEL: f11:
+; CHECK: mvc 4000(256,%r2), 3500(%r3)
+; CHECK: lay [[NEWDEST:%r[1-5]]], 4256(%r2)
+; CHECK: mvc 0(256,[[NEWDEST]]), 3756(%r3)
+; CHECK: mvc 256(256,[[NEWDEST]]), 4012(%r3)
+; CHECK: lay [[NEWSRC:%r[1-5]]], 4268(%r3)
+; CHECK: mvc 512(256,[[NEWDEST]]), 0([[NEWSRC]])
+; CHECK: mvc 768(255,[[NEWDEST]]), 256([[NEWSRC]])
+; CHECK: br %r14
+ %dest = getelementptr i8 *%srcbase, i64 4000
+ %src = getelementptr i8* %destbase, i64 3500
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1279, i32 1,
+ i1 false)
+ ret void
+}
+
+; ...and again with a destination frame base that goes out of range.
+define void @f12() {
+; CHECK-LABEL: f12:
+; CHECK: brasl %r14, foo@PLT
+; CHECK: mvc 4076(256,%r15), 2100(%r15)
+; CHECK: lay [[NEWDEST:%r[1-5]]], 4332(%r15)
+; CHECK: mvc 0(256,[[NEWDEST]]), 2356(%r15)
+; CHECK: mvc 256(256,[[NEWDEST]]), 2612(%r15)
+; CHECK: mvc 512(256,[[NEWDEST]]), 2868(%r15)
+; CHECK: mvc 768(255,[[NEWDEST]]), 3124(%r15)
+; CHECK: brasl %r14, foo@PLT
+; CHECK: br %r14
+ %arr = alloca [6000 x i8]
+ %dest = getelementptr [6000 x i8] *%arr, i64 0, i64 3900
+ %src = getelementptr [6000 x i8] *%arr, i64 0, i64 1924
+ call void @foo(i8 *%dest, i8 *%src)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1279, i32 1,
+ i1 false)
+ call void @foo(i8 *%dest, i8 *%src)
+ ret void
+}
+
+; ...and again with a source frame base that goes out of range.
+define void @f13() {
+; CHECK-LABEL: f13:
+; CHECK: brasl %r14, foo@PLT
+; CHECK: mvc 200(256,%r15), 3826(%r15)
+; CHECK: mvc 456(256,%r15), 4082(%r15)
+; CHECK: lay [[NEWSRC:%r[1-5]]], 4338(%r15)
+; CHECK: mvc 712(256,%r15), 0([[NEWSRC]])
+; CHECK: mvc 968(256,%r15), 256([[NEWSRC]])
+; CHECK: mvc 1224(255,%r15), 512([[NEWSRC]])
+; CHECK: brasl %r14, foo@PLT
+; CHECK: br %r14
+ %arr = alloca [6000 x i8]
+ %dest = getelementptr [6000 x i8] *%arr, i64 0, i64 24
+ %src = getelementptr [6000 x i8] *%arr, i64 0, i64 3650
+ call void @foo(i8 *%dest, i8 *%src)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1279, i32 1,
+ i1 false)
+ call void @foo(i8 *%dest, i8 *%src)
+ ret void
+}
+
+; Test the last case that is done using straight-line code.
+define void @f14(i8 *%dest, i8 *%src) {
+; CHECK-LABEL: f14:
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: mvc 256(256,%r2), 256(%r3)
+; CHECK: mvc 512(256,%r2), 512(%r3)
+; CHECK: mvc 768(256,%r2), 768(%r3)
+; CHECK: mvc 1024(256,%r2), 1024(%r3)
+; CHECK: mvc 1280(256,%r2), 1280(%r3)
+; CHECK: br %r14
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1536, i32 1,
+ i1 false)
+ ret void
+}
+
+; Test the first case that is done using a loop.
+define void @f15(i8 *%dest, i8 *%src) {
+; CHECK-LABEL: f15:
+; CHECK: lghi [[COUNT:%r[0-5]]], 6
+; CHECK: [[LABEL:\.L[^:]*]]:
+; CHECK: pfd 2, 768(%r2)
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: la %r2, 256(%r2)
+; CHECK: la %r3, 256(%r3)
+; CHECK: brctg [[COUNT]], [[LABEL]]
+; CHECK: mvc 0(1,%r2), 0(%r3)
+; CHECK: br %r14
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1537, i32 1,
+ i1 false)
+ ret void
+}
+
+; ...and again with frame bases, where the base must be loaded into a
+; register before the loop.
+define void @f16() {
+; CHECK-LABEL: f16:
+; CHECK: brasl %r14, foo@PLT
+; CHECK-DAG: lghi [[COUNT:%r[0-5]]], 6
+; CHECK-DAG: la [[BASE:%r[0-5]]], 160(%r15)
+; CHECK: [[LABEL:\.L[^:]*]]:
+; CHECK: pfd 2, 2368([[BASE]])
+; CHECK: mvc 1600(256,[[BASE]]), 0([[BASE]])
+; CHECK: la [[BASE]], 256([[BASE]])
+; CHECK: brctg [[COUNT]], [[LABEL]]
+; CHECK: mvc 1600(1,[[BASE]]), 0([[BASE]])
+; CHECK: brasl %r14, foo@PLT
; CHECK: br %r14
- call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 257, i32 1,
+ %arr = alloca [3200 x i8]
+ %dest = getelementptr [3200 x i8] *%arr, i64 0, i64 1600
+ %src = getelementptr [3200 x i8] *%arr, i64 0, i64 0
+ call void @foo(i8 *%dest, i8 *%src)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1537, i32 1,
i1 false)
+ call void @foo(i8 *%dest, i8 *%src)
ret void
}
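
Taken together, f7-f16 pin down the expansion strategy these tests expect: up to six 256-byte blocks (1536 bytes, f14) are copied with straight-line MVCs, while anything longer switches to the MVC loop of f15/f16 with a trailing partial copy. A rough C model of that split, offered only as a reading of the thresholds exercised here:

    #include <stdint.h>
    #include <string.h>

    /* Rough model of the MVC-based expansion the tests above check:
       at most six straight-line 256-byte copies, otherwise a loop that
       advances by 256 bytes per iteration plus a partial tail copy. */
    static void mvc_style_copy(uint8_t *dest, const uint8_t *src,
                               uint64_t len)
    {
        if (len <= 6 * 256) {                    /* f7-f14: straight line */
            for (uint64_t off = 0; off < len; off += 256) {
                uint64_t chunk = len - off < 256 ? len - off : 256;
                memcpy(dest + off, src + off, chunk);  /* one MVC each */
            }
            return;
        }
        while (len > 256) {                      /* f15/f16: brctg loop  */
            memcpy(dest, src, 256);
            dest += 256; src += 256; len -= 256;
        }
        memcpy(dest, src, len);                  /* trailing partial MVC */
    }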
diff --git a/test/CodeGen/SystemZ/memcpy-02.ll b/test/CodeGen/SystemZ/memcpy-02.ll
index 83b2cd8..2b01091 100644
--- a/test/CodeGen/SystemZ/memcpy-02.ll
+++ b/test/CodeGen/SystemZ/memcpy-02.ll
@@ -2,11 +2,14 @@
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-@g1 = global i8 1
-@g2 = global i16 2
+@g1src = global i8 1
+@g1dst = global i8 1
+@g2src = global i16 2
+@g2dst = global i16 2
@g3 = global i32 3
@g4 = global i64 4
-@g5 = external global fp128, align 16
+@g5src = external global fp128, align 16
+@g5dst = external global fp128, align 16
; Test the simple i8 case.
define void @f1(i8 *%ptr1) {
@@ -237,18 +240,19 @@ define void @f19(i64 *%ptr1) {
ret void
}
-; Test that MVC is used for aligned loads and stores, even if there is
-; no way of telling whether they alias.
+; Test that MVC is not used for aligned loads and stores if there is
+; no way of telling whether they alias. We don't want to use MVC in
+; cases where the addresses could be equal.
define void @f20(i64 *%ptr1, i64 *%ptr2) {
; CHECK-LABEL: f20:
-; CHECK: mvc 0(8,%r3), 0(%r2)
+; CHECK-NOT: mvc
; CHECK: br %r14
%val = load i64 *%ptr1
store i64 %val, i64 *%ptr2
ret void
}
-; ...but if the loads aren't aligned, we can't be sure.
+; ...and again for unaligned loads and stores.
define void @f21(i64 *%ptr1, i64 *%ptr2) {
; CHECK-LABEL: f21:
; CHECK-NOT: mvc
@@ -274,50 +278,29 @@ define void @f22(i64 %base) {
; Test that we can use MVC for global addresses for i8.
define void @f23(i8 *%ptr) {
; CHECK-LABEL: f23:
-; CHECK: larl [[REG:%r[0-5]]], g1
-; CHECK: mvc 0(1,%r2), 0([[REG]])
+; CHECK-DAG: larl [[SRC:%r[0-5]]], g1src
+; CHECK-DAG: larl [[DST:%r[0-5]]], g1dst
+; CHECK: mvc 0(1,[[DST]]), 0([[SRC]])
; CHECK: br %r14
- %val = load i8 *@g1
- store i8 %val, i8 *%ptr
+ %val = load i8 *@g1src
+ store i8 %val, i8 *@g1dst
ret void
}
-; ...and again with the global on the store.
-define void @f24(i8 *%ptr) {
+; Test that we use LHRL and STHRL for i16.
+define void @f24(i16 *%ptr) {
; CHECK-LABEL: f24:
-; CHECK: larl [[REG:%r[0-5]]], g1
-; CHECK: mvc 0(1,[[REG]]), 0(%r2)
-; CHECK: br %r14
- %val = load i8 *%ptr
- store i8 %val, i8 *@g1
- ret void
-}
-
-; Test that we use LHRL for i16.
-define void @f25(i16 *%ptr) {
-; CHECK-LABEL: f25:
-; CHECK: lhrl [[REG:%r[0-5]]], g2
-; CHECK: sth [[REG]], 0(%r2)
+; CHECK: lhrl [[REG:%r[0-5]]], g2src
+; CHECK: sthrl [[REG]], g2dst
; CHECK: br %r14
- %val = load i16 *@g2
- store i16 %val, i16 *%ptr
- ret void
-}
-
-; ...likewise STHRL.
-define void @f26(i16 *%ptr) {
-; CHECK-LABEL: f26:
-; CHECK: lh [[REG:%r[0-5]]], 0(%r2)
-; CHECK: sthrl [[REG]], g2
-; CHECK: br %r14
- %val = load i16 *%ptr
- store i16 %val, i16 *@g2
+ %val = load i16 *@g2src
+ store i16 %val, i16 *@g2dst
ret void
}
; Test that we use LRL for i32.
-define void @f27(i32 *%ptr) {
-; CHECK-LABEL: f27:
+define void @f25(i32 *%ptr) {
+; CHECK-LABEL: f25:
; CHECK: lrl [[REG:%r[0-5]]], g3
; CHECK: st [[REG]], 0(%r2)
; CHECK: br %r14
@@ -327,8 +310,8 @@ define void @f27(i32 *%ptr) {
}
; ...likewise STRL.
-define void @f28(i32 *%ptr) {
-; CHECK-LABEL: f28:
+define void @f26(i32 *%ptr) {
+; CHECK-LABEL: f26:
; CHECK: l [[REG:%r[0-5]]], 0(%r2)
; CHECK: strl [[REG]], g3
; CHECK: br %r14
@@ -338,8 +321,8 @@ define void @f28(i32 *%ptr) {
}
; Test that we use LGRL for i64.
-define void @f29(i64 *%ptr) {
-; CHECK-LABEL: f29:
+define void @f27(i64 *%ptr) {
+; CHECK-LABEL: f27:
; CHECK: lgrl [[REG:%r[0-5]]], g4
; CHECK: stg [[REG]], 0(%r2)
; CHECK: br %r14
@@ -349,8 +332,8 @@ define void @f29(i64 *%ptr) {
}
; ...likewise STGRL.
-define void @f30(i64 *%ptr) {
-; CHECK-LABEL: f30:
+define void @f28(i64 *%ptr) {
+; CHECK-LABEL: f28:
; CHECK: lg [[REG:%r[0-5]]], 0(%r2)
; CHECK: stgrl [[REG]], g4
; CHECK: br %r14
@@ -360,30 +343,20 @@ define void @f30(i64 *%ptr) {
}
; Test that we can use MVC for global addresses for fp128.
-define void @f31(fp128 *%ptr) {
-; CHECK-LABEL: f31:
-; CHECK: larl [[REG:%r[0-5]]], g5
-; CHECK: mvc 0(16,%r2), 0([[REG]])
-; CHECK: br %r14
- %val = load fp128 *@g5, align 16
- store fp128 %val, fp128 *%ptr, align 16
- ret void
-}
-
-; ...and again with the global on the store.
-define void @f32(fp128 *%ptr) {
-; CHECK-LABEL: f32:
-; CHECK: larl [[REG:%r[0-5]]], g5
-; CHECK: mvc 0(16,[[REG]]), 0(%r2)
+define void @f29(fp128 *%ptr) {
+; CHECK-LABEL: f29:
+; CHECK-DAG: larl [[SRC:%r[0-5]]], g5src
+; CHECK-DAG: larl [[DST:%r[0-5]]], g5dst
+; CHECK: mvc 0(16,[[DST]]), 0([[SRC]])
; CHECK: br %r14
- %val = load fp128 *%ptr, align 16
- store fp128 %val, fp128 *@g5, align 16
+ %val = load fp128 *@g5src, align 16
+ store fp128 %val, fp128 *@g5dst, align 16
ret void
}
; Test a case where offset disambiguation is enough.
-define void @f33(i64 *%ptr1) {
-; CHECK-LABEL: f33:
+define void @f30(i64 *%ptr1) {
+; CHECK-LABEL: f30:
; CHECK: mvc 8(8,%r2), 0(%r2)
; CHECK: br %r14
%ptr2 = getelementptr i64 *%ptr1, i64 1
@@ -393,8 +366,8 @@ define void @f33(i64 *%ptr1) {
}
; Test f21 in cases where TBAA tells us there is no alias.
-define void @f34(i64 *%ptr1, i64 *%ptr2) {
-; CHECK-LABEL: f34:
+define void @f31(i64 *%ptr1, i64 *%ptr2) {
+; CHECK-LABEL: f31:
; CHECK: mvc 0(8,%r3), 0(%r2)
; CHECK: br %r14
%val = load i64 *%ptr1, align 2, !tbaa !1
@@ -403,8 +376,8 @@ define void @f34(i64 *%ptr1, i64 *%ptr2) {
}
; Test f21 in cases where TBAA is present but doesn't help.
-define void @f35(i64 *%ptr1, i64 *%ptr2) {
-; CHECK-LABEL: f35:
+define void @f32(i64 *%ptr1, i64 *%ptr2) {
+; CHECK-LABEL: f32:
; CHECK-NOT: mvc
; CHECK: br %r14
%val = load i64 *%ptr1, align 2, !tbaa !1
@@ -413,5 +386,7 @@ define void @f35(i64 *%ptr1, i64 *%ptr2) {
}
!0 = metadata !{ metadata !"root" }
-!1 = metadata !{ metadata !"set1", metadata !0 }
-!2 = metadata !{ metadata !"set2", metadata !0 }
+!1 = metadata !{ metadata !3, metadata !3, i64 0 }
+!2 = metadata !{ metadata !4, metadata !4, i64 0 }
+!3 = metadata !{ metadata !"set1", metadata !0 }
+!4 = metadata !{ metadata !"set2", metadata !0 }
diff --git a/test/CodeGen/SystemZ/memset-01.ll b/test/CodeGen/SystemZ/memset-01.ll
index b272a5b..f17901c 100644
--- a/test/CodeGen/SystemZ/memset-01.ll
+++ b/test/CodeGen/SystemZ/memset-01.ll
@@ -103,22 +103,58 @@ define void @f10(i8 *%dest, i8 %val) {
ret void
}
-; 258 bytes, i32 version. 258 bytes is too big for a single MVC.
-; For now expect none, so that the test fails and gets updated when
-; large copies are implemented.
+; 258 bytes, i32 version. We need two MVCs.
define void @f11(i8 *%dest, i8 %val) {
; CHECK-LABEL: f11:
-; CHECK-NOT: mvc
+; CHECK: stc %r3, 0(%r2)
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8 *%dest, i8 %val, i32 258, i32 1, i1 false)
ret void
}
-; 258 bytes, i64 version, with the same comments as above.
+; 258 bytes, i64 version.
define void @f12(i8 *%dest, i8 %val) {
; CHECK-LABEL: f12:
-; CHECK-NOT: mvc
+; CHECK: stc %r3, 0(%r2)
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8 *%dest, i8 %val, i64 258, i32 1, i1 false)
ret void
}
+
+; Test the largest case for which straight-line code is used.
+define void @f13(i8 *%dest, i8 %val) {
+; CHECK-LABEL: f13:
+; CHECK: stc %r3, 0(%r2)
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(256,%r2), 256(%r2)
+; CHECK: mvc 513(256,%r2), 512(%r2)
+; CHECK: mvc 769(256,%r2), 768(%r2)
+; CHECK: mvc 1025(256,%r2), 1024(%r2)
+; CHECK: mvc 1281(256,%r2), 1280(%r2)
+; CHECK: br %r14
+ call void @llvm.memset.p0i8.i64(i8 *%dest, i8 %val, i64 1537, i32 1,
+ i1 false)
+ ret void
+}
+
+; Test the next size up, which uses a loop. We leave the other corner
+; cases to memcpy-01.ll.
+define void @f14(i8 *%dest, i8 %val) {
+; CHECK-LABEL: f14:
+; CHECK: stc %r3, 0(%r2)
+; CHECK: lghi [[COUNT:%r[0-5]]], 6
+; CHECK: [[LABEL:\.L[^:]*]]:
+; CHECK: pfd 2, 769(%r2)
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: la %r2, 256(%r2)
+; CHECK: brctg [[COUNT]], [[LABEL]]
+; CHECK: mvc 1(1,%r2), 0(%r2)
+; CHECK: br %r14
+ call void @llvm.memset.p0i8.i64(i8 *%dest, i8 %val, i64 1538, i32 1,
+ i1 false)
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/memset-02.ll b/test/CodeGen/SystemZ/memset-02.ll
index b74d907..b4724c0 100644
--- a/test/CodeGen/SystemZ/memset-02.ll
+++ b/test/CodeGen/SystemZ/memset-02.ll
@@ -139,21 +139,23 @@ define void @f14(i8 *%dest) {
ret void
}
-; 258 bytes, i32 version. 258 bytes is too big for a single MVC.
-; For now expect none, so that the test fails and gets updated when
-; large copies are implemented.
+; 258 bytes, i32 version. We need two MVCs.
define void @f15(i8 *%dest) {
; CHECK-LABEL: f15:
-; CHECK-NOT: mvc
+; CHECK: mvi 0(%r2), 128
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8 *%dest, i8 128, i32 258, i32 1, i1 false)
ret void
}
-; 258 bytes, i64 version, with the same comments as above.
+; 258 bytes, i64 version.
define void @f16(i8 *%dest) {
; CHECK-LABEL: f16:
-; CHECK-NOT: mvc
+; CHECK: mvi 0(%r2), 128
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8 *%dest, i8 128, i64 258, i32 1, i1 false)
ret void
diff --git a/test/CodeGen/SystemZ/memset-03.ll b/test/CodeGen/SystemZ/memset-03.ll
index 1d48f1a..a95f89f 100644
--- a/test/CodeGen/SystemZ/memset-03.ll
+++ b/test/CodeGen/SystemZ/memset-03.ll
@@ -140,8 +140,7 @@ define void @f14(i8 *%dest) {
; 7 bytes, i32 version.
define void @f15(i8 *%dest) {
; CHECK-LABEL: f15:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(6,%r2), 0(%r2)
+; CHECK: xc 0(7,%r2), 0(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8 *%dest, i8 0, i32 7, i32 1, i1 false)
ret void
@@ -150,8 +149,7 @@ define void @f15(i8 *%dest) {
; 7 bytes, i64 version.
define void @f16(i8 *%dest) {
; CHECK-LABEL: f16:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(6,%r2), 0(%r2)
+; CHECK: xc 0(7,%r2), 0(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8 *%dest, i8 0, i64 7, i32 1, i1 false)
ret void
@@ -218,8 +216,7 @@ define void @f22(i8 *%dest) {
; 11 bytes, i32 version.
define void @f23(i8 *%dest) {
; CHECK-LABEL: f23:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(10,%r2), 0(%r2)
+; CHECK: xc 0(11,%r2), 0(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8 *%dest, i8 0, i32 11, i32 1, i1 false)
ret void
@@ -228,8 +225,7 @@ define void @f23(i8 *%dest) {
; 11 bytes, i64 version.
define void @f24(i8 *%dest) {
; CHECK-LABEL: f24:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(10,%r2), 0(%r2)
+; CHECK: xc 0(11,%r2), 0(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8 *%dest, i8 0, i64 11, i32 1, i1 false)
ret void
@@ -258,8 +254,7 @@ define void @f26(i8 *%dest) {
; 13 bytes, i32 version.
define void @f27(i8 *%dest) {
; CHECK-LABEL: f27:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(12,%r2), 0(%r2)
+; CHECK: xc 0(13,%r2), 0(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8 *%dest, i8 0, i32 13, i32 1, i1 false)
ret void
@@ -268,8 +263,7 @@ define void @f27(i8 *%dest) {
; 13 bytes, i64 version.
define void @f28(i8 *%dest) {
; CHECK-LABEL: f28:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(12,%r2), 0(%r2)
+; CHECK: xc 0(13,%r2), 0(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8 *%dest, i8 0, i64 13, i32 1, i1 false)
ret void
@@ -278,8 +272,7 @@ define void @f28(i8 *%dest) {
; 14 bytes, i32 version.
define void @f29(i8 *%dest) {
; CHECK-LABEL: f29:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(13,%r2), 0(%r2)
+; CHECK: xc 0(14,%r2), 0(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8 *%dest, i8 0, i32 14, i32 1, i1 false)
ret void
@@ -288,8 +281,7 @@ define void @f29(i8 *%dest) {
; 14 bytes, i64 version.
define void @f30(i8 *%dest) {
; CHECK-LABEL: f30:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(13,%r2), 0(%r2)
+; CHECK: xc 0(14,%r2), 0(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8 *%dest, i8 0, i64 14, i32 1, i1 false)
ret void
@@ -298,8 +290,7 @@ define void @f30(i8 *%dest) {
; 15 bytes, i32 version.
define void @f31(i8 *%dest) {
; CHECK-LABEL: f31:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(14,%r2), 0(%r2)
+; CHECK: xc 0(15,%r2), 0(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8 *%dest, i8 0, i32 15, i32 1, i1 false)
ret void
@@ -308,8 +299,7 @@ define void @f31(i8 *%dest) {
; 15 bytes, i64 version.
define void @f32(i8 *%dest) {
; CHECK-LABEL: f32:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(14,%r2), 0(%r2)
+; CHECK: xc 0(15,%r2), 0(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8 *%dest, i8 0, i64 15, i32 1, i1 false)
ret void
@@ -338,8 +328,7 @@ define void @f34(i8 *%dest) {
; 17 bytes, i32 version.
define void @f35(i8 *%dest) {
; CHECK-LABEL: f35:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(16,%r2), 0(%r2)
+; CHECK: xc 0(17,%r2), 0(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8 *%dest, i8 0, i32 17, i32 1, i1 false)
ret void
@@ -348,49 +337,46 @@ define void @f35(i8 *%dest) {
; 17 bytes, i64 version.
define void @f36(i8 *%dest) {
; CHECK-LABEL: f36:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(16,%r2), 0(%r2)
+; CHECK: xc 0(17,%r2), 0(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8 *%dest, i8 0, i64 17, i32 1, i1 false)
ret void
}
-; 257 bytes, i32 version.
+; 256 bytes, i32 version.
define void @f37(i8 *%dest) {
; CHECK-LABEL: f37:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: xc 0(256,%r2), 0(%r2)
; CHECK: br %r14
- call void @llvm.memset.p0i8.i32(i8 *%dest, i8 0, i32 257, i32 1, i1 false)
+ call void @llvm.memset.p0i8.i32(i8 *%dest, i8 0, i32 256, i32 1, i1 false)
ret void
}
-; 257 bytes, i64 version.
+; 256 bytes, i64 version.
define void @f38(i8 *%dest) {
; CHECK-LABEL: f38:
-; CHECK: mvi 0(%r2), 0
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: xc 0(256,%r2), 0(%r2)
; CHECK: br %r14
- call void @llvm.memset.p0i8.i64(i8 *%dest, i8 0, i64 257, i32 1, i1 false)
+ call void @llvm.memset.p0i8.i64(i8 *%dest, i8 0, i64 256, i32 1, i1 false)
ret void
}
-; 258 bytes, i32 version. 258 bytes is too big for a single MVC.
-; For now expect none, so that the test fails and gets updated when
-; large copies are implemented.
+; 257 bytes, i32 version. We need two XCs.
define void @f39(i8 *%dest) {
; CHECK-LABEL: f39:
-; CHECK-NOT: mvc
+; CHECK: xc 0(256,%r2), 0(%r2)
+; CHECK: xc 256(1,%r2), 256(%r2)
; CHECK: br %r14
- call void @llvm.memset.p0i8.i32(i8 *%dest, i8 0, i32 258, i32 1, i1 false)
+ call void @llvm.memset.p0i8.i32(i8 *%dest, i8 0, i32 257, i32 1, i1 false)
ret void
}
-; 258 bytes, i64 version, with the same comments as above.
+; 257 bytes, i64 version.
define void @f40(i8 *%dest) {
; CHECK-LABEL: f40:
-; CHECK-NOT: mvc
+; CHECK: xc 0(256,%r2), 0(%r2)
+; CHECK: xc 256(1,%r2), 256(%r2)
; CHECK: br %r14
- call void @llvm.memset.p0i8.i64(i8 *%dest, i8 0, i64 258, i32 1, i1 false)
+ call void @llvm.memset.p0i8.i64(i8 *%dest, i8 0, i64 257, i32 1, i1 false)
ret void
}
diff --git a/test/CodeGen/SystemZ/memset-04.ll b/test/CodeGen/SystemZ/memset-04.ll
index 9288692..7906e8d 100644
--- a/test/CodeGen/SystemZ/memset-04.ll
+++ b/test/CodeGen/SystemZ/memset-04.ll
@@ -375,21 +375,23 @@ define void @f38(i8 *%dest) {
ret void
}
-; 258 bytes, i32 version. 258 bytes is too big for a single MVC.
-; For now expect none, so that the test fails and gets updated when
-; large copies are implemented.
+; 258 bytes, i32 version. We need two MVCs.
define void @f39(i8 *%dest) {
; CHECK-LABEL: f39:
-; CHECK-NOT: mvc
+; CHECK: mvi 0(%r2), 255
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8 *%dest, i8 -1, i32 258, i32 1, i1 false)
ret void
}
-; 258 bytes, i64 version, with the same comments as above.
+; 258 bytes, i64 version.
define void @f40(i8 *%dest) {
; CHECK-LABEL: f40:
-; CHECK-NOT: mvc
+; CHECK: mvi 0(%r2), 255
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8 *%dest, i8 -1, i64 258, i32 1, i1 false)
ret void
diff --git a/test/CodeGen/SystemZ/or-08.ll b/test/CodeGen/SystemZ/or-08.ll
new file mode 100644
index 0000000..8f5bf31
--- /dev/null
+++ b/test/CodeGen/SystemZ/or-08.ll
@@ -0,0 +1,57 @@
+; Test memory-to-memory ORs.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Test the simple i8 case.
+define void @f1(i8 *%ptr1) {
+; CHECK-LABEL: f1:
+; CHECK: oc 1(1,%r2), 0(%r2)
+; CHECK: br %r14
+ %ptr2 = getelementptr i8 *%ptr1, i64 1
+ %val = load i8 *%ptr1
+ %old = load i8 *%ptr2
+ %or = or i8 %val, %old
+ store i8 %or, i8 *%ptr2
+ ret void
+}
+
+; Test the simple i16 case.
+define void @f2(i16 *%ptr1) {
+; CHECK-LABEL: f2:
+; CHECK: oc 2(2,%r2), 0(%r2)
+; CHECK: br %r14
+ %ptr2 = getelementptr i16 *%ptr1, i64 1
+ %val = load i16 *%ptr1
+ %old = load i16 *%ptr2
+ %or = or i16 %val, %old
+ store i16 %or, i16 *%ptr2
+ ret void
+}
+
+; Test the simple i32 case.
+define void @f3(i32 *%ptr1) {
+; CHECK-LABEL: f3:
+; CHECK: oc 4(4,%r2), 0(%r2)
+; CHECK: br %r14
+ %ptr2 = getelementptr i32 *%ptr1, i64 1
+ %val = load i32 *%ptr1
+ %old = load i32 *%ptr2
+ %or = or i32 %old, %val
+ store i32 %or, i32 *%ptr2
+ ret void
+}
+
+; Test the i64 case.
+define void @f4(i64 *%ptr1) {
+; CHECK-LABEL: f4:
+; CHECK: oc 8(8,%r2), 0(%r2)
+; CHECK: br %r14
+ %ptr2 = getelementptr i64 *%ptr1, i64 1
+ %val = load i64 *%ptr1
+ %old = load i64 *%ptr2
+ %or = or i64 %old, %val
+ store i64 %or, i64 *%ptr2
+ ret void
+}
+
+; Leave other more complicated tests to and-08.ll.
diff --git a/test/CodeGen/SystemZ/prefetch-01.ll b/test/CodeGen/SystemZ/prefetch-01.ll
new file mode 100644
index 0000000..bb7fea9
--- /dev/null
+++ b/test/CodeGen/SystemZ/prefetch-01.ll
@@ -0,0 +1,87 @@
+; Test data prefetching.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare void @llvm.prefetch(i8*, i32, i32, i32)
+
+@g = global [4096 x i8] zeroinitializer
+
+; Check that instruction read prefetches are ignored.
+define void @f1(i8 *%ptr) {
+; CHECK-LABEL: f1:
+; CHECK-NOT: %r2
+; CHECK: br %r14
+ call void @llvm.prefetch(i8 *%ptr, i32 0, i32 0, i32 0)
+ ret void
+}
+
+; Check that instruction write prefetches are ignored.
+define void @f2(i8 *%ptr) {
+; CHECK-LABEL: f2:
+; CHECK-NOT: %r2
+; CHECK: br %r14
+ call void @llvm.prefetch(i8 *%ptr, i32 1, i32 0, i32 0)
+ ret void
+}
+
+; Check data read prefetches.
+define void @f3(i8 *%ptr) {
+; CHECK-LABEL: f3:
+; CHECK: pfd 1, 0(%r2)
+; CHECK: br %r14
+ call void @llvm.prefetch(i8 *%ptr, i32 0, i32 0, i32 1)
+ ret void
+}
+
+; Check data write prefetches.
+define void @f4(i8 *%ptr) {
+; CHECK-LABEL: f4:
+; CHECK: pfd 2, 0(%r2)
+; CHECK: br %r14
+ call void @llvm.prefetch(i8 *%ptr, i32 1, i32 0, i32 1)
+ ret void
+}
+
+; Check an address at the negative end of the range.
+define void @f5(i8 *%base, i64 %index) {
+; CHECK-LABEL: f5:
+; CHECK: pfd 2, -524288({{%r2,%r3|%r3,%r2}})
+; CHECK: br %r14
+ %add = add i64 %index, -524288
+ %ptr = getelementptr i8 *%base, i64 %add
+ call void @llvm.prefetch(i8 *%ptr, i32 1, i32 0, i32 1)
+ ret void
+}
+
+; Check an address at the positive end of the range.
+define void @f6(i8 *%base, i64 %index) {
+; CHECK-LABEL: f6:
+; CHECK: pfd 2, 524287({{%r2,%r3|%r3,%r2}})
+; CHECK: br %r14
+ %add = add i64 %index, 524287
+ %ptr = getelementptr i8 *%base, i64 %add
+ call void @llvm.prefetch(i8 *%ptr, i32 1, i32 0, i32 1)
+ ret void
+}
+
+; Check that the next address up still compiles.
+define void @f7(i8 *%base, i64 %index) {
+; CHECK-LABEL: f7:
+; CHECK: 524288
+; CHECK: pfd 2,
+; CHECK: br %r14
+ %add = add i64 %index, 524288
+ %ptr = getelementptr i8 *%base, i64 %add
+ call void @llvm.prefetch(i8 *%ptr, i32 1, i32 0, i32 1)
+ ret void
+}
+
+; Check pc-relative prefetches.
+define void @f8() {
+; CHECK-LABEL: f8:
+; CHECK: pfdrl 2, g
+; CHECK: br %r14
+ %ptr = getelementptr [4096 x i8] *@g, i64 0, i64 0
+ call void @llvm.prefetch(i8 *%ptr, i32 1, i32 0, i32 1)
+ ret void
+}
diff --git a/test/CodeGen/SystemZ/risbg-01.ll b/test/CodeGen/SystemZ/risbg-01.ll
index 85de6dc..a4d11fd 100644
--- a/test/CodeGen/SystemZ/risbg-01.ll
+++ b/test/CodeGen/SystemZ/risbg-01.ll
@@ -1,6 +1,7 @@
; Test sequences that can use RISBG with a zeroed first operand.
+; The tests here assume that RISBLG isn't available.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
; Test an extraction of bit 0 from a right-shifted value.
define i32 @f1(i32 %foo) {
@@ -455,3 +456,17 @@ define i64 @f40(i64 %foo, i64 *%dest) {
%and = and i64 %shl, 2147483647
ret i64 %and
}
+
+; In this case the sign extension is converted to a pair of 32-bit shifts,
+; which is then extended to 64 bits. We previously used the wrong bit size
+; when testing whether the shifted-in bits of the shift right were significant.
+define i64 @f41(i1 %x) {
+; CHECK-LABEL: f41:
+; CHECK: sll %r2, 31
+; CHECK: sra %r2, 31
+; CHECK: llgcr %r2, %r2
+; CHECK: br %r14
+ %ext = sext i1 %x to i8
+ %ext2 = zext i8 %ext to i64
+ ret i64 %ext2
+}
diff --git a/test/CodeGen/SystemZ/setcc-01.ll b/test/CodeGen/SystemZ/setcc-01.ll
new file mode 100644
index 0000000..4626760
--- /dev/null
+++ b/test/CodeGen/SystemZ/setcc-01.ll
@@ -0,0 +1,74 @@
+; Test SETCC for every integer condition. The tests here assume that
+; RISBLG isn't available.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
+
+; Test CC in { 0 }, with 3 don't care.
+define i32 @f1(i32 %a, i32 %b) {
+; CHECK-LABEL: f1:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, -268435456
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+ %cond = icmp eq i32 %a, %b
+ %res = zext i1 %cond to i32
+ ret i32 %res
+}
+
+; Test CC in { 1 }, with 3 don't care.
+define i32 @f2(i32 %a, i32 %b) {
+; CHECK-LABEL: f2:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: risbg %r2, [[REG]], 63, 191, 36
+; CHECK: br %r14
+ %cond = icmp slt i32 %a, %b
+ %res = zext i1 %cond to i32
+ ret i32 %res
+}
+
+; Test CC in { 0, 1 }, with 3 don't care.
+define i32 @f3(i32 %a, i32 %b) {
+; CHECK-LABEL: f3:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, -536870912
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+ %cond = icmp sle i32 %a, %b
+ %res = zext i1 %cond to i32
+ ret i32 %res
+}
+
+; Test CC in { 2 }, with 3 don't care.
+define i32 @f4(i32 %a, i32 %b) {
+; CHECK-LABEL: f4:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: risbg %r2, [[REG]], 63, 191, 35
+; CHECK: br %r14
+ %cond = icmp sgt i32 %a, %b
+ %res = zext i1 %cond to i32
+ ret i32 %res
+}
+
+; Test CC in { 0, 2 }, with 3 don't care.
+define i32 @f5(i32 %a, i32 %b) {
+; CHECK-LABEL: f5:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: xilf [[REG]], 4294967295
+; CHECK-NEXT: risbg %r2, [[REG]], 63, 191, 36
+; CHECK: br %r14
+ %cond = icmp sge i32 %a, %b
+ %res = zext i1 %cond to i32
+ ret i32 %res
+}
+
+; Test CC in { 1, 2 }, with 3 don't care.
+define i32 @f6(i32 %a, i32 %b) {
+; CHECK-LABEL: f6:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, 1879048192
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+ %cond = icmp ne i32 %a, %b
+ %res = zext i1 %cond to i32
+ ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/setcc-02.ll b/test/CodeGen/SystemZ/setcc-02.ll
new file mode 100644
index 0000000..6a7be47
--- /dev/null
+++ b/test/CodeGen/SystemZ/setcc-02.ll
@@ -0,0 +1,174 @@
+; Test SETCC for every floating-point condition. The tests here assume that
+; RISBLG isn't available.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
+
+; Test CC in { 0 }
+define i32 @f1(float %a, float %b) {
+; CHECK-LABEL: f1:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, -268435456
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+ %cond = fcmp oeq float %a, %b
+ %res = zext i1 %cond to i32
+ ret i32 %res
+}
+
+; Test CC in { 1 }
+define i32 @f2(float %a, float %b) {
+; CHECK-LABEL: f2:
+; CHECK: ipm %r2
+; CHECK-NEXT: xilf %r2, 268435456
+; CHECK-NEXT: afi %r2, -268435456
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+ %cond = fcmp olt float %a, %b
+ %res = zext i1 %cond to i32
+ ret i32 %res
+}
+
+; Test CC in { 0, 1 }
+define i32 @f3(float %a, float %b) {
+; CHECK-LABEL: f3:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, -536870912
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+ %cond = fcmp ole float %a, %b
+ %res = zext i1 %cond to i32
+ ret i32 %res
+}
+
+; Test CC in { 2 }
+define i32 @f4(float %a, float %b) {
+; CHECK-LABEL: f4:
+; CHECK: ipm %r2
+; CHECK-NEXT: xilf %r2, 268435456
+; CHECK-NEXT: afi %r2, 1342177280
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+ %cond = fcmp ogt float %a, %b
+ %res = zext i1 %cond to i32
+ ret i32 %res
+}
+
+; Test CC in { 0, 2 }
+define i32 @f5(float %a, float %b) {
+; CHECK-LABEL: f5:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: xilf [[REG]], 4294967295
+; CHECK-NEXT: risbg %r2, [[REG]], 63, 191, 36
+; CHECK: br %r14
+ %cond = fcmp oge float %a, %b
+ %res = zext i1 %cond to i32
+ ret i32 %res
+}
+
+; Test CC in { 1, 2 }
+define i32 @f6(float %a, float %b) {
+; CHECK-LABEL: f6:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: afi [[REG]], 268435456
+; CHECK-NEXT: risbg %r2, [[REG]], 63, 191, 35
+; CHECK: br %r14
+ %cond = fcmp one float %a, %b
+ %res = zext i1 %cond to i32
+ ret i32 %res
+}
+
+; Test CC in { 0, 1, 2 }
+define i32 @f7(float %a, float %b) {
+; CHECK-LABEL: f7:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, -805306368
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+ %cond = fcmp ord float %a, %b
+ %res = zext i1 %cond to i32
+ ret i32 %res
+}
+
+; Test CC in { 3 }
+define i32 @f8(float %a, float %b) {
+; CHECK-LABEL: f8:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, 1342177280
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+ %cond = fcmp uno float %a, %b
+ %res = zext i1 %cond to i32
+ ret i32 %res
+}
+
+; Test CC in { 0, 3 }
+define i32 @f9(float %a, float %b) {
+; CHECK-LABEL: f9:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: afi [[REG]], -268435456
+; CHECK-NEXT: risbg %r2, [[REG]], 63, 191, 35
+; CHECK: br %r14
+ %cond = fcmp ueq float %a, %b
+ %res = zext i1 %cond to i32
+ ret i32 %res
+}
+
+; Test CC in { 1, 3 }
+define i32 @f10(float %a, float %b) {
+; CHECK-LABEL: f10:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: risbg %r2, [[REG]], 63, 191, 36
+; CHECK: br %r14
+ %cond = fcmp ult float %a, %b
+ %res = zext i1 %cond to i32
+ ret i32 %res
+}
+
+; Test CC in { 0, 1, 3 }
+define i32 @f11(float %a, float %b) {
+; CHECK-LABEL: f11:
+; CHECK: ipm %r2
+; CHECK-NEXT: xilf %r2, 268435456
+; CHECK-NEXT: afi %r2, -805306368
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+ %cond = fcmp ule float %a, %b
+ %res = zext i1 %cond to i32
+ ret i32 %res
+}
+
+; Test CC in { 2, 3 }
+define i32 @f12(float %a, float %b) {
+; CHECK-LABEL: f12:
+; CHECK: ipm [[REG:%r[0-5]]]
+; CHECK-NEXT: risbg %r2, [[REG]], 63, 191, 35
+; CHECK: br %r14
+ %cond = fcmp ugt float %a, %b
+ %res = zext i1 %cond to i32
+ ret i32 %res
+}
+
+; Test CC in { 0, 2, 3 }
+define i32 @f13(float %a, float %b) {
+; CHECK-LABEL: f13:
+; CHECK: ipm %r2
+; CHECK-NEXT: xilf %r2, 268435456
+; CHECK-NEXT: afi %r2, 1879048192
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+ %cond = fcmp uge float %a, %b
+ %res = zext i1 %cond to i32
+ ret i32 %res
+}
+
+; Test CC in { 1, 2, 3 }
+define i32 @f14(float %a, float %b) {
+; CHECK-LABEL: f14:
+; CHECK: ipm %r2
+; CHECK-NEXT: afi %r2, 1879048192
+; CHECK-NEXT: srl %r2, 31
+; CHECK: br %r14
+ %cond = fcmp une float %a, %b
+ %res = zext i1 %cond to i32
+ ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/shift-10.ll b/test/CodeGen/SystemZ/shift-10.ll
new file mode 100644
index 0000000..46ed218
--- /dev/null
+++ b/test/CodeGen/SystemZ/shift-10.ll
@@ -0,0 +1,78 @@
+; Test compound shifts.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Test a shift right followed by a sign extension. This can use two shifts.
+define i64 @f1(i32 %a) {
+; CHECK-LABEL: f1:
+; CHECK: sllg [[REG:%r[0-5]]], %r2, 62
+; CHECK: srag %r2, [[REG]], 63
+; CHECK: br %r14
+ %shr = lshr i32 %a, 1
+ %trunc = trunc i32 %shr to i1
+ %ext = sext i1 %trunc to i64
+ ret i64 %ext
+}
+
+; ...and again with the highest shift count.
+define i64 @f2(i32 %a) {
+; CHECK-LABEL: f2:
+; CHECK: sllg [[REG:%r[0-5]]], %r2, 32
+; CHECK: srag %r2, [[REG]], 63
+; CHECK: br %r14
+ %shr = lshr i32 %a, 31
+ %trunc = trunc i32 %shr to i1
+ %ext = sext i1 %trunc to i64
+ ret i64 %ext
+}
+
+; Test a left shift of an extended right shift in a case where folding
+; is possible.
+define i64 @f3(i32 %a) {
+; CHECK-LABEL: f3:
+; CHECK: risbg %r2, %r2, 27, 181, 9
+; CHECK: br %r14
+ %shr = lshr i32 %a, 1
+ %ext = zext i32 %shr to i64
+ %shl = shl i64 %ext, 10
+ %and = and i64 %shl, 137438952960
+ ret i64 %and
+}
+
+; ...and again with a larger right shift.
+define i64 @f4(i32 %a) {
+; CHECK-LABEL: f4:
+; CHECK: risbg %r2, %r2, 30, 158, 3
+; CHECK: br %r14
+ %shr = lshr i32 %a, 30
+ %ext = sext i32 %shr to i64
+ %shl = shl i64 %ext, 33
+ %and = and i64 %shl, 8589934592
+ ret i64 %and
+}
+
+; Repeat the previous test in a case where all bits outside the
+; bottom 3 matter.
+define i64 @f5(i32 %a) {
+; CHECK-LABEL: f5:
+; CHECK: risbg %r2, %r2, 29, 158, 3
+; CHECK: lhi %r2, 7
+; CHECK: br %r14
+ %shr = lshr i32 %a, 30
+ %ext = sext i32 %shr to i64
+ %shl = shl i64 %ext, 33
+ %or = or i64 %shl, 7
+ ret i64 %or
+}
+
+; Test that SRA gets replaced with SRL if the sign bit is the only one
+; that matters.
+define i64 @f6(i64 %a) {
+; CHECK-LABEL: f6:
+; CHECK: risbg %r2, %r2, 55, 183, 19
+; CHECK: br %r14
+ %shl = shl i64 %a, 10
+ %shr = ashr i64 %shl, 60
+ %and = and i64 %shr, 256
+ ret i64 %and
+}
diff --git a/test/CodeGen/SystemZ/spill-01.ll b/test/CodeGen/SystemZ/spill-01.ll
index 9de89d6..ca64a88 100644
--- a/test/CodeGen/SystemZ/spill-01.ll
+++ b/test/CodeGen/SystemZ/spill-01.ll
@@ -1,6 +1,7 @@
-; Test spilling using MVC.
+; Test spilling using MVC. The tests here assume z10 register pressure,
+; without the high words being available.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
declare void @foo()
diff --git a/test/CodeGen/SystemZ/strcmp-01.ll b/test/CodeGen/SystemZ/strcmp-01.ll
new file mode 100644
index 0000000..122c160
--- /dev/null
+++ b/test/CodeGen/SystemZ/strcmp-01.ll
@@ -0,0 +1,70 @@
+; Test strcmp using CLST, i32 version.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare signext i32 @strcmp(i8 *%src1, i8 *%src2)
+
+; Check a case where the result is used as an integer.
+define i32 @f1(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f1:
+; CHECK: lhi %r0, 0
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK: clst %r2, %r3
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NEXT: BB#{{[0-9]+}}
+; CHECK-NEXT: ipm [[REG:%r[0-5]]]
+; CHECK: srl [[REG]], 28
+; CHECK: rll %r2, [[REG]], 31
+; CHECK: br %r14
+ %res = call i32 @strcmp(i8 *%src1, i8 *%src2)
+ ret i32 %res
+}
+
+; Check a case where the result is tested for equality.
+define void @f2(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f2:
+; CHECK: lhi %r0, 0
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK: clst %r2, %r3
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NEXT: BB#{{[0-9]+}}
+; CHECK-NEXT: je {{\.L.*}}
+; CHECK: br %r14
+ %res = call i32 @strcmp(i8 *%src1, i8 *%src2)
+ %cmp = icmp eq i32 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 0, i32 *%dest
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Test a case where the result is used both as an integer and for
+; branching.
+define i32 @f3(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f3:
+; CHECK: lhi %r0, 0
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK: clst %r2, %r3
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NEXT: BB#{{[0-9]+}}
+; CHECK-NEXT: ipm [[REG:%r[0-5]]]
+; CHECK: srl [[REG]], 28
+; CHECK: rll %r2, [[REG]], 31
+; CHECK: jl {{\.L*}}
+; CHECK: br %r14
+entry:
+ %res = call i32 @strcmp(i8 *%src1, i8 *%src2)
+ %cmp = icmp slt i32 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 0, i32 *%dest
+ br label %exit
+
+exit:
+ ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/strcmp-02.ll b/test/CodeGen/SystemZ/strcmp-02.ll
new file mode 100644
index 0000000..27bd00b
--- /dev/null
+++ b/test/CodeGen/SystemZ/strcmp-02.ll
@@ -0,0 +1,72 @@
+; Test strcmp using CLST, i64 version.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i64 @strcmp(i8 *%src1, i8 *%src2)
+
+; Check a case where the result is used as an integer.
+define i64 @f1(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f1:
+; CHECK: lhi %r0, 0
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK: clst %r2, %r3
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NEXT: BB#{{[0-9]+}}
+; CHECK-NEXT: ipm [[REG:%r[0-5]]]
+; CHECK: srl [[REG]], 28
+; CHECK: rll [[REG]], [[REG]], 31
+; CHECK: lgfr %r2, [[REG]]
+; CHECK: br %r14
+ %res = call i64 @strcmp(i8 *%src1, i8 *%src2)
+ ret i64 %res
+}
+
+; Check a case where the result is tested for equality.
+define void @f2(i8 *%src1, i8 *%src2, i64 *%dest) {
+; CHECK-LABEL: f2:
+; CHECK: lhi %r0, 0
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK: clst %r2, %r3
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NEXT: BB#{{[0-9]+}}
+; CHECK-NEXT: je {{\.L.*}}
+; CHECK: br %r14
+ %res = call i64 @strcmp(i8 *%src1, i8 *%src2)
+ %cmp = icmp eq i64 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i64 0, i64 *%dest
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Test a case where the result is used both as an integer and for
+; branching.
+define i64 @f3(i8 *%src1, i8 *%src2, i64 *%dest) {
+; CHECK-LABEL: f3:
+; CHECK: lhi %r0, 0
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK: clst %r2, %r3
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NEXT: BB#{{[0-9]+}}
+; CHECK-NEXT: ipm [[REG:%r[0-5]]]
+; CHECK: srl [[REG]], 28
+; CHECK: rll [[REG]], [[REG]], 31
+; CHECK: lgfr %r2, [[REG]]
+; CHECK: jl {{\.L*}}
+; CHECK: br %r14
+entry:
+ %res = call i64 @strcmp(i8 *%src1, i8 *%src2)
+ %cmp = icmp slt i64 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i64 0, i64 *%dest
+ br label %exit
+
+exit:
+ ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/strcpy-01.ll b/test/CodeGen/SystemZ/strcpy-01.ll
new file mode 100644
index 0000000..29bab62
--- /dev/null
+++ b/test/CodeGen/SystemZ/strcpy-01.ll
@@ -0,0 +1,50 @@
+; Test strcpy using MVST.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i8 *@strcpy(i8 *%dest, i8 *%src)
+declare i8 *@stpcpy(i8 *%dest, i8 *%src)
+
+; Check strcpy.
+define i8 *@f1(i8 *%dest, i8 *%src) {
+; CHECK-LABEL: f1:
+; CHECK-DAG: lhi %r0, 0
+; CHECK-DAG: lgr [[REG:%r[145]]], %r2
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK-NEXT: mvst [[REG]], %r3
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NOT: %r2
+; CHECK: br %r14
+ %res = call i8 *@strcpy(i8 *%dest, i8 *%src)
+ ret i8 *%res
+}
+
+; Check stpcpy.
+define i8 *@f2(i8 *%dest, i8 *%src) {
+; CHECK-LABEL: f2:
+; CHECK: lhi %r0, 0
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK-NEXT: mvst %r2, %r3
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NOT: %r2
+; CHECK: br %r14
+ %res = call i8 *@stpcpy(i8 *%dest, i8 *%src)
+ ret i8 *%res
+}
+
+; Check correct operation with other loads and stores. The load must
+; come before the loop and the store afterwards.
+define i32 @f3(i32 %dummy, i8 *%dest, i8 *%src, i32 *%resptr, i32 *%storeptr) {
+; CHECK-LABEL: f3:
+; CHECK-DAG: lhi %r0, 0
+; CHECK-DAG: l %r2, 0(%r5)
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK-NEXT: mvst %r3, %r4
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK: mvhi 0(%r6), 0
+; CHECK: br %r14
+ %res = load i32 *%resptr
+ %unused = call i8 *@strcpy(i8 *%dest, i8 *%src)
+ store i32 0, i32 *%storeptr
+ ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/strlen-01.ll b/test/CodeGen/SystemZ/strlen-01.ll
new file mode 100644
index 0000000..16161d4
--- /dev/null
+++ b/test/CodeGen/SystemZ/strlen-01.ll
@@ -0,0 +1,39 @@
+; Test strlen using SRST, i64 version.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i64 @strlen(i8 *%src)
+declare i64 @strnlen(i8 *%src, i64 %len)
+
+; Test strlen with its proper i64 prototype. It would also be valid for
+; the uses of %r3 and REG after the LGR to be swapped.
+define i64 @f1(i32 %dummy, i8 *%src) {
+; CHECK-LABEL: f1:
+; CHECK-DAG: lhi %r0, 0
+; CHECK-DAG: lghi %r2, 0
+; CHECK-DAG: lgr [[REG:%r[145]]], %r3
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK-NEXT: srst %r2, [[REG]]
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NEXT: BB#{{[0-9]+}}
+; CHECK-NEXT: sgr %r2, %r3
+; CHECK: br %r14
+ %res = call i64 @strlen(i8 *%src)
+ ret i64 %res
+}
+
+; Test strnlen with its proper i64 prototype.
+define i64 @f2(i64 %len, i8 *%src) {
+; CHECK-LABEL: f2:
+; CHECK-DAG: agr %r2, %r3
+; CHECK-DAG: lhi %r0, 0
+; CHECK-DAG: lgr [[REG:%r[145]]], %r3
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK-NEXT: srst %r2, [[REG]]
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NEXT: BB#{{[0-9]+}}
+; CHECK-NEXT: sgr %r2, %r3
+; CHECK: br %r14
+ %res = call i64 @strnlen(i8 *%src, i64 %len)
+ ret i64 %res
+}
diff --git a/test/CodeGen/SystemZ/strlen-02.ll b/test/CodeGen/SystemZ/strlen-02.ll
new file mode 100644
index 0000000..e1abbff
--- /dev/null
+++ b/test/CodeGen/SystemZ/strlen-02.ll
@@ -0,0 +1,39 @@
+; Test strlen using SRST, i32 version.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare i32 @strlen(i8 *%src)
+declare i32 @strnlen(i8 *%src, i32 %len)
+
+; Test strlen with an i32-based prototype. It would also be valid for
+; the uses of %r3 and REG after the LGR to be swapped.
+define i32 @f1(i32 %dummy, i8 *%src) {
+; CHECK-LABEL: f1:
+; CHECK-DAG: lhi %r0, 0
+; CHECK-DAG: lghi %r2, 0
+; CHECK-DAG: lgr [[REG:%r[145]]], %r3
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK-NEXT: srst %r2, [[REG]]
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NEXT: BB#{{[0-9]+}}
+; CHECK-NEXT: sgr %r2, %r3
+; CHECK: br %r14
+ %res = call i32 @strlen(i8 *%src)
+ ret i32 %res
+}
+
+; Test strnlen with an i32-based prototype.
+define i32 @f2(i32 zeroext %len, i8 *%src) {
+; CHECK-LABEL: f2:
+; CHECK-DAG: agr %r2, %r3
+; CHECK-DAG: lhi %r0, 0
+; CHECK-DAG: lgr [[REG:%r[145]]], %r3
+; CHECK: [[LABEL:\.[^:]*]]:
+; CHECK-NEXT: srst %r2, [[REG]]
+; CHECK-NEXT: jo [[LABEL]]
+; CHECK-NEXT: BB#{{[0-9]+}}
+; CHECK-NEXT: sgr %r2, %r3
+; CHECK: br %r14
+ %res = call i32 @strnlen(i8 *%src, i32 %len)
+ ret i32 %res
+}
diff --git a/test/CodeGen/SystemZ/unaligned-01.ll b/test/CodeGen/SystemZ/unaligned-01.ll
index 621069d..526a068 100644
--- a/test/CodeGen/SystemZ/unaligned-01.ll
+++ b/test/CodeGen/SystemZ/unaligned-01.ll
@@ -1,7 +1,10 @@
; Check that unaligned accesses are allowed in general. We check the
; few exceptions (like CRL) in their respective test files.
;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; FIXME: -combiner-alias-analysis (the default for SystemZ) stops
+; f1 from being optimized.
+; RUN: llc < %s -mtriple=s390x-linux-gnu -combiner-alias-analysis=false \
+; RUN: | FileCheck %s
; Check that these four byte stores become a single word store.
define void @f1(i8 *%ptr) {
diff --git a/test/CodeGen/SystemZ/xor-08.ll b/test/CodeGen/SystemZ/xor-08.ll
new file mode 100644
index 0000000..8cba41e
--- /dev/null
+++ b/test/CodeGen/SystemZ/xor-08.ll
@@ -0,0 +1,57 @@
+; Test memory-to-memory XORs.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+; Test the simple i8 case.
+define void @f1(i8 *%ptr1) {
+; CHECK-LABEL: f1:
+; CHECK: xc 1(1,%r2), 0(%r2)
+; CHECK: br %r14
+ %ptr2 = getelementptr i8 *%ptr1, i64 1
+ %val = load i8 *%ptr1
+ %old = load i8 *%ptr2
+ %xor = xor i8 %val, %old
+ store i8 %xor, i8 *%ptr2
+ ret void
+}
+
+; Test the simple i16 case.
+define void @f2(i16 *%ptr1) {
+; CHECK-LABEL: f2:
+; CHECK: xc 2(2,%r2), 0(%r2)
+; CHECK: br %r14
+ %ptr2 = getelementptr i16 *%ptr1, i64 1
+ %val = load i16 *%ptr1
+ %old = load i16 *%ptr2
+ %xor = xor i16 %val, %old
+ store i16 %xor, i16 *%ptr2
+ ret void
+}
+
+; Test the simple i32 case.
+define void @f3(i32 *%ptr1) {
+; CHECK-LABEL: f3:
+; CHECK: xc 4(4,%r2), 0(%r2)
+; CHECK: br %r14
+ %ptr2 = getelementptr i32 *%ptr1, i64 1
+ %val = load i32 *%ptr1
+ %old = load i32 *%ptr2
+ %xor = xor i32 %old, %val
+ store i32 %xor, i32 *%ptr2
+ ret void
+}
+
+; Test the i64 case.
+define void @f4(i64 *%ptr1) {
+; CHECK-LABEL: f4:
+; CHECK: xc 8(8,%r2), 0(%r2)
+; CHECK: br %r14
+ %ptr2 = getelementptr i64 *%ptr1, i64 1
+ %val = load i64 *%ptr1
+ %old = load i64 *%ptr2
+ %xor = xor i64 %old, %val
+ store i64 %xor, i64 *%ptr2
+ ret void
+}
+
+; Leave other more complicated tests to and-08.ll.
diff --git a/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll b/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
index f5b3739..b87bf24 100644
--- a/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
+++ b/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
@@ -47,25 +47,26 @@ declare double @sqrt(double) nounwind readonly
declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.cu = !{!5}
+!llvm.module.flags = !{!104}
!0 = metadata !{i32 46, i32 0, metadata !1, null}
!1 = metadata !{i32 524299, metadata !101, metadata !2, i32 44, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
!2 = metadata !{i32 524299, metadata !101, metadata !3, i32 44, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
!3 = metadata !{i32 524334, metadata !101, null, metadata !"getClosestDiagonal3", metadata !"getClosestDiagonal3", metadata !"_Z19getClosestDiagonal3ii", i32 44, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!4 = metadata !{i32 524329, metadata !101} ; [ DW_TAG_file_type ]
!5 = metadata !{i32 524305, metadata !101, i32 4, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 00)", i1 true, metadata !"", i32 0, metadata !102, metadata !102, metadata !103, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!6 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!6 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!7 = metadata !{metadata !8, metadata !22, metadata !22}
-!8 = metadata !{i32 524307, metadata !99, null, metadata !"ggVector3", i32 66, i64 192, i64 32, i64 0, i32 0, null, metadata !10, i32 0, null} ; [ DW_TAG_structure_type ]
+!8 = metadata !{i32 524307, metadata !99, null, metadata !"ggVector3", i32 66, i64 192, i64 32, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [ggVector3] [line 66, size 192, align 32, offset 0] [def] [from ]
!9 = metadata !{i32 524329, metadata !"ggVector3.h", metadata !"/Volumes/Home/grosbaj/sources/llvm-externals/speccpu2000/benchspec/CINT2000/252.eon/src", metadata !5} ; [ DW_TAG_file_type ]
!99 = metadata !{metadata !"ggVector3.h", metadata !"/Volumes/Home/grosbaj/sources/llvm-externals/speccpu2000/benchspec/CINT2000/252.eon/src"}
!10 = metadata !{metadata !11, metadata !16, metadata !23, metadata !26, metadata !29, metadata !30, metadata !35, metadata !36, metadata !37, metadata !41, metadata !42, metadata !43, metadata !46, metadata !47, metadata !48, metadata !52, metadata !53, metadata !54, metadata !57, metadata !60, metadata !63, metadata !66, metadata !70, metadata !71, metadata !74, metadata !75, metadata !76, metadata !77, metadata !78, metadata !81, metadata !82, metadata !83, metadata !84, metadata !85, metadata !88, metadata !89, metadata !90}
!11 = metadata !{i32 524301, metadata !99, metadata !8, metadata !"e", i32 160, i64 192, i64 32, i64 0, i32 0, metadata !12} ; [ DW_TAG_member ]
-!12 = metadata !{i32 524289, metadata !101, metadata !4, metadata !"", i32 0, i64 192, i64 32, i64 0, i32 0, metadata !13, metadata !14, i32 0, null} ; [ DW_TAG_array_type ]
+!12 = metadata !{i32 524289, metadata !101, metadata !4, metadata !"", i32 0, i64 192, i64 32, i64 0, i32 0, metadata !13, metadata !14, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 192, align 32, offset 0] [from double]
!13 = metadata !{i32 524324, metadata !101, metadata !4, metadata !"double", i32 0, i64 64, i64 32, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
!14 = metadata !{metadata !15}
!15 = metadata !{i32 524321, i64 0, i64 3} ; [ DW_TAG_subrange_type ]
!16 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"ggVector3", metadata !"ggVector3", metadata !"", i32 72, metadata !17, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!17 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!17 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !18, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!18 = metadata !{null, metadata !19, metadata !20}
!19 = metadata !{i32 524303, metadata !101, metadata !4, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 64, metadata !8} ; [ DW_TAG_pointer_type ]
!20 = metadata !{i32 524310, metadata !100, null, metadata !"ggBoolean", i32 478, i64 0, i64 0, i64 0, i32 0, metadata !22} ; [ DW_TAG_typedef ]
@@ -73,69 +74,69 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!100 = metadata !{metadata !"math.h", metadata !"/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS4.2.Internal.sdk/usr/include/architecture/arm"}
!22 = metadata !{i32 524324, metadata !101, metadata !4, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
!23 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"ggVector3", metadata !"ggVector3", metadata !"", i32 73, metadata !24, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!24 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !25, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!24 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !25, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!25 = metadata !{null, metadata !19}
!26 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"ggVector3", metadata !"ggVector3", metadata !"", i32 74, metadata !27, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!27 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !28, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!27 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !28, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!28 = metadata !{null, metadata !19, metadata !13, metadata !13, metadata !13}
!29 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"Set", metadata !"Set", metadata !"_ZN9ggVector33SetEddd", i32 81, metadata !27, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!30 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"x", metadata !"x", metadata !"_ZNK9ggVector31xEv", i32 82, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!31 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !32, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!31 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !32, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!32 = metadata !{metadata !13, metadata !33}
!33 = metadata !{i32 524303, metadata !101, metadata !4, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 64, metadata !34} ; [ DW_TAG_pointer_type ]
!34 = metadata !{i32 524326, metadata !101, metadata !4, metadata !"", i32 0, i64 192, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_const_type ]
!35 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"y", metadata !"y", metadata !"_ZNK9ggVector31yEv", i32 83, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!36 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"z", metadata !"z", metadata !"_ZNK9ggVector31zEv", i32 84, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!37 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"x", metadata !"x", metadata !"_ZN9ggVector31xEv", i32 85, metadata !38, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!38 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !39, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!38 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !39, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!39 = metadata !{metadata !40, metadata !19}
!40 = metadata !{i32 524304, metadata !101, metadata !4, metadata !"double", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !13} ; [ DW_TAG_reference_type ]
!41 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"y", metadata !"y", metadata !"_ZN9ggVector31yEv", i32 86, metadata !38, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!42 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"z", metadata !"z", metadata !"_ZN9ggVector31zEv", i32 87, metadata !38, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!43 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"SetX", metadata !"SetX", metadata !"_ZN9ggVector34SetXEd", i32 88, metadata !44, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!44 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !45, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!44 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !45, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!45 = metadata !{null, metadata !19, metadata !13}
!46 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"SetY", metadata !"SetY", metadata !"_ZN9ggVector34SetYEd", i32 89, metadata !44, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!47 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"SetZ", metadata !"SetZ", metadata !"_ZN9ggVector34SetZEd", i32 90, metadata !44, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!48 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"ggVector3", metadata !"ggVector3", metadata !"", i32 92, metadata !49, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!49 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !50, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!49 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !50, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!50 = metadata !{null, metadata !19, metadata !51}
!51 = metadata !{i32 524304, metadata !101, metadata !4, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !34} ; [ DW_TAG_reference_type ]
!52 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"tolerance", metadata !"tolerance", metadata !"_ZNK9ggVector39toleranceEv", i32 100, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!53 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"tolerance", metadata !"tolerance", metadata !"_ZN9ggVector39toleranceEv", i32 101, metadata !38, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!54 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator+", metadata !"operator+", metadata !"_ZNK9ggVector3psEv", i32 107, metadata !55, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!55 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !56, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!55 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !56, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!56 = metadata !{metadata !51, metadata !33}
!57 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator-", metadata !"operator-", metadata !"_ZNK9ggVector3ngEv", i32 108, metadata !58, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!58 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !59, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!58 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !59, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!59 = metadata !{metadata !8, metadata !33}
!60 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator[]", metadata !"operator[]", metadata !"_ZNK9ggVector3ixEi", i32 290, metadata !61, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!61 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !62, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!61 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !62, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!62 = metadata !{metadata !13, metadata !33, metadata !22}
!63 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator[]", metadata !"operator[]", metadata !"_ZN9ggVector3ixEi", i32 278, metadata !64, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!64 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !65, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!64 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !65, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!65 = metadata !{metadata !40, metadata !19, metadata !22}
!66 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator+=", metadata !"operator+=", metadata !"_ZN9ggVector3pLERKS_", i32 303, metadata !67, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!67 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !68, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!67 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !68, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!68 = metadata !{metadata !69, metadata !19, metadata !51}
!69 = metadata !{i32 524304, metadata !101, metadata !4, metadata !"ggVector3", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_reference_type ]
!70 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator-=", metadata !"operator-=", metadata !"_ZN9ggVector3mIERKS_", i32 310, metadata !67, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!71 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator*=", metadata !"operator*=", metadata !"_ZN9ggVector3mLEd", i32 317, metadata !72, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!72 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !73, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!72 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !73, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!73 = metadata !{metadata !69, metadata !19, metadata !13}
!74 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"operator/=", metadata !"operator/=", metadata !"_ZN9ggVector3dVEd", i32 324, metadata !72, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!75 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"length", metadata !"length", metadata !"_ZNK9ggVector36lengthEv", i32 121, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!76 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"squaredLength", metadata !"squaredLength", metadata !"_ZNK9ggVector313squaredLengthEv", i32 122, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!77 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"MakeUnitVector", metadata !"MakeUnitVector", metadata !"_ZN9ggVector314MakeUnitVectorEv", i32 217, metadata !24, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!78 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"Perturb", metadata !"Perturb", metadata !"_ZNK9ggVector37PerturbEdd", i32 126, metadata !79, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!79 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !80, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!79 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !80, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!80 = metadata !{metadata !8, metadata !33, metadata !13, metadata !13}
!81 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"maxComponent", metadata !"maxComponent", metadata !"_ZNK9ggVector312maxComponentEv", i32 128, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!82 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"minComponent", metadata !"minComponent", metadata !"_ZNK9ggVector312minComponentEv", i32 129, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!83 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"maxAbsComponent", metadata !"maxAbsComponent", metadata !"_ZNK9ggVector315maxAbsComponentEv", i32 131, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!84 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"minAbsComponent", metadata !"minAbsComponent", metadata !"_ZNK9ggVector315minAbsComponentEv", i32 132, metadata !31, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!85 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"indexOfMinComponent", metadata !"indexOfMinComponent", metadata !"_ZNK9ggVector319indexOfMinComponentEv", i32 133, metadata !86, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!86 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !87, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!86 = metadata !{i32 524309, metadata !101, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !87, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!87 = metadata !{metadata !22, metadata !33}
!88 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"indexOfMinAbsComponent", metadata !"indexOfMinAbsComponent", metadata !"_ZNK9ggVector322indexOfMinAbsComponentEv", i32 137, metadata !86, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!89 = metadata !{i32 524334, metadata !9, metadata !8, metadata !"indexOfMaxComponent", metadata !"indexOfMaxComponent", metadata !"_ZNK9ggVector319indexOfMaxComponentEv", i32 146, metadata !86, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
@@ -151,3 +152,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!101 = metadata !{metadata !"ggEdgeDiscrepancy.cc", metadata !"/Volumes/Home/grosbaj/sources/llvm-externals/speccpu2000/benchspec/CINT2000/252.eon/src"}
!102 = metadata !{i32 0}
!103 = metadata !{metadata !3}
+!104 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/Thumb/PR17309.ll b/test/CodeGen/Thumb/PR17309.ll
new file mode 100644
index 0000000..b7b08e9
--- /dev/null
+++ b/test/CodeGen/Thumb/PR17309.ll
@@ -0,0 +1,57 @@
+; RUN: llc -mtriple thumbv5-none-linux-gnueabi < %s | FileCheck %s
+
+%struct.C = type { [1000 x i8] }
+%struct.S = type { [1000 x i16] }
+%struct.I = type { [1000 x i32] }
+
+;CHECK-LABEL: pass_C:
+;CHECK-NOT: ldrb r{{[0-9]+}}, [{{.*}}], #1
+;CHECK-NOT: strb r{{[0-9]+}}, [{{.*}}], #1
+define void @pass_C() #0 {
+entry:
+ %c = alloca %struct.C, align 1
+ %0 = getelementptr inbounds %struct.C* %c, i32 0, i32 0, i32 0
+ call void @llvm.lifetime.start(i64 1000, i8* %0) #1
+ call void @use_C(%struct.C* byval %c) #3
+ call void @llvm.lifetime.end(i64 1000, i8* %0) #1
+ ret void
+}
+
+;CHECK-LABEL: pass_S:
+;CHECK-NOT: ldrh r{{[0-9]+}}, [{{.*}}], #2
+;CHECK-NOT: strh r{{[0-9]+}}, [{{.*}}], #2
+define void @pass_S() #0 {
+entry:
+ %s = alloca %struct.S, align 2
+ %0 = bitcast %struct.S* %s to i8*
+ call void @llvm.lifetime.start(i64 2000, i8* %0) #1
+ call void @use_S(%struct.S* byval %s) #3
+ call void @llvm.lifetime.end(i64 2000, i8* %0) #1
+ ret void
+}
+
+;CHECK-LABEL: pass_I:
+;CHECK-NOT: ldr r{{[0-9]+}}, [{{.*}}], #4
+;CHECK-NOT: str r{{[0-9]+}}, [{{.*}}], #4
+define void @pass_I() #0 {
+entry:
+ %i = alloca %struct.I, align 4
+ %0 = bitcast %struct.I* %i to i8*
+ call void @llvm.lifetime.start(i64 4000, i8* %0) #1
+ call void @use_I(%struct.I* byval %i) #3
+ call void @llvm.lifetime.end(i64 4000, i8* %0) #1
+ ret void
+}
+
+declare void @use_C(%struct.C* byval) #2
+declare void @use_S(%struct.S* byval) #2
+declare void @use_I(%struct.I* byval) #2
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
+
+attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+attributes #2 = { optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind optsize }
diff --git a/test/CodeGen/Thumb/barrier.ll b/test/CodeGen/Thumb/barrier.ll
index 8fca273..1c27fa0 100644
--- a/test/CodeGen/Thumb/barrier.ll
+++ b/test/CodeGen/Thumb/barrier.ll
@@ -7,7 +7,7 @@ define void @t1() {
; V6: blx {{_*}}sync_synchronize
; V6M-LABEL: t1:
-; V6M: dmb ish
+; V6M: dmb sy
fence seq_cst
ret void
}
diff --git a/test/CodeGen/Thumb/lit.local.cfg b/test/CodeGen/Thumb/lit.local.cfg
index 4d75f58..8a3ba96 100644
--- a/test/CodeGen/Thumb/lit.local.cfg
+++ b/test/CodeGen/Thumb/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.test']
-
targets = set(config.root.targets_to_build.split())
if not 'ARM' in targets:
config.unsupported = True
diff --git a/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll b/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll
index 486c064..1b8bdb1 100644
--- a/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll
+++ b/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll
@@ -8,10 +8,10 @@ define void @t() nounwind ssp {
entry:
; CHECK-LABEL: t:
%size = mul i32 8, 2
-; CHECK: subs r0, #16
+; CHECK: sub.w r0, sp, #16
; CHECK: mov sp, r0
%vla_a = alloca i8, i32 %size, align 8
-; CHECK: subs r0, #16
+; CHECK: sub.w r0, sp, #16
; CHECK: mov sp, r0
%vla_b = alloca i8, i32 %size, align 8
unreachable
diff --git a/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll b/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll
index 244d0bb..810bfb7 100644
--- a/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll
+++ b/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll
@@ -40,7 +40,7 @@ entry:
; CHECK: pop
; CHECK: pop
; Do not convert into single stream code. BranchProbability Analysis assumes
-; that branches which goes to "ret" intruction have lower probabilities.
+; that branches which go to a "ret" instruction have lower probabilities.
switch i32 undef, label %bb7 [
i32 37, label %bb43
i32 48, label %bb5
diff --git a/test/CodeGen/Thumb2/lit.local.cfg b/test/CodeGen/Thumb2/lit.local.cfg
index cb77b09..8a3ba96 100644
--- a/test/CodeGen/Thumb2/lit.local.cfg
+++ b/test/CodeGen/Thumb2/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
targets = set(config.root.targets_to_build.split())
if not 'ARM' in targets:
config.unsupported = True
diff --git a/test/CodeGen/Thumb2/tail-call-r9.ll b/test/CodeGen/Thumb2/tail-call-r9.ll
new file mode 100644
index 0000000..24c76c9
--- /dev/null
+++ b/test/CodeGen/Thumb2/tail-call-r9.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-m3 | FileCheck %s
+
+@foo = common global void ()* null, align 4
+
+; Make sure in the presence of a tail call, r9 doesn't get used to hold
+; the destination address. It's callee-saved in AAPCS.
+define arm_aapcscc void @test(i32 %a) nounwind {
+; CHECK-LABEL: test:
+; CHECK-NOT: bx r9
+ %tmp = load void ()** @foo, align 4
+ tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r12}"() nounwind
+ tail call arm_aapcscc void %tmp() nounwind
+ ret void
+}
diff --git a/test/CodeGen/Thumb2/thumb2-ifcvt1.ll b/test/CodeGen/Thumb2/thumb2-ifcvt1.ll
index 85943cf..13a1ca2 100644
--- a/test/CodeGen/Thumb2/thumb2-ifcvt1.ll
+++ b/test/CodeGen/Thumb2/thumb2-ifcvt1.ll
@@ -1,5 +1,6 @@
; RUN: llc < %s -mtriple=thumbv7-apple-darwin | FileCheck %s
-
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -arm-default-it | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv8 -arm-no-restrict-it |FileCheck %s
define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
; CHECK-LABEL: t1:
; CHECK: ittt ne
@@ -74,7 +75,7 @@ entry:
; CHECK-LABEL: t3:
; CHECK: itt ge
; CHECK: movge r0, r1
-; CHECK: blge _foo
+; CHECK: blge {{_?}}foo
%tmp1 = icmp sgt i32 %a, 10 ; <i1> [#uses=1]
br i1 %tmp1, label %cond_true, label %UnifiedReturnBlock
diff --git a/test/CodeGen/Thumb2/thumb2-ifcvt2.ll b/test/CodeGen/Thumb2/thumb2-ifcvt2.ll
index 788fa06..403cd48 100644
--- a/test/CodeGen/Thumb2/thumb2-ifcvt2.ll
+++ b/test/CodeGen/Thumb2/thumb2-ifcvt2.ll
@@ -1,4 +1,6 @@
; RUN: llc < %s -mtriple=thumbv7-apple-ios | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-default-it | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv8-apple-ios -arm-no-restrict-it | FileCheck %s
define void @foo(i32 %X, i32 %Y) {
entry:
diff --git a/test/CodeGen/Thumb2/thumb2-ifcvt3.ll b/test/CodeGen/Thumb2/thumb2-ifcvt3.ll
index bcf10ef..a71aa3f 100644
--- a/test/CodeGen/Thumb2/thumb2-ifcvt3.ll
+++ b/test/CodeGen/Thumb2/thumb2-ifcvt3.ll
@@ -1,4 +1,6 @@
; RUN: llc < %s -mtriple=thumbv7-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -arm-default-it | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv8-apple-darwin -arm-no-restrict-it | FileCheck %s
; There shouldn't be an unconditional branch at the end of bb52.
; rdar://7184787
diff --git a/test/CodeGen/Thumb2/thumb2-select.ll b/test/CodeGen/Thumb2/thumb2-select.ll
index 0feaf95..5f5fa19 100644
--- a/test/CodeGen/Thumb2/thumb2-select.ll
+++ b/test/CodeGen/Thumb2/thumb2-select.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 -show-mc-encoding | FileCheck %s
define i32 @f1(i32 %a.s) {
entry:
@@ -66,7 +66,7 @@ define i32 @f7(i32 %a, i32 %b, i32 %c) {
entry:
; CHECK-LABEL: f7:
; CHECK: it hi
-; CHECK: lsrhi.w
+; CHECK: lsrhi {{r[0-9]+}}
%tmp1 = icmp ugt i32 %a, %b
%tmp2 = udiv i32 %c, 3
%tmp3 = select i1 %tmp1, i32 %tmp2, i32 3
@@ -77,7 +77,7 @@ define i32 @f8(i32 %a, i32 %b, i32 %c) {
entry:
; CHECK-LABEL: f8:
; CHECK: it lo
-; CHECK: lsllo.w
+; CHECK: lsllo {{r[0-9]+}}
%tmp1 = icmp ult i32 %a, %b
%tmp2 = mul i32 %c, 4
%tmp3 = select i1 %tmp1, i32 %tmp2, i32 3
@@ -96,3 +96,20 @@ entry:
%tmp5 = select i1 %tmp1, i32 %tmp4, i32 3
ret i32 %tmp5
}
+
+define i32 @f10(i32 %a, i32 %b) {
+; CHECK-LABEL: f10:
+; CHECK: movwne {{r[0-9]+}}, #1234 @ encoding: [0x40,0xf2,0xd2,0x4{{[0-9a-f]+}}]
+ %tst = icmp ne i32 %a, %b
+ %val = select i1 %tst, i32 1234, i32 12345
+ ret i32 %val
+}
+
+; Make sure we pick the Thumb encoding for movw/movt
+define i32 @f11(i32 %a, i32 %b) {
+; CHECK-LABEL: f11:
+; CHECK: movwne {{r[0-9]+}}, #50033 @ encoding: [0x4c,0xf2,0x71,0x3{{[0-9a-f]+}}]
+ %tst = icmp ne i32 %a, %b
+ %val = select i1 %tst, i32 123454321, i32 543212345
+ ret i32 %val
+}
diff --git a/test/CodeGen/Thumb2/v8_IT_1.ll b/test/CodeGen/Thumb2/v8_IT_1.ll
new file mode 100644
index 0000000..30dbb48
--- /dev/null
+++ b/test/CodeGen/Thumb2/v8_IT_1.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -mtriple=thumbv8 -mattr=+neon | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7 -mattr=+neon -arm-restrict-it | FileCheck %s
+
+;CHECK-LABEL: select_s_v_v:
+;CHECK-NOT: it
+;CHECK: bx
+define <16 x i8> @select_s_v_v(i32 %avail, i8* %bar) {
+entry:
+ %vld1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %bar, i32 1)
+ %and = and i32 %avail, 1
+ %tobool = icmp eq i32 %and, 0
+ %vld1. = select i1 %tobool, <16 x i8> %vld1, <16 x i8> zeroinitializer
+ ret <16 x i8> %vld1.
+}
+
+declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* , i32 )
+
diff --git a/test/CodeGen/Thumb2/v8_IT_2.ll b/test/CodeGen/Thumb2/v8_IT_2.ll
new file mode 100644
index 0000000..170b413
--- /dev/null
+++ b/test/CodeGen/Thumb2/v8_IT_2.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=thumbv8 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7 -arm-restrict-it | FileCheck %s
+
+ %struct.quad_struct = type { i32, i32, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct*, %struct.quad_struct* }
+
+define fastcc i32 @CountTree(%struct.quad_struct* %tree) {
+entry:
+; CHECK-LABEL: CountTree:
+; CHECK: bne
+; CHECK: cmp
+; CHECK: it eq
+; CHECK: cmpeq
+; CHECK: bne
+; CHECK: mov
+; CHECK: pop
+ br label %tailrecurse
+
+tailrecurse: ; preds = %bb, %entry
+ %tmp6 = load %struct.quad_struct** null ; <%struct.quad_struct*> [#uses=1]
+ %tmp9 = load %struct.quad_struct** null ; <%struct.quad_struct*> [#uses=2]
+ %tmp12 = load %struct.quad_struct** null ; <%struct.quad_struct*> [#uses=1]
+ %tmp14 = icmp eq %struct.quad_struct* null, null ; <i1> [#uses=1]
+ %tmp17 = icmp eq %struct.quad_struct* %tmp6, null ; <i1> [#uses=1]
+ %tmp23 = icmp eq %struct.quad_struct* %tmp9, null ; <i1> [#uses=1]
+ %tmp29 = icmp eq %struct.quad_struct* %tmp12, null ; <i1> [#uses=1]
+ %bothcond = and i1 %tmp17, %tmp14 ; <i1> [#uses=1]
+ %bothcond1 = and i1 %bothcond, %tmp23 ; <i1> [#uses=1]
+ %bothcond2 = and i1 %bothcond1, %tmp29 ; <i1> [#uses=1]
+ br i1 %bothcond2, label %return, label %bb
+
+bb: ; preds = %tailrecurse
+ %tmp41 = tail call fastcc i32 @CountTree( %struct.quad_struct* %tmp9 ) ; <i32> [#uses=0]
+ br label %tailrecurse
+
+return: ; preds = %tailrecurse
+ ret i32 0
+}
+
diff --git a/test/CodeGen/Thumb2/v8_IT_3.ll b/test/CodeGen/Thumb2/v8_IT_3.ll
new file mode 100644
index 0000000..4dca246
--- /dev/null
+++ b/test/CodeGen/Thumb2/v8_IT_3.ll
@@ -0,0 +1,77 @@
+; RUN: llc < %s -mtriple=thumbv8 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7 -arm-restrict-it | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv8 -relocation-model=pic | FileCheck %s --check-prefix=CHECK-PIC
+; RUN: llc < %s -mtriple=thumbv7 -arm-restrict-it -relocation-model=pic | FileCheck %s --check-prefix=CHECK-PIC
+
+%struct.FF = type { i32 (i32*)*, i32 (i32*, i32*, i32, i32, i32, i32)*, i32 (i32, i32, i8*)*, void ()*, i32 (i32, i8*, i32*)*, i32 ()* }
+%struct.BD = type { %struct.BD*, i32, i32, i32, i32, i64, i32 (%struct.BD*, i8*, i64, i32)*, i32 (%struct.BD*, i8*, i32, i32)*, i32 (%struct.BD*, i8*, i64, i32)*, i32 (%struct.BD*, i8*, i32, i32)*, i32 (%struct.BD*, i64, i32)*, [16 x i8], i64, i64 }
+
+@FuncPtr = external hidden unnamed_addr global %struct.FF*
+@.str1 = external hidden unnamed_addr constant [6 x i8], align 4
+@G = external unnamed_addr global i32
+@.str2 = external hidden unnamed_addr constant [58 x i8], align 4
+@.str3 = external hidden unnamed_addr constant [58 x i8], align 4
+
+define i32 @test() nounwind optsize ssp {
+entry:
+; CHECK-LABEL: test:
+; CHECK: push
+; CHECK-NOT: push
+ %block_size = alloca i32, align 4
+ %block_count = alloca i32, align 4
+ %index_cache = alloca i32, align 4
+ store i32 0, i32* %index_cache, align 4
+ %tmp = load i32* @G, align 4
+ %tmp1 = call i32 @bar(i32 0, i32 0, i32 %tmp) nounwind
+ switch i32 %tmp1, label %bb8 [
+ i32 0, label %bb
+ i32 536870913, label %bb4
+ i32 536870914, label %bb6
+ ]
+
+bb:
+ %tmp2 = load i32* @G, align 4
+ %tmp4 = icmp eq i32 %tmp2, 0
+ br i1 %tmp4, label %bb1, label %bb8
+
+bb1:
+; CHECK: %bb6
+; CHECK: it eq
+; CHECK-NEXT: ldreq
+; CHECK-NEXT: it eq
+; CHECK-NEXT: cmpeq
+; CHECK: %bb1
+ %tmp5 = load i32* %block_size, align 4
+ %tmp6 = load i32* %block_count, align 4
+ %tmp7 = call %struct.FF* @Get() nounwind
+ store %struct.FF* %tmp7, %struct.FF** @FuncPtr, align 4
+ %tmp10 = zext i32 %tmp6 to i64
+ %tmp11 = zext i32 %tmp5 to i64
+ %tmp12 = mul nsw i64 %tmp10, %tmp11
+ %tmp13 = call i32 @foo(i8* getelementptr inbounds ([6 x i8]* @.str1, i32 0, i32 0), i64 %tmp12, i32 %tmp5) nounwind
+ br label %bb8
+
+bb4:
+; CHECK-PIC: cmp
+; CHECK-PIC: cmp
+; CHECK-PIC-NEXT: bne
+; CHECK-PIC-NEXT: %bb4
+; CHECK-PIC-NEXT: movs
+; CHECK-PIC-NEXT: add
+; CHECK-PIC-NEXT: pop
+ ret i32 0
+
+bb6:
+ ret i32 1
+
+bb8:
+ ret i32 -1
+}
+
+declare i32 @printf(i8*, ...)
+
+declare %struct.FF* @Get()
+
+declare i32 @foo(i8*, i64, i32)
+
+declare i32 @bar(i32, i32, i32)
diff --git a/test/CodeGen/Thumb2/v8_IT_4.ll b/test/CodeGen/Thumb2/v8_IT_4.ll
new file mode 100644
index 0000000..5a80d8c
--- /dev/null
+++ b/test/CodeGen/Thumb2/v8_IT_4.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -mtriple=thumbv8-eabi -float-abi=hard | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-eabi -float-abi=hard -arm-restrict-it | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv8-eabi -float-abi=hard -regalloc=basic | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-eabi -float-abi=hard -regalloc=basic -arm-restrict-it | FileCheck %s
+
+%"struct.__gnu_cxx::__normal_iterator<char*,std::basic_string<char, std::char_traits<char>, std::allocator<char> > >" = type { i8* }
+%"struct.__gnu_cxx::new_allocator<char>" = type <{ i8 }>
+%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >" = type { %"struct.__gnu_cxx::__normal_iterator<char*,std::basic_string<char, std::char_traits<char>, std::allocator<char> > >" }
+%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >::_Rep" = type { %"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >::_Rep_base" }
+%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >::_Rep_base" = type { i32, i32, i32 }
+
+
+define weak arm_aapcs_vfpcc i32 @_ZNKSs7compareERKSs(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %this, %"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %__str) {
+; CHECK-LABEL: _ZNKSs7compareERKSs:
+; CHECK: cbnz r0,
+; CHECK-NEXT: %bb
+; CHECK-NEXT: sub{{(.w)?}} r0, r{{[0-9]+}}, r{{[0-9]+}}
+; CHECK-NEXT: %bb1
+; CHECK-NEXT: pop.w
+entry:
+ %0 = tail call arm_aapcs_vfpcc i32 @_ZNKSs4sizeEv(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %this) ; <i32> [#uses=3]
+ %1 = tail call arm_aapcs_vfpcc i32 @_ZNKSs4sizeEv(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %__str) ; <i32> [#uses=3]
+ %2 = icmp ult i32 %1, %0 ; <i1> [#uses=1]
+ %3 = select i1 %2, i32 %1, i32 %0 ; <i32> [#uses=1]
+ %4 = tail call arm_aapcs_vfpcc i8* @_ZNKSs7_M_dataEv(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %this) ; <i8*> [#uses=1]
+ %5 = tail call arm_aapcs_vfpcc i8* @_ZNKSs4dataEv(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %__str) ; <i8*> [#uses=1]
+ %6 = tail call arm_aapcs_vfpcc i32 @memcmp(i8* %4, i8* %5, i32 %3) nounwind readonly ; <i32> [#uses=2]
+ %7 = icmp eq i32 %6, 0 ; <i1> [#uses=1]
+ br i1 %7, label %bb, label %bb1
+
+bb: ; preds = %entry
+ %8 = sub i32 %0, %1 ; <i32> [#uses=1]
+ ret i32 %8
+
+bb1: ; preds = %entry
+ ret i32 %6
+}
+
+declare arm_aapcs_vfpcc i32 @memcmp(i8* nocapture, i8* nocapture, i32) nounwind readonly
+
+declare arm_aapcs_vfpcc i32 @_ZNKSs4sizeEv(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %this)
+
+declare arm_aapcs_vfpcc i8* @_ZNKSs7_M_dataEv(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %this)
+
+declare arm_aapcs_vfpcc i8* @_ZNKSs4dataEv(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %this)
diff --git a/test/CodeGen/Thumb2/v8_IT_5.ll b/test/CodeGen/Thumb2/v8_IT_5.ll
new file mode 100644
index 0000000..30250c8
--- /dev/null
+++ b/test/CodeGen/Thumb2/v8_IT_5.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -mtriple=thumbv8 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7 -arm-restrict-it | FileCheck %s
+; CHECK: it ne
+; CHECK-NEXT: cmpne
+; CHECK-NEXT: beq
+; CHECK: cmp
+; CHECK-NEXT: beq
+; CHECK-NEXT: %if.else163
+; CHECK-NEXT: mov.w
+; CHECK-NEXT: b
+; CHECK-NEXT: %if.else145
+; CHECK-NEXT: mov.w
+
+%struct.hc = type { i32, i32, i32, i32 }
+
+define i32 @t(i32 %type) optsize {
+entry:
+ br i1 undef, label %if.then, label %if.else
+
+if.then:
+ unreachable
+
+if.else:
+ br i1 undef, label %if.then15, label %if.else18
+
+if.then15:
+ unreachable
+
+if.else18:
+ switch i32 %type, label %if.else173 [
+ i32 3, label %if.then115
+ i32 1, label %if.then102
+ ]
+
+if.then102:
+ br i1 undef, label %cond.true10.i, label %t.exit
+
+cond.true10.i:
+ br label %t.exit
+
+t.exit:
+ unreachable
+
+if.then115:
+ br i1 undef, label %if.else163, label %if.else145
+
+if.else145:
+ %call150 = call fastcc %struct.hc* @foo(%struct.hc* undef, i32 34865152) optsize
+ br label %while.body172
+
+if.else163:
+ %call168 = call fastcc %struct.hc* @foo(%struct.hc* undef, i32 34078720) optsize
+ br label %while.body172
+
+while.body172:
+ br label %while.body172
+
+if.else173:
+ ret i32 -1
+}
+
+declare hidden fastcc %struct.hc* @foo(%struct.hc* nocapture, i32) nounwind optsize
+
diff --git a/test/CodeGen/X86/2006-05-02-InstrSched1.ll b/test/CodeGen/X86/2006-05-02-InstrSched1.ll
index 0afddd8..69266dc 100644
--- a/test/CodeGen/X86/2006-05-02-InstrSched1.ll
+++ b/test/CodeGen/X86/2006-05-02-InstrSched1.ll
@@ -1,7 +1,10 @@
; REQUIRES: asserts
; RUN: llc < %s -march=x86 -relocation-model=static -stats 2>&1 | \
-; RUN: grep asm-printer | grep 14
+; RUN: grep asm-printer | grep 16
;
+; It's possible to schedule this in 14 instructions by avoiding
+; callee-save registers, but the scheduler isn't currently that
+; conservative with registers.
@size20 = external global i32 ; <i32*> [#uses=1]
@in5 = external global i8* ; <i8**> [#uses=1]
@@ -21,4 +24,3 @@ define i32 @compare(i8* %a, i8* %b) nounwind {
}
declare i32 @memcmp(i8*, i8*, i32)
-
diff --git a/test/CodeGen/X86/2007-01-08-InstrSched.ll b/test/CodeGen/X86/2007-01-08-InstrSched.ll
index 24aa5b9..4ec7039 100644
--- a/test/CodeGen/X86/2007-01-08-InstrSched.ll
+++ b/test/CodeGen/X86/2007-01-08-InstrSched.ll
@@ -13,10 +13,10 @@ define float @foo(float %x) nounwind {
; CHECK: mulss
; CHECK: mulss
-; CHECK: addss
; CHECK: mulss
-; CHECK: addss
; CHECK: mulss
; CHECK: addss
+; CHECK: addss
+; CHECK: addss
; CHECK: ret
}
diff --git a/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll b/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll
index c5d2a46..638d399 100644
--- a/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll
+++ b/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll
@@ -1,4 +1,13 @@
-; RUN: llc < %s -march=x86 | grep weak | count 2
+; RUN: llc < %s -mtriple=i686-pc-linux-gnu | FileCheck %s
+
@__gthrw_pthread_once = alias weak i32 (i32*, void ()*)* @pthread_once ; <i32 (i32*, void ()*)*> [#uses=0]
-declare extern_weak i32 @pthread_once(i32*, void ()*)
+define weak i32 @pthread_once(i32*, void ()*) {
+ ret i32 0
+}
+
+; CHECK: .weak pthread_once
+; CHECK: pthread_once:
+
+; CHECK: .weak __gthrw_pthread_once
+; CHECK: __gthrw_pthread_once = pthread_once
diff --git a/test/CodeGen/X86/2007-10-12-SpillerUnfold2.ll b/test/CodeGen/X86/2007-10-12-SpillerUnfold2.ll
index 7a3d72d..1ec9c70 100644
--- a/test/CodeGen/X86/2007-10-12-SpillerUnfold2.ll
+++ b/test/CodeGen/X86/2007-10-12-SpillerUnfold2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep sarl | not grep esp
+; RUN: llc < %s -march=x86 -mcpu=corei7 | grep sarl | not grep esp
define signext i16 @t(i16* %qmatrix, i16* %dct, i16* %acBaseTable, i16* %acExtTable, i16 signext %acBaseRes, i16 signext %acMaskRes, i16 signext %acExtRes, i32* %bitptr, i32* %source, i32 %markerPrefix, i8** %byteptr, i32 %scale, i32 %round, i32 %bits) {
entry:
diff --git a/test/CodeGen/X86/2008-03-14-SpillerCrash.ll b/test/CodeGen/X86/2008-03-14-SpillerCrash.ll
index 8946415..18b3714 100644
--- a/test/CodeGen/X86/2008-03-14-SpillerCrash.ll
+++ b/test/CodeGen/X86/2008-03-14-SpillerCrash.ll
@@ -45,4 +45,6 @@ bb383: ; preds = %bb374.us, %bb311.split
ret i64 0
}
-declare i64 @__wcstoll_l(i32*, i32**, i32, %struct.__locale_struct*) nounwind
+define i64 @__wcstoll_l(i32*, i32**, i32, %struct.__locale_struct*) nounwind {
+ ret i64 0
+}
diff --git a/test/CodeGen/X86/2008-04-24-pblendw-fold-crash.ll b/test/CodeGen/X86/2008-04-24-pblendw-fold-crash.ll
index 4eaca17..86bce8e 100644
--- a/test/CodeGen/X86/2008-04-24-pblendw-fold-crash.ll
+++ b/test/CodeGen/X86/2008-04-24-pblendw-fold-crash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mattr=+sse41
+; RUN: llc < %s -mattr=+sse4.1
; rdar://5886601
; gcc testsuite: gcc.target/i386/sse4_1-pblendw.c
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
index ecd8663..296f0ca 100644
--- a/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
+++ b/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
@@ -67,17 +67,16 @@ declare i64 @strlen(i8*) nounwind readonly
declare void @llvm.stackrestore(i8*) nounwind
!0 = metadata !{i32 459009, metadata !1, metadata !"s1", metadata !2, i32 2, metadata !6} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 458798, i32 0, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 2, metadata !3, i1 false, i1 true,
- i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 458798, i32 0, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!2 = metadata !{i32 458769, metadata !17, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !18, metadata !18, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 458773, null, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 458773, null, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!4 = metadata !{metadata !5, metadata !6}
!5 = metadata !{i32 458788, null, metadata !2, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
!6 = metadata !{i32 458767, null, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !5} ; [ DW_TAG_pointer_type ]
!7 = metadata !{i32 2, i32 0, metadata !1, null}
!8 = metadata !{i32 459008, metadata !1, metadata !"str.0", metadata !2, i32 3, metadata !9} ; [ DW_TAG_auto_variable ]
!9 = metadata !{i32 458767, null, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !10} ; [ DW_TAG_pointer_type ]
-!10 = metadata !{i32 458753, null, metadata !2, metadata !"", i32 0, i64 8, i64 8, i64 0, i32 0, metadata !5, metadata !11, i32 0, null} ; [ DW_TAG_array_type ]
+!10 = metadata !{i32 458753, null, metadata !2, metadata !"", i32 0, i64 8, i64 8, i64 0, i32 0, metadata !5, metadata !11, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 8, align 8, offset 0] [from char]
!11 = metadata !{metadata !12}
!12 = metadata !{i32 458785, i64 0, i64 1} ; [ DW_TAG_subrange_type ]
!13 = metadata !{i32 3, i32 0, metadata !14, null}
diff --git a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
index 8174fbd..764c2cd 100644
--- a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
+++ b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll
@@ -1,6 +1,6 @@
; REQUIRES: asserts
-; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse41 -mcpu=penryn -stats 2>&1 | grep "4 machine-licm"
-; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse41 -mcpu=penryn | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "4 machine-licm"
+; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn | FileCheck %s
; rdar://6627786
; rdar://7792037
@@ -17,9 +17,9 @@ bb4: ; preds = %bb.i, %bb26, %bb4, %entry
; CHECK: %bb4
; CHECK: xorl
; CHECK: callq
-; CHECK: movq
; CHECK: xorl
; CHECK: xorl
+; CHECK: movq
%0 = call i32 (...)* @xxGetOffsetForCode(i32 undef) nounwind ; <i32> [#uses=0]
%ins = or i64 %p, 2097152 ; <i64> [#uses=1]
diff --git a/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll b/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
index 5cb05e8..e1930e0 100644
--- a/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
+++ b/test/CodeGen/X86/2009-04-21-NoReloadImpDef.ll
@@ -1,5 +1,5 @@
; RUN: llc -mtriple=i386-apple-darwin10.0 -relocation-model=pic -asm-verbose=false \
-; RUN: -mcpu=generic -disable-fp-elim -mattr=-sse41,-sse3,+sse2 -post-RA-scheduler=false -regalloc=basic < %s | \
+; RUN: -mcpu=generic -disable-fp-elim -mattr=-sse4.1,-sse3,+sse2 -post-RA-scheduler=false -regalloc=basic < %s | \
; RUN: FileCheck %s
; rdar://6808032
diff --git a/test/CodeGen/X86/2009-10-16-Scope.ll b/test/CodeGen/X86/2009-10-16-Scope.ll
index ae2e9ac..a936edc 100644
--- a/test/CodeGen/X86/2009-10-16-Scope.ll
+++ b/test/CodeGen/X86/2009-10-16-Scope.ll
@@ -24,8 +24,7 @@ declare i32 @foo(i32) ssp
!0 = metadata !{i32 5, i32 2, metadata !1, null}
!1 = metadata !{i32 458763, null, metadata !2, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ]
-!2 = metadata !{i32 458798, i32 0, metadata !3, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, null, i1 false, i1 true,
- i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!2 = metadata !{i32 458798, i32 0, metadata !3, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, null, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!3 = metadata !{i32 458769, metadata !8, i32 12, metadata !"clang 1.1", i1 true, metadata !"", i32 0, null, metadata !9, null, null, null, metadata !""}; [DW_TAG_compile_unit ]
!4 = metadata !{i32 459008, metadata !5, metadata !"count_", metadata !3, i32 5, metadata !6}; [ DW_TAG_auto_variable ]
!5 = metadata !{i32 458763, null, metadata !1, i32 1, i32 1, i32 0}; [DW_TAG_lexical_block ]
diff --git a/test/CodeGen/X86/2010-01-18-DbgValue.ll b/test/CodeGen/X86/2010-01-18-DbgValue.ll
index c54f030..f99e682 100644
--- a/test/CodeGen/X86/2010-01-18-DbgValue.ll
+++ b/test/CodeGen/X86/2010-01-18-DbgValue.ll
@@ -29,18 +29,19 @@ return: ; preds = %entry
declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
!llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!21}
!0 = metadata !{i32 786689, metadata !1, metadata !"my_r0", metadata !2, i32 11, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
!1 = metadata !{i32 786478, metadata !19, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 11, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, double (%struct.Rect*)* @foo, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
!2 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
!3 = metadata !{i32 786449, metadata !19, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !20, metadata !20, metadata !18, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !19, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{i32 786453, metadata !19, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!5 = metadata !{metadata !6, metadata !7}
!6 = metadata !{i32 786468, metadata !19, metadata !2, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Rect", i32 6, i64 256, i64 64, i64 0, i32 0, null, metadata !8, i32 0, null} ; [ DW_TAG_structure_type ]
+!7 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Rect", i32 6, i64 256, i64 64, i64 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Rect] [line 6, size 256, align 64, offset 0] [def] [from ]
!8 = metadata !{metadata !9, metadata !14}
!9 = metadata !{i32 786445, metadata !19, metadata !7, metadata !"P1", i32 7, i64 128, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_member ]
-!10 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Pt", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !11, i32 0, null} ; [ DW_TAG_structure_type ]
+!10 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Pt", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [Pt] [line 1, size 128, align 64, offset 0] [def] [from ]
!11 = metadata !{metadata !12, metadata !13}
!12 = metadata !{i32 786445, metadata !19, metadata !10, metadata !"x", i32 2, i64 64, i64 64, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ]
!13 = metadata !{i32 786445, metadata !19, metadata !10, metadata !"y", i32 3, i64 64, i64 64, i64 64, i32 0, metadata !6} ; [ DW_TAG_member ]
@@ -51,3 +52,4 @@ declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
!18 = metadata !{metadata !1}
!19 = metadata !{metadata !"b2.c", metadata !"/tmp/"}
!20 = metadata !{i32 0}
+!21 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
index 71c7b65..4d4e8c1 100644
--- a/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
+++ b/test/CodeGen/X86/2010-02-01-DbgValueCrash.ll
@@ -18,7 +18,7 @@ declare void @"tart.reflect.ComplexType.create->tart.core.Object"(%tart.reflect.
!0 = metadata !{i32 458769, metadata !15, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !16, metadata !16, null, null, null, i32 0} ; [ DW_TAG_compile_unit ]
!1 = metadata !{i32 458790, metadata !15, metadata !0, metadata !"", i32 0, i64 192, i64 64, i64 0, i32 0, metadata !2} ; [ DW_TAG_const_type ]
-!2 = metadata !{i32 458771, metadata !15, metadata !0, metadata !"C", i32 1, i64 192, i64 64, i64 0, i32 0, null, metadata !3, i32 0, null} ; [ DW_TAG_structure_type ]
+!2 = metadata !{i32 458771, metadata !15, metadata !0, metadata !"C", i32 1, i64 192, i64 64, i64 0, i32 0, null, metadata !3, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 1, size 192, align 64, offset 0] [def] [from ]
!3 = metadata !{metadata !4, metadata !6, metadata !7}
!4 = metadata !{i32 458765, metadata !15, metadata !2, metadata !"x", i32 1, i64 64, i64 64, i64 0, i32 0, metadata !5} ; [ DW_TAG_member ]
!5 = metadata !{i32 458788, metadata !15, metadata !0, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
@@ -27,7 +27,7 @@ declare void @"tart.reflect.ComplexType.create->tart.core.Object"(%tart.reflect.
!8 = metadata !{i32 459008, metadata !9, metadata !"t", metadata !0, i32 5, metadata !2} ; [ DW_TAG_auto_variable ]
!9 = metadata !{i32 458763, null, metadata !10, i32 0, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
!10 = metadata !{i32 458798, i32 0, metadata !0, metadata !"foo", metadata !"foo", metadata !"foo", i32 4, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 458773, metadata !15, metadata !0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!11 = metadata !{i32 458773, metadata !15, metadata !0, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!12 = metadata !{metadata !13}
!13 = metadata !{i32 458788, metadata !15, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
!14 = metadata !{%tart.reflect.ComplexType* @.type.SwitchStmtTest}
diff --git a/test/CodeGen/X86/2010-02-19-TailCallRetAddrBug.ll b/test/CodeGen/X86/2010-02-19-TailCallRetAddrBug.ll
index d4a74c9..060c535 100644
--- a/test/CodeGen/X86/2010-02-19-TailCallRetAddrBug.ll
+++ b/test/CodeGen/X86/2010-02-19-TailCallRetAddrBug.ll
@@ -1,9 +1,9 @@
-; RUN: llc -mcpu=generic -mtriple=i386-apple-darwin -tailcallopt < %s | FileCheck %s
+; RUN: llc -mcpu=generic -mtriple=i386-apple-darwin -tailcallopt -enable-misched=false < %s | FileCheck %s
; Check that lowered arguments do not overwrite the return address before it is moved.
; Bug 6225
;
; If a call is a fastcc tail call and tail call optimization is enabled, the
-; caller frame is replaced by the callee frame. This can require that arguments are
+; caller frame is replaced by the callee frame. This can require that arguments are
; placed on the former return address stack slot. Special care needs to be
; taken that the return address is moved or stored in a register before
; lowering of arguments potentially overwrites the value.
@@ -51,5 +51,3 @@ false:
tail call fastcc void @l298(i32 %r10, i32 %r9, i32 %r4) noreturn nounwind
ret void
}
-
-
diff --git a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
index 00ac71a..7faee99 100644
--- a/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-25-DotDebugLoc.ll
@@ -199,12 +199,13 @@ declare float @copysignf(float, float) nounwind readnone
declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!48}
!0 = metadata !{i32 786689, metadata !1, metadata !"a", metadata !2, i32 1921, metadata !9, i32 0, null} ; [ DW_TAG_arg_variable ]
!1 = metadata !{i32 786478, metadata !45, metadata !2, metadata !"__divsc3", metadata !"__divsc3", metadata !"__divsc3", i32 1922, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, %0 (float, float, float, float)* @__divsc3, null, null, metadata !43, i32 1922} ; [ DW_TAG_subprogram ]
!2 = metadata !{i32 786473, metadata !45} ; [ DW_TAG_file_type ]
!3 = metadata !{i32 786449, metadata !45, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !47, metadata !47, metadata !44, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !45, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{i32 786453, metadata !45, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!5 = metadata !{metadata !6, metadata !9, metadata !9, metadata !9, metadata !9}
!6 = metadata !{i32 786454, metadata !46, metadata !7, metadata !"SCtype", i32 170, i64 0, i64 0, i64 0, i32 0, metadata !8} ; [ DW_TAG_typedef ]
!7 = metadata !{i32 786473, metadata !46} ; [ DW_TAG_file_type ]
@@ -248,3 +249,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!45 = metadata !{metadata !"libgcc2.c", metadata !"/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc"}
!46 = metadata !{metadata !"libgcc2.h", metadata !"/Users/yash/clean/LG.D/gcc/../../llvmgcc/gcc"}
!47 = metadata !{i32 0}
+!48 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
index 4b1dfb3..c5736eb 100644
--- a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
@@ -22,6 +22,7 @@ declare void @foo(i32) nounwind optsize noinline ssp
declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!38}
!0 = metadata !{i32 786484, i32 0, metadata !1, metadata !"ret", metadata !"ret", metadata !"", metadata !1, i32 7, metadata !3, i1 false, i1 true, null, null} ; [ DW_TAG_variable ]
!1 = metadata !{i32 786473, metadata !36} ; [ DW_TAG_file_type ]
@@ -29,21 +30,21 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!3 = metadata !{i32 786468, metadata !36, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
!4 = metadata !{i32 786689, metadata !5, metadata !"x", metadata !1, i32 12, metadata !3, i32 0, null} ; [ DW_TAG_arg_variable ]
!5 = metadata !{i32 786478, metadata !36, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 13, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, void (i32)* @foo, null, null, metadata !33, i32 13} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!6 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!7 = metadata !{null, metadata !3}
!8 = metadata !{i32 786689, metadata !9, metadata !"myvar", metadata !1, i32 17, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
!9 = metadata !{i32 786478, metadata !36, metadata !1, metadata !"bar", metadata !"bar", metadata !"bar", i32 17, metadata !10, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i8* (%struct.a*)* @bar, null, null, metadata !34, i32 17} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!10 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!11 = metadata !{metadata !12, metadata !13}
!12 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ]
!13 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ]
-!14 = metadata !{i32 786451, metadata !36, metadata !1, metadata !"a", i32 2, i64 128, i64 64, i64 0, i32 0, null, metadata !15, i32 0, null} ; [ DW_TAG_structure_type ]
+!14 = metadata !{i32 786451, metadata !36, metadata !1, metadata !"a", i32 2, i64 128, i64 64, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [a] [line 2, size 128, align 64, offset 0] [def] [from ]
!15 = metadata !{metadata !16, metadata !17}
!16 = metadata !{i32 786445, metadata !36, metadata !14, metadata !"c", i32 3, i64 32, i64 32, i64 0, i32 0, metadata !3} ; [ DW_TAG_member ]
!17 = metadata !{i32 786445, metadata !36, metadata !14, metadata !"d", i32 4, i64 64, i64 64, i64 64, i32 0, metadata !13} ; [ DW_TAG_member ]
!18 = metadata !{i32 786689, metadata !19, metadata !"argc", metadata !1, i32 22, metadata !3, i32 0, null} ; [ DW_TAG_arg_variable ]
!19 = metadata !{i32 786478, metadata !36, metadata !1, metadata !"main", metadata !"main", metadata !"main", i32 22, metadata !20, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, null, null, null, metadata !35, i32 22} ; [ DW_TAG_subprogram ]
-!20 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !21, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!20 = metadata !{i32 786453, metadata !36, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !21, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!21 = metadata !{metadata !3, metadata !3, metadata !22}
!22 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !23} ; [ DW_TAG_pointer_type ]
!23 = metadata !{i32 786447, metadata !36, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !24} ; [ DW_TAG_pointer_type ]
@@ -86,3 +87,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
; CHECK-NEXT: Ltmp{{.*}}:
; CHECK-NEXT: .byte 83
; CHECK-NEXT: Ltmp{{.*}}:
+!38 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2010-05-28-Crash.ll b/test/CodeGen/X86/2010-05-28-Crash.ll
index d5c0ead..1114c8d 100644
--- a/test/CodeGen/X86/2010-05-28-Crash.ll
+++ b/test/CodeGen/X86/2010-05-28-Crash.ll
@@ -23,12 +23,13 @@ entry:
}
!llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!20}
!0 = metadata !{i32 786689, metadata !1, metadata !"y", metadata !2, i32 2, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
!1 = metadata !{i32 786478, metadata !18, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 2, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 (i32)* @foo, null, null, metadata !15, i32 2} ; [ DW_TAG_subprogram ]
!2 = metadata !{i32 786473, metadata !18} ; [ DW_TAG_file_type ]
!3 = metadata !{i32 786449, metadata !18, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !19, metadata !19, metadata !17, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !18, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{i32 786453, metadata !18, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!5 = metadata !{metadata !6, metadata !6}
!6 = metadata !{i32 786468, metadata !18, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
!7 = metadata !{i32 786689, metadata !8, metadata !"x", metadata !2, i32 6, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
@@ -48,3 +49,4 @@ entry:
;CHECK: DEBUG_VALUE: bar:x <- E
;CHECK: Ltmp
;CHECK: DEBUG_VALUE: foo:y <- 1{{$}}
+!20 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
index 1571a58..b45ac22 100644
--- a/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
+++ b/test/CodeGen/X86/2010-06-01-DeadArg-DbgInfo.ll
@@ -20,18 +20,19 @@ entry:
declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.cu = !{!4}
+!llvm.module.flags = !{!34}
!llvm.dbg.lv = !{!0, !14, !15, !16, !17, !24, !25, !28}
!0 = metadata !{i32 786689, metadata !1, metadata !"this", metadata !3, i32 11, metadata !12, i32 0, null} ; [ DW_TAG_arg_variable ]
!1 = metadata !{i32 786478, metadata !31, metadata !2, metadata !"bar", metadata !"bar", metadata !"_ZN3foo3barEi", i32 11, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 (%struct.foo*, i32)* @_ZN3foo3bazEi, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786451, metadata !31, metadata !3, metadata !"foo", i32 3, i64 32, i64 32, i64 0, i32 0, null, metadata !5, i32 0, null} ; [ DW_TAG_structure_type ]
+!2 = metadata !{i32 786451, metadata !31, metadata !3, metadata !"foo", i32 3, i64 32, i64 32, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [foo] [line 3, size 32, align 32, offset 0] [def] [from ]
!3 = metadata !{i32 786473, metadata !31} ; [ DW_TAG_file_type ]
!4 = metadata !{i32 786449, metadata !31, i32 4, metadata !"4.2.1 LLVM build", i1 true, metadata !"", i32 0, metadata !32, metadata !32, metadata !33, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
!5 = metadata !{metadata !6, metadata !1, metadata !8}
!6 = metadata !{i32 786445, metadata !31, metadata !2, metadata !"y", i32 8, i64 32, i64 32, i64 0, i32 0, metadata !7} ; [ DW_TAG_member ]
!7 = metadata !{i32 786468, metadata !31, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
!8 = metadata !{i32 786478, metadata !31, metadata !2, metadata !"baz", metadata !"baz", metadata !"_ZN3foo3bazEi", i32 15, metadata !9, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 (%struct.foo*, i32)* @_ZN3foo3bazEi, null, null, null, i32 15} ; [ DW_TAG_subprogram ]
-!9 = metadata !{i32 786453, metadata !31, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!9 = metadata !{i32 786453, metadata !31, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !10, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!10 = metadata !{metadata !7, metadata !11, metadata !7}
!11 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !2} ; [ DW_TAG_pointer_type ]
!12 = metadata !{i32 786470, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !13} ; [ DW_TAG_const_type ]
@@ -41,7 +42,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!16 = metadata !{i32 786689, metadata !8, metadata !"x", metadata !3, i32 15, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
!17 = metadata !{i32 786689, metadata !18, metadata !"argc", metadata !3, i32 19, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
!18 = metadata !{i32 786478, metadata !31, metadata !3, metadata !"main", metadata !"main", metadata !"main", i32 19, metadata !19, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, null, null, null, null, i32 19} ; [ DW_TAG_subprogram ]
-!19 = metadata !{i32 786453, metadata !31, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !20, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!19 = metadata !{i32 786453, metadata !31, metadata !3, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !20, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!20 = metadata !{metadata !7, metadata !7, metadata !21}
!21 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !22} ; [ DW_TAG_pointer_type ]
!22 = metadata !{i32 786447, metadata !31, metadata !3, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !23} ; [ DW_TAG_pointer_type ]
@@ -56,3 +57,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!31 = metadata !{metadata !"foo.cp", metadata !"/tmp/"}
!32 = metadata !{i32 0}
!33 = metadata !{metadata !1, metadata !8, metadata !18}
+!34 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2010-07-06-DbgCrash.ll b/test/CodeGen/X86/2010-07-06-DbgCrash.ll
index e91cd76..b49aec3 100644
--- a/test/CodeGen/X86/2010-07-06-DbgCrash.ll
+++ b/test/CodeGen/X86/2010-07-06-DbgCrash.ll
@@ -7,15 +7,14 @@
!39 = metadata !{i32 524305, metadata !109, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build 9999)", i1 true, metadata !"", i32 0, metadata !108, metadata !108, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
!46 = metadata !{i32 524303, metadata !109, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !47} ; [ DW_TAG_pointer_type ]
!47 = metadata !{i32 524324, metadata !109, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!97 = metadata !{i32 524334, i32 0, metadata !39, metadata !"main", metadata !"main", metadata !"main", i32 73, metadata !98, i1 false, i1 true,
- i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!98 = metadata !{i32 524309, metadata !109, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !99, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!97 = metadata !{i32 524334, i32 0, metadata !39, metadata !"main", metadata !"main", metadata !"main", i32 73, metadata !98, i1 false, i1 true, i32 0, i32 0, null, i32 0, i32 0, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
+!98 = metadata !{i32 524309, metadata !109, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !99, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!99 = metadata !{metadata !100}
!100 = metadata !{i32 524324, metadata !109, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
!101 = metadata !{[2 x i8*]* @C.9.2167}
!102 = metadata !{i32 524544, metadata !103, metadata !"find_strings", metadata !38, i32 75, metadata !104, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
!103 = metadata !{i32 524299, null, metadata !97, i32 73, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!104 = metadata !{i32 524289, metadata !109, null, metadata !"", i32 0, i64 85312, i64 64, i64 0, i32 0, metadata !46, metadata !105, i32 0, null} ; [ DW_TAG_array_type ]
+!104 = metadata !{i32 524289, metadata !109, null, metadata !"", i32 0, i64 85312, i64 64, i64 0, i32 0, metadata !46, metadata !105, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 85312, align 64, offset 0] [from ]
!105 = metadata !{metadata !106}
!106 = metadata !{i32 524321, i64 0, i64 1333} ; [ DW_TAG_subrange_type ]
!107 = metadata !{i32 73, i32 0, metadata !103, null}
diff --git a/test/CodeGen/X86/2010-08-04-StackVariable.ll b/test/CodeGen/X86/2010-08-04-StackVariable.ll
index c6e1654..91fec3b 100644
--- a/test/CodeGen/X86/2010-08-04-StackVariable.ll
+++ b/test/CodeGen/X86/2010-08-04-StackVariable.ll
@@ -75,10 +75,11 @@ return: ; preds = %entry
declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!49}
!46 = metadata !{metadata !0, metadata !9, metadata !16, metadata !17, metadata !20}
!0 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"", i32 11, metadata !14, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786451, metadata !47, metadata !2, metadata !"SVal", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_structure_type ]
+!1 = metadata !{i32 786451, metadata !47, metadata !2, metadata !"SVal", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ]
!2 = metadata !{i32 786473, metadata !47} ; [ DW_TAG_file_type ]
!3 = metadata !{i32 786449, metadata !47, i32 4, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !48, metadata !48, metadata !46, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
!4 = metadata !{metadata !5, metadata !7, metadata !0, metadata !9}
@@ -87,18 +88,18 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!7 = metadata !{i32 786445, metadata !47, metadata !1, metadata !"Kind", i32 8, i64 32, i64 32, i64 64, i32 0, metadata !8} ; [ DW_TAG_member ]
!8 = metadata !{i32 786468, metadata !47, metadata !2, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
!9 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"~SVal", metadata !"~SVal", metadata !"", i32 12, metadata !10, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 12} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!10 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!11 = metadata !{null, metadata !12, metadata !13}
!12 = metadata !{i32 786447, metadata !47, metadata !2, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 64, metadata !1} ; [ DW_TAG_pointer_type ]
!13 = metadata !{i32 786468, metadata !47, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!14 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !15, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!15 = metadata !{null, metadata !12}
!16 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"_ZN4SValC1Ev", i32 11, metadata !14, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, void (%struct.SVal*)* @_ZN4SValC1Ev, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
!17 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"foo", metadata !"foo", metadata !"_Z3fooi4SVal", i32 16, metadata !18, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 (i32, %struct.SVal*)* @_Z3fooi4SVal, null, null, null, i32 16} ; [ DW_TAG_subprogram ]
-!18 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !19, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!18 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !19, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!19 = metadata !{metadata !13, metadata !13, metadata !1}
!20 = metadata !{i32 786478, metadata !47, metadata !2, metadata !"main", metadata !"main", metadata !"main", i32 23, metadata !21, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @main, null, null, null, i32 23} ; [ DW_TAG_subprogram ]
-!21 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!21 = metadata !{i32 786453, metadata !47, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!22 = metadata !{metadata !13}
!23 = metadata !{i32 786689, metadata !17, metadata !"i", metadata !2, i32 16, metadata !13, i32 0, null} ; [ DW_TAG_arg_variable ]
!24 = metadata !{i32 16, i32 0, metadata !17, null}
@@ -125,3 +126,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!45 = metadata !{i32 27, i32 0, metadata !39, null}
!47 = metadata !{metadata !"small.cc", metadata !"/Users/manav/R8248330"}
!48 = metadata !{i32 0}
+!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
index 831fe66..9aa41c3 100644
--- a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
+++ b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
@@ -13,11 +13,12 @@ entry:
}
!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!17}
!0 = metadata !{i32 786478, metadata !14, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 53, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!1 = metadata !{i32 786473, metadata !14} ; [ DW_TAG_file_type ]
!2 = metadata !{i32 786449, metadata !15, i32 12, metadata !"clang version 2.9 (trunk 114084)", i1 false, metadata !"", i32 0, metadata !16, metadata !16, metadata !13, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !14, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, metadata !13, null} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !14, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!4 = metadata !{metadata !5}
!5 = metadata !{i32 786468, metadata !14, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
!6 = metadata !{i32 786478, metadata !15, metadata !7, metadata !"bar", metadata !"bar", metadata !"bar", i32 4, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 ()* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
@@ -31,3 +32,4 @@ entry:
!14 = metadata !{metadata !"", metadata !"/private/tmp"}
!15 = metadata !{metadata !"bug.c", metadata !"/private/tmp"}
!16 = metadata !{i32 0}
+!17 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll b/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll
index 1b33977..39d89e3 100644
--- a/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll
+++ b/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll
@@ -19,8 +19,8 @@ entry:
}
; CHECK: movq ___stack_chk_guard@GOTPCREL(%rip)
-; CHECK: movb 38(%rsp), [[R0:%.+]]
-; CHECK: movb 8(%rsp), [[R1:%.+]]
-; CHECK: movb [[R1]], 8(%rsp)
-; CHECK: movb [[R0]], 38(%rsp)
+; CHECK: movb (%rsp), [[R1:%.+]]
+; CHECK: movb 30(%rsp), [[R0:%.+]]
+; CHECK: movb [[R1]], (%rsp)
+; CHECK: movb [[R0]], 30(%rsp)
; CHECK: callq ___stack_chk_fail
diff --git a/test/CodeGen/X86/2010-11-02-DbgParameter.ll b/test/CodeGen/X86/2010-11-02-DbgParameter.ll
index e118e80..21ac7c9 100644
--- a/test/CodeGen/X86/2010-11-02-DbgParameter.ll
+++ b/test/CodeGen/X86/2010-11-02-DbgParameter.ll
@@ -16,16 +16,17 @@ entry:
declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!19}
!0 = metadata !{i32 786478, metadata !17, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (%struct.bar*)* @foo, null, null, metadata !16, i32 3} ; [ DW_TAG_subprogram ]
!1 = metadata !{i32 786473, metadata !17} ; [ DW_TAG_file_type ]
!2 = metadata !{i32 786449, metadata !17, i32 12, metadata !"clang version 2.9 (trunk 117922)", i1 true, metadata !"", i32 0, metadata !18, metadata !18, metadata !15, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !17, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !17, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!4 = metadata !{metadata !5}
!5 = metadata !{i32 786468, metadata !17, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
!6 = metadata !{i32 786689, metadata !0, metadata !"i", metadata !1, i32 3, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
!7 = metadata !{i32 786447, metadata !17, metadata !1, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !8} ; [ DW_TAG_pointer_type ]
-!8 = metadata !{i32 786451, metadata !17, metadata !1, metadata !"bar", i32 2, i64 64, i64 32, i64 0, i32 0, null, metadata !9, i32 0, null} ; [ DW_TAG_structure_type ]
+!8 = metadata !{i32 786451, metadata !17, metadata !1, metadata !"bar", i32 2, i64 64, i64 32, i64 0, i32 0, null, metadata !9, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [bar] [line 2, size 64, align 32, offset 0] [def] [from ]
!9 = metadata !{metadata !10, metadata !11}
!10 = metadata !{i32 786445, metadata !17, metadata !1, metadata !"x", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !5} ; [ DW_TAG_member ]
!11 = metadata !{i32 786445, metadata !17, metadata !1, metadata !"y", i32 2, i64 32, i64 32, i64 32, i32 0, metadata !5} ; [ DW_TAG_member ]
@@ -36,3 +37,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!16 = metadata !{metadata !6}
!17 = metadata !{metadata !"one.c", metadata !"/private/tmp"}
!18 = metadata !{i32 0}
+!19 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2010-12-02-MC-Set.ll b/test/CodeGen/X86/2010-12-02-MC-Set.ll
index 1a4c586..5a407d3 100644
--- a/test/CodeGen/X86/2010-12-02-MC-Set.ll
+++ b/test/CodeGen/X86/2010-12-02-MC-Set.ll
@@ -7,12 +7,13 @@ entry:
}
!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!10}
!7 = metadata !{metadata !0}
!0 = metadata !{i32 786478, metadata !9, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
!1 = metadata !{i32 786473, metadata !9} ; [ DW_TAG_file_type ]
!2 = metadata !{i32 786449, metadata !9, i32 12, metadata !"clang version 2.9 (trunk 120563)", i1 false, metadata !"", i32 0, metadata !8, metadata !8, metadata !7, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !9, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !9, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!4 = metadata !{null}
!5 = metadata !{i32 5, i32 1, metadata !6, null}
!6 = metadata !{i32 786443, metadata !9, metadata !0, i32 3, i32 16, i32 0} ; [ DW_TAG_lexical_block ]
@@ -23,3 +24,4 @@ entry:
; CHECK-NEXT: __debug_line
; CHECK-NEXT: Lline_table_start0
; CHECK-NEXT: Ltmp{{[0-9]}} = (Ltmp
+!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
index 3e0fbca..d534030 100644
--- a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
+++ b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
@@ -70,15 +70,16 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
declare i32 @puts(i8* nocapture) nounwind
!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!33}
-!0 = metadata !{i32 786478, metadata !31, metadata !1, metadata !"gcd", metadata !"gcd", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i64 (i64, i64)* @gcd, null, null, metadata !29, i32 0} ; [ DW_TAG_subprogram ]
+!0 = metadata !{i32 786478, metadata !31, metadata !1, metadata !"gcd", metadata !"gcd", metadata !"", i32 5, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i64 (i64, i64)* @gcd, null, null, metadata !29, i32 0} ; [ DW_TAG_subprogram ] [line 5] [def] [scope 0] [gcd]
!1 = metadata !{i32 786473, metadata !31} ; [ DW_TAG_file_type ]
!2 = metadata !{i32 786449, metadata !31, i32 12, metadata !"clang version 2.9 (trunk 124117)", i1 true, metadata !"", i32 0, metadata !32, metadata !32, metadata !28, null, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !31, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !31, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!4 = metadata !{metadata !5}
!5 = metadata !{i32 786468, null, metadata !2, metadata !"long int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !31, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 25, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 true, i32 ()* @main, null, null, metadata !30, i32 0} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 786453, metadata !31, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!6 = metadata !{i32 786478, metadata !31, metadata !1, metadata !"main", metadata !"main", metadata !"", i32 25, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 true, i32 ()* @main, null, null, metadata !30, i32 0} ; [ DW_TAG_subprogram ] [line 25] [def] [scope 0] [main]
+!7 = metadata !{i32 786453, metadata !31, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !8, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!8 = metadata !{metadata !9}
!9 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
!10 = metadata !{i32 786689, metadata !0, metadata !"a", metadata !1, i32 5, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
@@ -104,3 +105,4 @@ declare i32 @puts(i8* nocapture) nounwind
!30 = metadata !{metadata !14, metadata !17}
!31 = metadata !{metadata !"rem_small.c", metadata !"/private/tmp"}
!32 = metadata !{i32 0}
+!33 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2011-05-26-UnreachableBlockElim.ll b/test/CodeGen/X86/2011-05-26-UnreachableBlockElim.ll
index 0f18f09..91cd208 100644
--- a/test/CodeGen/X86/2011-05-26-UnreachableBlockElim.ll
+++ b/test/CodeGen/X86/2011-05-26-UnreachableBlockElim.ll
@@ -8,7 +8,7 @@ target triple = "x86_64-apple-macosx10.6.0"
@aux_temp = external global %struct.dfa, align 8
-declare i64 @llvm.objectsize.i64(i8*, i1) nounwind readnone
+declare i64 @llvm.objectsize.i64.p0i8(i8*, i1) nounwind readnone
declare void @__memset_chk() nounwind
@@ -21,12 +21,12 @@ if.end.i: ; preds = %entry
br i1 undef, label %land.end.thread.i, label %land.end.i
land.end.thread.i: ; preds = %if.end.i
- %0 = call i64 @llvm.objectsize.i64(i8* undef, i1 false) nounwind
+ %0 = call i64 @llvm.objectsize.i64.p0i8(i8* undef, i1 false) nounwind
%cmp1710.i = icmp eq i64 %0, -1
br i1 %cmp1710.i, label %cond.false156.i, label %cond.true138.i
land.end.i: ; preds = %if.end.i
- %1 = call i64 @llvm.objectsize.i64(i8* undef, i1 false) nounwind
+ %1 = call i64 @llvm.objectsize.i64.p0i8(i8* undef, i1 false) nounwind
%cmp17.i = icmp eq i64 %1, -1
br i1 %cmp17.i, label %cond.false156.i, label %cond.true138.i
@@ -41,13 +41,8 @@ cond.false156.i: ; preds = %for.end.i, %land.en
cond.end166.i: ; preds = %cond.false156.i, %cond.true138.i
%idxprom1113.i = phi i64 [ %idxprom1114.i, %cond.false156.i ], [ undef, %cond.true138.i ]
- %tmp235.i = load %struct.state** getelementptr inbounds (%struct.dfa* @aux_temp, i64 0, i32 2), align 8, !tbaa !0
+ %tmp235.i = load %struct.state** getelementptr inbounds (%struct.dfa* @aux_temp, i64 0, i32 2), align 8
%att.i = getelementptr inbounds %struct.state* %tmp235.i, i64 %idxprom1113.i, i32 0
- store i32 0, i32* %att.i, align 4, !tbaa !3
+ store i32 0, i32* %att.i, align 4
ret void
}
-
-!0 = metadata !{metadata !"any pointer", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !"int", metadata !1}
diff --git a/test/CodeGen/X86/2011-06-03-x87chain.ll b/test/CodeGen/X86/2011-06-03-x87chain.ll
index ce63c74..5275b68 100644
--- a/test/CodeGen/X86/2011-06-03-x87chain.ll
+++ b/test/CodeGen/X86/2011-06-03-x87chain.ll
@@ -29,3 +29,21 @@ entry:
store float %conv, float* %f, align 4
ret float %conv
}
+
+define void @PR17495() {
+entry:
+ br i1 undef, label %while.end, label %while.body
+
+while.body: ; preds = %while.body, %entry
+ %x.1.copyload = load i24* undef, align 1
+ %conv = sitofp i24 %x.1.copyload to float
+ %div = fmul float %conv, 0x3E80000000000000
+ store float %div, float* undef, align 4
+ br i1 false, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ ret void
+
+; CHECK-LABEL: @PR17495
+; CHECK-NOT: fildll
+}
diff --git a/test/CodeGen/X86/2011-09-18-sse2cmp.ll b/test/CodeGen/X86/2011-09-18-sse2cmp.ll
index a6f428f..89de648 100644
--- a/test/CodeGen/X86/2011-09-18-sse2cmp.ll
+++ b/test/CodeGen/X86/2011-09-18-sse2cmp.ll
@@ -1,4 +1,4 @@
-;RUN: llc < %s -march=x86 -mcpu=yonah -mattr=+sse2,-sse41 | FileCheck %s
+;RUN: llc < %s -march=x86 -mcpu=yonah -mattr=+sse2,-sse4.1 | FileCheck %s
;CHECK: @max
;CHECK: cmplepd
diff --git a/test/CodeGen/X86/2011-09-21-setcc-bug.ll b/test/CodeGen/X86/2011-09-21-setcc-bug.ll
index 4daf678..a67c3f3 100644
--- a/test/CodeGen/X86/2011-09-21-setcc-bug.ll
+++ b/test/CodeGen/X86/2011-09-21-setcc-bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse41
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse4.1
; Make sure we are not crashing on this code.
diff --git a/test/CodeGen/X86/2011-10-11-srl.ll b/test/CodeGen/X86/2011-10-11-srl.ll
index 6c6d340..434f88c 100644
--- a/test/CodeGen/X86/2011-10-11-srl.ll
+++ b/test/CodeGen/X86/2011-10-11-srl.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=-sse41
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=-sse4.1
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/2011-10-12-MachineCSE.ll b/test/CodeGen/X86/2011-10-12-MachineCSE.ll
index cd15f84..72e672a 100644
--- a/test/CodeGen/X86/2011-10-12-MachineCSE.ll
+++ b/test/CodeGen/X86/2011-10-12-MachineCSE.ll
@@ -20,11 +20,11 @@ entry:
%2 = lshr i32 %1, 16
%bf.clear = and i32 %2, 255
%idxprom = sext i32 %bf.clear to i64
- %3 = load %struct.optab** getelementptr inbounds ([49 x %struct.optab*]* @optab_table, i32 0, i64 0), align 8, !tbaa !0
+ %3 = load %struct.optab** getelementptr inbounds ([49 x %struct.optab*]* @optab_table, i32 0, i64 0), align 8
%handlers = getelementptr inbounds %struct.optab* %3, i32 0, i32 1
%arrayidx = getelementptr inbounds [59 x %struct.anon.3]* %handlers, i32 0, i64 %idxprom
%insn_code = getelementptr inbounds %struct.anon.3* %arrayidx, i32 0, i32 0
- %4 = load i32* %insn_code, align 4, !tbaa !3
+ %4 = load i32* %insn_code, align 4
%cmp = icmp eq i32 %4, 1317
br i1 %cmp, label %if.then, label %lor.lhs.false
@@ -32,14 +32,14 @@ lor.lhs.false: ; preds = %entry
%idxprom1 = sext i32 %4 to i64
%arrayidx2 = getelementptr inbounds [0 x %struct.insn_data]* @insn_data, i32 0, i64 %idxprom1
%operand = getelementptr inbounds %struct.insn_data* %arrayidx2, i32 0, i32 3
- %5 = load %struct.insn_operand_data** %operand, align 8, !tbaa !0
+ %5 = load %struct.insn_operand_data** %operand, align 8
%arrayidx3 = getelementptr inbounds %struct.insn_operand_data* %5, i64 0
%predicate = getelementptr inbounds %struct.insn_operand_data* %arrayidx3, i32 0, i32 0
- %6 = load i32 (%struct.rtx_def*, i32)** %predicate, align 8, !tbaa !0
+ %6 = load i32 (%struct.rtx_def*, i32)** %predicate, align 8
%idxprom4 = sext i32 %4 to i64
%arrayidx5 = getelementptr inbounds [0 x %struct.insn_data]* @insn_data, i32 0, i64 %idxprom4
%operand6 = getelementptr inbounds %struct.insn_data* %arrayidx5, i32 0, i32 3
- %7 = load %struct.insn_operand_data** %operand6, align 8, !tbaa !0
+ %7 = load %struct.insn_operand_data** %operand6, align 8
%arrayidx7 = getelementptr inbounds %struct.insn_operand_data* %7, i64 0
%8 = bitcast %struct.insn_operand_data* %arrayidx7 to i8*
%bf.field.offs = getelementptr i8* %8, i32 16
@@ -54,14 +54,14 @@ lor.lhs.false9: ; preds = %lor.lhs.false
%idxprom10 = sext i32 %4 to i64
%arrayidx11 = getelementptr inbounds [0 x %struct.insn_data]* @insn_data, i32 0, i64 %idxprom10
%operand12 = getelementptr inbounds %struct.insn_data* %arrayidx11, i32 0, i32 3
- %11 = load %struct.insn_operand_data** %operand12, align 8, !tbaa !0
+ %11 = load %struct.insn_operand_data** %operand12, align 8
%arrayidx13 = getelementptr inbounds %struct.insn_operand_data* %11, i64 1
%predicate14 = getelementptr inbounds %struct.insn_operand_data* %arrayidx13, i32 0, i32 0
- %12 = load i32 (%struct.rtx_def*, i32)** %predicate14, align 8, !tbaa !0
+ %12 = load i32 (%struct.rtx_def*, i32)** %predicate14, align 8
%idxprom15 = sext i32 %4 to i64
%arrayidx16 = getelementptr inbounds [0 x %struct.insn_data]* @insn_data, i32 0, i64 %idxprom15
%operand17 = getelementptr inbounds %struct.insn_data* %arrayidx16, i32 0, i32 3
- %13 = load %struct.insn_operand_data** %operand17, align 8, !tbaa !0
+ %13 = load %struct.insn_operand_data** %operand17, align 8
%arrayidx18 = getelementptr inbounds %struct.insn_operand_data* %13, i64 1
%14 = bitcast %struct.insn_operand_data* %arrayidx18 to i8*
%bf.field.offs19 = getelementptr i8* %14, i32 16
@@ -76,14 +76,14 @@ lor.lhs.false23: ; preds = %lor.lhs.false9
%idxprom24 = sext i32 %4 to i64
%arrayidx25 = getelementptr inbounds [0 x %struct.insn_data]* @insn_data, i32 0, i64 %idxprom24
%operand26 = getelementptr inbounds %struct.insn_data* %arrayidx25, i32 0, i32 3
- %17 = load %struct.insn_operand_data** %operand26, align 8, !tbaa !0
+ %17 = load %struct.insn_operand_data** %operand26, align 8
%arrayidx27 = getelementptr inbounds %struct.insn_operand_data* %17, i64 2
%predicate28 = getelementptr inbounds %struct.insn_operand_data* %arrayidx27, i32 0, i32 0
- %18 = load i32 (%struct.rtx_def*, i32)** %predicate28, align 8, !tbaa !0
+ %18 = load i32 (%struct.rtx_def*, i32)** %predicate28, align 8
%idxprom29 = sext i32 %4 to i64
%arrayidx30 = getelementptr inbounds [0 x %struct.insn_data]* @insn_data, i32 0, i64 %idxprom29
%operand31 = getelementptr inbounds %struct.insn_data* %arrayidx30, i32 0, i32 3
- %19 = load %struct.insn_operand_data** %operand31, align 8, !tbaa !0
+ %19 = load %struct.insn_operand_data** %operand31, align 8
%arrayidx32 = getelementptr inbounds %struct.insn_operand_data* %19, i64 2
%20 = bitcast %struct.insn_operand_data* %arrayidx32 to i8*
%bf.field.offs33 = getelementptr i8* %20, i32 16
@@ -101,7 +101,7 @@ if.end: ; preds = %lor.lhs.false23
%idxprom37 = sext i32 %4 to i64
%arrayidx38 = getelementptr inbounds [0 x %struct.insn_data]* @insn_data, i32 0, i64 %idxprom37
%genfun = getelementptr inbounds %struct.insn_data* %arrayidx38, i32 0, i32 2
- %23 = load %struct.rtx_def* (%struct.rtx_def*, ...)** %genfun, align 8, !tbaa !0
+ %23 = load %struct.rtx_def* (%struct.rtx_def*, ...)** %genfun, align 8
%call39 = tail call %struct.rtx_def* (%struct.rtx_def*, ...)* %23(%struct.rtx_def* %r0, %struct.rtx_def* %r1, %struct.rtx_def* %c)
br label %return
@@ -109,8 +109,3 @@ return: ; preds = %if.end, %if.then
%24 = phi %struct.rtx_def* [ %call39, %if.end ], [ null, %if.then ]
ret %struct.rtx_def* %24
}
-
-!0 = metadata !{metadata !"any pointer", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
-!3 = metadata !{metadata !"_ZTS9insn_code", metadata !1}
diff --git a/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll b/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
index da734d4..07a6910 100644
--- a/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
+++ b/test/CodeGen/X86/2011-10-19-LegelizeLoad.ll
@@ -16,8 +16,8 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK: main
define i32 @main() nounwind uwtable {
entry:
-; CHECK: pmovsxbq j(%rip), %
; CHECK: pmovsxbq i(%rip), %
+; CHECK: pmovsxbq j(%rip), %
%0 = load <2 x i8>* @i, align 8
%1 = load <2 x i8>* @j, align 8
%div = sdiv <2 x i8> %1, %0
@@ -25,4 +25,3 @@ entry:
ret i32 0
; CHECK: ret
}
-
diff --git a/test/CodeGen/X86/2011-12-15-vec_shift.ll b/test/CodeGen/X86/2011-12-15-vec_shift.ll
index dc3a08b..0183e10 100644
--- a/test/CodeGen/X86/2011-12-15-vec_shift.ll
+++ b/test/CodeGen/X86/2011-12-15-vec_shift.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=x86-64 -mattr=+sse41 -mcpu=penryn < %s | FileCheck %s -check-prefix=CHECK-W-SSE4
-; RUN: llc -march=x86-64 -mattr=-sse41 -mcpu=penryn < %s | FileCheck %s -check-prefix=CHECK-WO-SSE4
+; RUN: llc -march=x86-64 -mattr=+sse4.1 -mcpu=penryn < %s | FileCheck %s -check-prefix=CHECK-W-SSE4
+; RUN: llc -march=x86-64 -mattr=-sse4.1 -mcpu=penryn < %s | FileCheck %s -check-prefix=CHECK-WO-SSE4
; Test case for r146671
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7"
diff --git a/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll b/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll
index 7515e80..14643e4 100644
--- a/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll
+++ b/test/CodeGen/X86/2011-12-26-extractelement-duplicate-load.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 -mattr=-sse42,+sse41 < %s | FileCheck %s
+; RUN: llc -march=x86-64 -mattr=-sse4.2,+sse4.1 < %s | FileCheck %s
; Make sure we don't load from the location pointed to by %p
; twice: it has non-obvious performance implications, and
; the relevant transformation doesn't know how to update
diff --git a/test/CodeGen/X86/2012-01-16-mfence-nosse-flags.ll b/test/CodeGen/X86/2012-01-16-mfence-nosse-flags.ll
index a883d79..cd8a16f 100644
--- a/test/CodeGen/X86/2012-01-16-mfence-nosse-flags.ll
+++ b/test/CodeGen/X86/2012-01-16-mfence-nosse-flags.ll
@@ -15,7 +15,7 @@ entry:
; CHECK: lock
; CHECK-NEXT: orl {{.*}}, (%esp)
-; CHECK-NEXT: cmpl $0
+; CHECK-NEXT: testl [[REG:%e[a-z]+]], [[REG]]
if.then: ; preds = %entry
tail call void bitcast (void (...)* @foo to void ()*)() nounwind
diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll
index 186fafb..16706ae 100644
--- a/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -5,8 +5,8 @@
; It's hard to test for the ISEL condition because CodeGen optimizes
; away the bugpointed code. Just ensure the basics are still there.
;CHECK-LABEL: func:
-;CHECK: vxorps
-;CHECK: vinsertf128
+;CHECK: vpxor
+;CHECK: vinserti128
;CHECK: vpshufd
;CHECK: vpshufd
;CHECK: vmulps
diff --git a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
index 503aab4..d41b432 100644
--- a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
@@ -6,7 +6,7 @@
;
; CHECK: %entry
; CHECK: DEBUG_VALUE: hg
-; CHECK: je
+; CHECK: j
%struct.node.0.27 = type { i16, double, [3 x double], i32, i32 }
%struct.hgstruct.2.29 = type { %struct.bnode.1.28*, [3 x double], double, [3 x double] }
@@ -36,6 +36,7 @@ return: ; preds = %for.cond.preheader,
declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!12}
!0 = metadata !{i32 786449, metadata !11, i32 12, metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !3, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Olden/bh/newbh.c] [DW_LANG_C99]
!1 = metadata !{metadata !2}
@@ -44,5 +45,6 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!4 = metadata !{i32 786689, null, metadata !"hg", metadata !5, i32 67109589, metadata !6, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [hg] [line 725]
!5 = metadata !{i32 786473, metadata !11} ; [ DW_TAG_file_type ]
!6 = metadata !{i32 786454, metadata !11, null, metadata !"hgstruct", i32 492, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_typedef ] [hgstruct] [line 492, size 0, align 0, offset 0] [from ]
-!7 = metadata !{i32 786451, metadata !11, null, metadata !"", i32 487, i64 512, i64 64, i32 0, i32 0, null, null, i32 0, i32 0, i32 0} ; [ DW_TAG_structure_type ] [line 487, size 512, align 64, offset 0] [from ]
+!7 = metadata !{i32 786451, metadata !11, null, metadata !"", i32 487, i64 512, i64 64, i32 0, i32 0, null, null, i32 0, null, i32 0, null} ; [ DW_TAG_structure_type ] [line 487, size 512, align 64, offset 0] [def] [from ]
!11 = metadata !{metadata !"MultiSource/Benchmarks/Olden/bh/newbh.c", metadata !"MultiSource/Benchmarks/Olden/bh"}
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2012-11-30-misched-dbg.ll b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
index 21e105d..7befa6b 100644
--- a/test/CodeGen/X86/2012-11-30-misched-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
@@ -63,6 +63,7 @@ if.else4114: ; preds = %if.then4073
declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...)
!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!35}
!0 = metadata !{i32 786449, metadata !19, i32 12, metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/MiBench/consumer-typeset/MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] [DW_LANG_C99]
!1 = metadata !{metadata !2}
@@ -79,7 +80,7 @@ declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...)
!12 = metadata !{i32 786443, metadata !13, i32 249, i32 0, metadata !14, i32 23} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
!13 = metadata !{i32 786443, metadata !3, i32 221, i32 0, metadata !14, i32 19} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
!14 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
-!15 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 160, i64 8, i32 0, i32 0, metadata !16, metadata !17, i32 0, i32 0} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
+!15 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 160, i64 8, i32 0, i32 0, metadata !16, metadata !17, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
!16 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
!17 = metadata !{metadata !18}
!18 = metadata !{i32 786465, i64 0, i64 20} ; [ DW_TAG_subrange_type ] [0, 19]
@@ -134,3 +135,4 @@ declare void @_Znwm()
!32 = metadata !{i32 786454, metadata !34, null, metadata !"HM", i32 28, i64 0, i64 0, i64 0, i32 0, null} ; [ DW_TAG_typedef ] [HM] [line 28, size 0, align 0, offset 0] [from ]
!33 = metadata !{i32 786473, metadata !34} ; [ DW_TAG_file_type ]
!34 = metadata !{metadata !"SingleSource/Benchmarks/Shootout-C++/hash.cpp", metadata !"SingleSource/Benchmarks/Shootout-C++"}
+!35 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
index dcbe109..5aec3d9 100644
--- a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll
@@ -34,11 +34,13 @@ invoke.cont44: ; preds = %if.end
}
!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8}
!0 = metadata !{i32 786449, metadata !6, i32 4, metadata !"clang version 3.3 (trunk 168984) (llvm/trunk 168983)", i1 true, metadata !"", i32 0, metadata !2, metadata !7, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Bullet/MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp] [DW_LANG_C_plus_plus]
!2 = metadata !{null}
!3 = metadata !{i32 786688, null, metadata !"callback", null, i32 214, metadata !4, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [callback] [line 214]
-!4 = metadata !{i32 786451, metadata !6, null, metadata !"btCompoundLeafCallback", i32 90, i64 512, i64 64, i32 0, i32 0, null, null, i32 0, null, null} ; [ DW_TAG_structure_type ] [btCompoundLeafCallback] [line 90, size 512, align 64, offset 0] [from ]
+!4 = metadata !{i32 786451, metadata !6, null, metadata !"btCompoundLeafCallback", i32 90, i64 512, i64 64, i32 0, i32 0, null, null, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [btCompoundLeafCallback] [line 90, size 512, align 64, offset 0] [def] [from ]
!5 = metadata !{i32 786473, metadata !6} ; [ DW_TAG_file_type ]
!6 = metadata !{metadata !"MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", metadata !"MultiSource/Benchmarks/Bullet"}
!7 = metadata !{i32 0}
+!8 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2013-03-13-VEX-DestReg.ll b/test/CodeGen/X86/2013-03-13-VEX-DestReg.ll
index f0c7781..0ff9d39 100644
--- a/test/CodeGen/X86/2013-03-13-VEX-DestReg.ll
+++ b/test/CodeGen/X86/2013-03-13-VEX-DestReg.ll
@@ -23,6 +23,6 @@ entry:
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) #1
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
diff --git a/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll b/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll
new file mode 100644
index 0000000..3455b68
--- /dev/null
+++ b/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll
@@ -0,0 +1,132 @@
+; RUN: llc -mtriple x86_64-apple-darwin -O0 < %s -o - | FileCheck %s
+;
+; During X86 fastisel, the address of an indirect call was resolved
+; through bitcast, ptrtoint, and inttoptr instructions. This is valid
+; only if the related instructions are in the same basic block; otherwise
+; we may reference variables that were not live across basic blocks,
+; resulting in undefined virtual registers.
+;
+; In this example, this is illustrated by the spill/reload of the
+; LOADED_PTR_SLOT.
+;
+; Before this patch, the compiler was accessing two different spill
+; slots.
+; <rdar://problem/15192473>
+
+; CHECK-LABEL: @test_bitcast
+; Load the value of the function pointer: %loaded_ptr
+; CHECK: movq (%rdi), [[LOADED_PTR:%[a-z]+]]
+; Spill %arg2.
+; CHECK: movq %rdx, [[ARG2_SLOT:[0-9]*\(%[a-z]+\)]]
+; Spill %loaded_ptr.
+; CHECK: movq [[LOADED_PTR]], [[LOADED_PTR_SLOT:[0-9]*\(%[a-z]+\)]]
+; Perform the indirect call.
+; Load the first argument
+; CHECK: movq [[ARG2_SLOT]], %rdi
+; Load the second argument
+; CHECK: movq [[ARG2_SLOT]], %rsi
+; Load the third argument
+; CHECK: movq [[ARG2_SLOT]], %rdx
+; Load the function pointer.
+; CHECK: movq [[LOADED_PTR_SLOT]], [[FCT_PTR:%[a-z]+]]
+; Call.
+; CHECK: callq *[[FCT_PTR]]
+; CHECK: ret
+define i64 @test_bitcast(i64 (i64, i64, i64)** %arg, i1 %bool, i64 %arg2) {
+entry:
+ %loaded_ptr = load i64 (i64, i64, i64)** %arg, align 8
+ %raw = bitcast i64 (i64, i64, i64)* %loaded_ptr to i8*
+ switch i1 %bool, label %default [
+ i1 true, label %label_true
+ i1 false, label %label_end
+ ]
+default:
+ unreachable
+
+label_true:
+ br label %label_end
+
+label_end:
+ %fct_ptr = bitcast i8* %raw to i64 (i64, i64, i64)*
+ %res = call i64 %fct_ptr(i64 %arg2, i64 %arg2, i64 %arg2)
+ ret i64 %res
+}
+
+; CHECK-LABEL: @test_inttoptr
+; Load the value of the function pointer: %loaded_ptr
+; CHECK: movq (%rdi), [[LOADED_PTR:%[a-z]+]]
+; Spill %arg2.
+; CHECK: movq %rdx, [[ARG2_SLOT:[0-9]*\(%[a-z]+\)]]
+; Spill %loaded_ptr.
+; CHECK: movq [[LOADED_PTR]], [[LOADED_PTR_SLOT:[0-9]*\(%[a-z]+\)]]
+; Perform the indirect call.
+; Load the first argument
+; CHECK: movq [[ARG2_SLOT]], %rdi
+; Load the second argument
+; CHECK: movq [[ARG2_SLOT]], %rsi
+; Load the third argument
+; CHECK: movq [[ARG2_SLOT]], %rdx
+; Load the function pointer.
+; CHECK: movq [[LOADED_PTR_SLOT]], [[FCT_PTR:%[a-z]+]]
+; Call.
+; CHECK: callq *[[FCT_PTR]]
+; CHECK: ret
+define i64 @test_inttoptr(i64 (i64, i64, i64)** %arg, i1 %bool, i64 %arg2) {
+entry:
+ %loaded_ptr = load i64 (i64, i64, i64)** %arg, align 8
+ %raw = ptrtoint i64 (i64, i64, i64)* %loaded_ptr to i64
+ switch i1 %bool, label %default [
+ i1 true, label %label_true
+ i1 false, label %label_end
+ ]
+default:
+ unreachable
+
+label_true:
+ br label %label_end
+
+label_end:
+ %fct_ptr = inttoptr i64 %raw to i64 (i64, i64, i64)*
+ %res = call i64 %fct_ptr(i64 %arg2, i64 %arg2, i64 %arg2)
+ ret i64 %res
+}
+
+; CHECK-LABEL: @test_ptrtoint
+; Load the value of the function pointer: %loaded_ptr
+; CHECK: movq (%rdi), [[LOADED_PTR:%[a-z]+]]
+; Spill %arg2.
+; CHECK: movq %rdx, [[ARG2_SLOT:[0-9]*\(%[a-z]+\)]]
+; Spill %loaded_ptr.
+; CHECK: movq [[LOADED_PTR]], [[LOADED_PTR_SLOT:[0-9]*\(%[a-z]+\)]]
+; Perform the indirect call.
+; Load the first argument
+; CHECK: movq [[ARG2_SLOT]], %rdi
+; Load the second argument
+; CHECK: movq [[ARG2_SLOT]], %rsi
+; Load the third argument
+; CHECK: movq [[ARG2_SLOT]], %rdx
+; Load the function pointer.
+; CHECK: movq [[LOADED_PTR_SLOT]], [[FCT_PTR:%[a-z]+]]
+; Call.
+; CHECK: callq *[[FCT_PTR]]
+; CHECK: ret
+define i64 @test_ptrtoint(i64 (i64, i64, i64)** %arg, i1 %bool, i64 %arg2) {
+entry:
+ %loaded_ptr = load i64 (i64, i64, i64)** %arg, align 8
+ %raw = bitcast i64 (i64, i64, i64)* %loaded_ptr to i8*
+ switch i1 %bool, label %default [
+ i1 true, label %label_true
+ i1 false, label %label_end
+ ]
+default:
+ unreachable
+
+label_true:
+ br label %label_end
+
+label_end:
+ %fct_int = ptrtoint i8* %raw to i64
+ %fct_ptr = inttoptr i64 %fct_int to i64 (i64, i64, i64)*
+ %res = call i64 %fct_ptr(i64 %arg2, i64 %arg2, i64 %arg2)
+ ret i64 %res
+}
diff --git a/test/CodeGen/X86/3addr-16bit.ll b/test/CodeGen/X86/3addr-16bit.ll
index 77c3c16..fafdfdb 100644
--- a/test/CodeGen/X86/3addr-16bit.ll
+++ b/test/CodeGen/X86/3addr-16bit.ll
@@ -34,7 +34,8 @@ entry:
; 64BIT-LABEL: t2:
; 64BIT-NOT: movw %si, %ax
-; 64BIT: leal -1(%rsi), %eax
+; 64BIT: decl %eax
+; 64BIT: movzwl %ax
%0 = icmp eq i16 %k, %c ; <i1> [#uses=1]
%1 = add i16 %k, -1 ; <i16> [#uses=3]
br i1 %0, label %bb, label %bb1
@@ -58,7 +59,7 @@ entry:
; 64BIT-LABEL: t3:
; 64BIT-NOT: movw %si, %ax
-; 64BIT: leal 2(%rsi), %eax
+; 64BIT: addl $2, %eax
%0 = add i16 %k, 2 ; <i16> [#uses=3]
%1 = icmp eq i16 %k, %c ; <i1> [#uses=1]
br i1 %1, label %bb, label %bb1
@@ -81,7 +82,7 @@ entry:
; 64BIT-LABEL: t4:
; 64BIT-NOT: movw %si, %ax
-; 64BIT: leal (%rsi,%rdi), %eax
+; 64BIT: addl %edi, %eax
%0 = add i16 %k, %c ; <i16> [#uses=3]
%1 = icmp eq i16 %k, %c ; <i1> [#uses=1]
br i1 %1, label %bb, label %bb1
diff --git a/test/CodeGen/X86/GC/lit.local.cfg b/test/CodeGen/X86/GC/lit.local.cfg
index a8ad0f1..ba763cf 100644
--- a/test/CodeGen/X86/GC/lit.local.cfg
+++ b/test/CodeGen/X86/GC/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp']
-
targets = set(config.root.targets_to_build.split())
if not 'X86' in targets:
config.unsupported = True
diff --git a/test/CodeGen/X86/GC/ocaml-gc-assert.ll b/test/CodeGen/X86/GC/ocaml-gc-assert.ll
new file mode 100644
index 0000000..b32ceca
--- /dev/null
+++ b/test/CodeGen/X86/GC/ocaml-gc-assert.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; PR3168
+
+; CHECK-LABEL: append
+
+define i32* @append() gc "ocaml" {
+entry:
+ switch i32 0, label %L2 [i32 0, label %L1]
+L1:
+ %var8 = alloca i8*
+ call void @llvm.gcroot(i8** %var8,i8* null)
+ br label %L3
+L2:
+ call ccc void @oread_runtime_casenotcovered()
+ unreachable
+L3:
+ ret i32* null
+}
+
+declare ccc void @oread_runtime_casenotcovered()
+declare void @llvm.gcroot(i8**,i8*)
diff --git a/test/CodeGen/X86/GC/ocaml-gc.ll b/test/CodeGen/X86/GC/ocaml-gc.ll
index 44241a9..6d5f8ae 100644
--- a/test/CodeGen/X86/GC/ocaml-gc.ll
+++ b/test/CodeGen/X86/GC/ocaml-gc.ll
@@ -2,23 +2,23 @@
define i32 @main(i32 %x) nounwind gc "ocaml" {
; CHECK: .text
-; CHECK-NEXT: .globl caml_3C_stdin_3E___code_begin
-; CHECK-NEXT: caml_3C_stdin_3E___code_begin:
+; CHECK-NEXT: .globl "caml<stdin>__code_begin"
+; CHECK-NEXT: "caml<stdin>__code_begin":
; CHECK-NEXT: .data
-; CHECK-NEXT: .globl caml_3C_stdin_3E___data_begin
-; CHECK-NEXT: caml_3C_stdin_3E___data_begin:
+; CHECK-NEXT: .globl "caml<stdin>__data_begin"
+; CHECK-NEXT: "caml<stdin>__data_begin":
%puts = tail call i32 @foo(i32 %x)
ret i32 0
-; CHECK: .globl caml_3C_stdin_3E___code_end
-; CHECK-NEXT: caml_3C_stdin_3E___code_end:
+; CHECK: .globl "caml<stdin>__code_end"
+; CHECK-NEXT: "caml<stdin>__code_end":
; CHECK-NEXT: .data
-; CHECK-NEXT: .globl caml_3C_stdin_3E___data_end
-; CHECK-NEXT: caml_3C_stdin_3E___data_end:
+; CHECK-NEXT: .globl "caml<stdin>__data_end"
+; CHECK-NEXT: "caml<stdin>__data_end":
; CHECK-NEXT: .quad 0
-; CHECK-NEXT: .globl caml_3C_stdin_3E___frametable
-; CHECK-NEXT: caml_3C_stdin_3E___frametable:
+; CHECK-NEXT: .globl "caml<stdin>__frametable"
+; CHECK-NEXT: "caml<stdin>__frametable":
; CHECK-NEXT: .short 1
; CHECK-NEXT: .align 8
; CHECK-NEXT: # live roots for main
diff --git a/test/CodeGen/X86/MachineSink-DbgValue.ll b/test/CodeGen/X86/MachineSink-DbgValue.ll
index df9580c..584e644 100644
--- a/test/CodeGen/X86/MachineSink-DbgValue.ll
+++ b/test/CodeGen/X86/MachineSink-DbgValue.ll
@@ -26,11 +26,12 @@ bb2:
declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!22}
!0 = metadata !{i32 786449, metadata !20, i32 12, metadata !"Apple clang version 3.0 (tags/Apple/clang-211.10.1) (based on LLVM 3.0svn)", i1 true, metadata !"", i32 0, metadata !21, metadata !21, metadata !18, null, null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !20, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32, i32*)* @foo, null, null, metadata !19, i32 0} ; [ DW_TAG_subprogram ]
+!1 = metadata !{i32 786478, metadata !20, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 (i32, i32*)* @foo, null, null, metadata !19, i32 0} ; [ DW_TAG_subprogram ] [line 2] [def] [scope 0] [foo]
!2 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!3 = metadata !{i32 786453, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!4 = metadata !{metadata !5}
!5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
!6 = metadata !{i32 786689, metadata !1, metadata !"i", metadata !2, i32 16777218, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
@@ -49,3 +50,4 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!19 = metadata !{metadata !6, metadata !7, metadata !10}
!20 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
!21 = metadata !{i32 0}
+!22 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/StackColoring-dbg.ll b/test/CodeGen/X86/StackColoring-dbg.ll
index 8b67a44..51d0d17 100644
--- a/test/CodeGen/X86/StackColoring-dbg.ll
+++ b/test/CodeGen/X86/StackColoring-dbg.ll
@@ -25,6 +25,11 @@ declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!23}
+!0 = metadata !{i32 524305, metadata !1, i32 1, metadata !"clang", i1 true, metadata !"", i32 0, metadata !2, metadata !2, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !"t.c", metadata !""}
!16 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6}
!2 = metadata !{i32 0}
!22 = metadata !{i32 786688, null, metadata !"x", metadata !2, i32 16, metadata !16, i32 0, i32 0}
+!23 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/StackColoring.ll b/test/CodeGen/X86/StackColoring.ll
index f1d9296..a8e3537 100644
--- a/test/CodeGen/X86/StackColoring.ll
+++ b/test/CodeGen/X86/StackColoring.ll
@@ -4,8 +4,8 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
-;YESCOLOR: subq $136, %rsp
-;NOCOLOR: subq $264, %rsp
+;YESCOLOR: subq $144, %rsp
+;NOCOLOR: subq $272, %rsp
define i32 @myCall_w2(i32 %in) {
entry:
diff --git a/test/CodeGen/X86/abi-isel.ll b/test/CodeGen/X86/abi-isel.ll
index 3b84231..633e70f 100644
--- a/test/CodeGen/X86/abi-isel.ll
+++ b/test/CodeGen/X86/abi-isel.ll
@@ -1,16 +1,16 @@
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-unknown-linux-gnu -march=x86 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=LINUX-32-STATIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-unknown-linux-gnu -march=x86 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=LINUX-32-PIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-unknown-linux-gnu -march=x86 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-32-STATIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-unknown-linux-gnu -march=x86 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-32-PIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -march=x86-64 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=LINUX-64-STATIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -march=x86-64 -relocation-model=pic -code-model=small | FileCheck %s -check-prefix=LINUX-64-PIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -march=x86-64 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-64-STATIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-unknown-linux-gnu -march=x86-64 -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=LINUX-64-PIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=DARWIN-32-STATIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=dynamic-no-pic -code-model=small | FileCheck %s -check-prefix=DARWIN-32-DYNAMIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=pic -code-model=small | FileCheck %s -check-prefix=DARWIN-32-PIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-STATIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=dynamic-no-pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-DYNAMIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=i686-apple-darwin -march=x86 -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-32-PIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=static -code-model=small | FileCheck %s -check-prefix=DARWIN-64-STATIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=dynamic-no-pic -code-model=small | FileCheck %s -check-prefix=DARWIN-64-DYNAMIC
-; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=pic -code-model=small | FileCheck %s -check-prefix=DARWIN-64-PIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=static -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-STATIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=dynamic-no-pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-DYNAMIC
+; RUN: llc < %s -asm-verbose=0 -mcpu=generic -mtriple=x86_64-apple-darwin -march=x86-64 -relocation-model=pic -code-model=small -pre-RA-sched=list-ilp | FileCheck %s -check-prefix=DARWIN-64-PIC
@src = external global [131072 x i32]
@dst = external global [131072 x i32]
diff --git a/test/CodeGen/X86/add.ll b/test/CodeGen/X86/add.ll
index f36577b..62a62a4 100644
--- a/test/CodeGen/X86/add.ll
+++ b/test/CodeGen/X86/add.ll
@@ -9,7 +9,7 @@ define i32 @test1(i32 inreg %a) nounwind {
%b = add i32 %a, 128
ret i32 %b
; X32: subl $-128, %eax
-; X64: subl $-128,
+; X64: subl $-128,
}
define i64 @test2(i64 inreg %a) nounwind {
%b = add i64 %a, 2147483648
@@ -20,7 +20,7 @@ define i64 @test2(i64 inreg %a) nounwind {
define i64 @test3(i64 inreg %a) nounwind {
%b = add i64 %a, 128
ret i64 %b
-
+
; X32: addl $128, %eax
; X64: subq $-128,
}
@@ -38,7 +38,7 @@ normal:
overflow:
ret i1 false
-
+
; X32-LABEL: test4:
; X32: addl
; X32-NEXT: jo
@@ -82,11 +82,11 @@ define i64 @test6(i64 %A, i32 %B) nounwind {
ret i64 %tmp5
; X32-LABEL: test6:
-; X32: movl 12(%esp), %edx
+; X32: movl 4(%esp), %eax
+; X32-NEXT: movl 12(%esp), %edx
; X32-NEXT: addl 8(%esp), %edx
-; X32-NEXT: movl 4(%esp), %eax
; X32-NEXT: ret
-
+
; X64-LABEL: test6:
; X64: shlq $32, %r[[A1]]
; X64: leaq (%r[[A1]],%r[[A0]]), %rax
diff --git a/test/CodeGen/X86/aes_intrinsics.ll b/test/CodeGen/X86/aes_intrinsics.ll
new file mode 100644
index 0000000..fc1a2cc
--- /dev/null
+++ b/test/CodeGen/X86/aes_intrinsics.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+aes,-avx | FileCheck %s
+
+define <2 x i64> @test_x86_aesni_aesdec(<2 x i64> %a0, <2 x i64> %a1) {
+ ; CHECK: aesdec
+ %res = call <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64>, <2 x i64>) nounwind readnone
+
+
+define <2 x i64> @test_x86_aesni_aesdeclast(<2 x i64> %a0, <2 x i64> %a1) {
+ ; CHECK: aesdeclast
+ %res = call <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64>, <2 x i64>) nounwind readnone
+
+
+define <2 x i64> @test_x86_aesni_aesenc(<2 x i64> %a0, <2 x i64> %a1) {
+ ; CHECK: aesenc
+ %res = call <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64>, <2 x i64>) nounwind readnone
+
+
+define <2 x i64> @test_x86_aesni_aesenclast(<2 x i64> %a0, <2 x i64> %a1) {
+ ; CHECK: aesenclast
+ %res = call <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64>, <2 x i64>) nounwind readnone
+
+
+define <2 x i64> @test_x86_aesni_aesimc(<2 x i64> %a0) {
+ ; CHECK: aesimc
+ %res = call <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64> %a0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64>) nounwind readnone
+
+
+define <2 x i64> @test_x86_aesni_aeskeygenassist(<2 x i64> %a0) {
+ ; CHECK: aeskeygenassist
+ %res = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone
diff --git a/test/CodeGen/X86/alias-error.ll b/test/CodeGen/X86/alias-error.ll
new file mode 100644
index 0000000..8f01dcf
--- /dev/null
+++ b/test/CodeGen/X86/alias-error.ll
@@ -0,0 +1,5 @@
+; RUN: not llc -mtriple=i686-pc-linux-gnu %s -o /dev/null 2>&1 | FileCheck %s
+
+@a = external global i32
+@b = alias i32* @a
+; CHECK: b: Target doesn't support aliases to declarations
diff --git a/test/CodeGen/X86/aliases.ll b/test/CodeGen/X86/aliases.ll
index f920279..d0a262d 100644
--- a/test/CodeGen/X86/aliases.ll
+++ b/test/CodeGen/X86/aliases.ll
@@ -1,26 +1,38 @@
-; RUN: llc < %s -mtriple=i686-pc-linux-gnu -asm-verbose=false -o %t
-; RUN: grep globl %t | count 6
-; RUN: grep weak %t | count 1
-; RUN: grep hidden %t | count 1
-; RUN: grep protected %t | count 1
+; RUN: llc < %s -mtriple=i686-pc-linux-gnu -asm-verbose=false | FileCheck %s
-@bar = external global i32
+@bar = global i32 42
+
+; CHECK-DAG: .globl foo1
@foo1 = alias i32* @bar
+
+; CHECK-DAG: .globl foo2
@foo2 = alias i32* @bar
%FunTy = type i32()
-declare i32 @foo_f()
+define i32 @foo_f() {
+ ret i32 0
+}
+; CHECK-DAG: .weak bar_f
@bar_f = alias weak %FunTy* @foo_f
+@bar_l = alias linkonce_odr i32* @bar
+; CHECK-DAG: .weak bar_l
+
@bar_i = alias internal i32* @bar
+; CHECK-DAG: .globl A
@A = alias bitcast (i32* @bar to i64*)
+; CHECK-DAG: .globl bar_h
+; CHECK-DAG: .hidden bar_h
@bar_h = hidden alias i32* @bar
+; CHECK-DAG: .globl bar_p
+; CHECK-DAG: .protected bar_p
@bar_p = protected alias i32* @bar
+; CHECK-DAG: .globl test
define i32 @test() {
entry:
%tmp = load i32* @foo1
diff --git a/test/CodeGen/X86/alloca-align-rounding.ll b/test/CodeGen/X86/alloca-align-rounding.ll
index 3d76fb0..74b9470 100644
--- a/test/CodeGen/X86/alloca-align-rounding.ll
+++ b/test/CodeGen/X86/alloca-align-rounding.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mtriple=i686-pc-linux | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=i686-pc-linux -enable-misched=false | FileCheck %s
declare void @bar(<2 x i64>* %n)
diff --git a/test/CodeGen/X86/anyregcc-crash.ll b/test/CodeGen/X86/anyregcc-crash.ll
new file mode 100644
index 0000000..cf6f6ed
--- /dev/null
+++ b/test/CodeGen/X86/anyregcc-crash.ll
@@ -0,0 +1,17 @@
+; RUN: not llc < %s -mtriple=x86_64-apple-darwin 2>&1 | FileCheck %s
+;
+; Check that misuse of anyregcc results in a compile-time error.
+
+; CHECK: LLVM ERROR: ran out of registers during register allocation
+define i64 @anyreglimit(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6,
+ i64 %v7, i64 %v8, i64 %v9, i64 %v10, i64 %v11, i64 %v12,
+ i64 %v13, i64 %v14, i64 %v15, i64 %v16) {
+entry:
+ %result = tail call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 16,
+ i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6,
+ i64 %v7, i64 %v8, i64 %v9, i64 %v10, i64 %v11, i64 %v12,
+ i64 %v13, i64 %v14, i64 %v15, i64 %v16)
+ ret i64 %result
+}
+
+declare i64 @llvm.experimental.patchpoint.i64(i32, i32, i8*, i32, ...)
diff --git a/test/CodeGen/X86/anyregcc.ll b/test/CodeGen/X86/anyregcc.ll
new file mode 100644
index 0000000..8109f87
--- /dev/null
+++ b/test/CodeGen/X86/anyregcc.ll
@@ -0,0 +1,348 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+
+; Stackmap Header: no constants - 8 callsites
+; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps
+; CHECK-NEXT: __LLVM_StackMaps:
+; Header
+; CHECK-NEXT: .long 0
+; Num Constants
+; CHECK-NEXT: .long 0
+; Num Callsites
+; CHECK-NEXT: .long 8
+
+; test
+; CHECK-NEXT: .long 0
+; CHECK-LABEL: .long L{{.*}}-_test
+; CHECK-NEXT: .short 0
+; 3 locations
+; CHECK-NEXT: .short 3
+; Loc 0: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 2: Constant 3
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 3
+define i64 @test() nounwind ssp uwtable {
+entry:
+ call anyregcc void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 0, i32 15, i8* null, i32 2, i32 1, i32 2, i64 3)
+ ret i64 0
+}
+
+; property access 1 - %obj is an anyreg call argument and should therefore be in a register
+; CHECK-NEXT: .long 1
+; CHECK-LABEL: .long L{{.*}}-_property_access1
+; CHECK-NEXT: .short 0
+; 2 locations
+; CHECK-NEXT: .short 2
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define i64 @property_access1(i8* %obj) nounwind ssp uwtable {
+entry:
+ %f = inttoptr i64 12297829382473034410 to i8*
+ %ret = call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 1, i32 15, i8* %f, i32 1, i8* %obj)
+ ret i64 %ret
+}
+
+; property access 2 - %obj is an anyreg call argument and should therefore be in a register
+; CHECK-NEXT: .long 2
+; CHECK-LABEL: .long L{{.*}}-_property_access2
+; CHECK-NEXT: .short 0
+; 2 locations
+; CHECK-NEXT: .short 2
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define i64 @property_access2() nounwind ssp uwtable {
+entry:
+ %obj = alloca i64, align 8
+ %f = inttoptr i64 12297829382473034410 to i8*
+ %ret = call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 2, i32 15, i8* %f, i32 1, i64* %obj)
+ ret i64 %ret
+}
+
+; property access 3 - %obj is a frame index
+; CHECK-NEXT: .long 3
+; CHECK-LABEL: .long L{{.*}}-_property_access3
+; CHECK-NEXT: .short 0
+; 2 locations
+; CHECK-NEXT: .short 2
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Register <-- this will be folded once folding for FI is implemented
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define i64 @property_access3() nounwind ssp uwtable {
+entry:
+ %obj = alloca i64, align 8
+ %f = inttoptr i64 12297829382473034410 to i8*
+ %ret = call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 3, i32 15, i8* %f, i32 0, i64* %obj)
+ ret i64 %ret
+}
+
+; anyreg_test1
+; CHECK-NEXT: .long 4
+; CHECK-LABEL: .long L{{.*}}-_anyreg_test1
+; CHECK-NEXT: .short 0
+; 14 locations
+; CHECK-NEXT: .short 14
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 2: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 3: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 4: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 5: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 6: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 7: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 8: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 9: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 10: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 11: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 12: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 13: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define i64 @anyreg_test1(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
+entry:
+ %f = inttoptr i64 12297829382473034410 to i8*
+ %ret = call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 4, i32 15, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+ ret i64 %ret
+}
+
+; anyreg_test2
+; CHECK-NEXT: .long 5
+; CHECK-LABEL: .long L{{.*}}-_anyreg_test2
+; CHECK-NEXT: .short 0
+; 14 locations
+; CHECK-NEXT: .short 14
+; Loc 0: Register <-- this is the return register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 2: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 3: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 4: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 5: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 6: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 7: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 8: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 9: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 10: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 11: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 12: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 13: Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
+entry:
+ %f = inttoptr i64 12297829382473034410 to i8*
+ %ret = call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 5, i32 15, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+ ret i64 %ret
+}
+
+; Test spilling the return value of an anyregcc call.
+;
+; <rdar://problem/15432754> [JS] Assertion: "Folded a def to a non-store!"
+;
+; CHECK-LABEL: .long 12
+; CHECK-LABEL: .long L{{.*}}-_patchpoint_spilldef
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 3
+; Loc 0: Register (some register that will be spilled to the stack)
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Register RDI
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 5
+; CHECK-NEXT: .long 0
+; Loc 2: Register RSI
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 4
+; CHECK-NEXT: .long 0
+define i64 @patchpoint_spilldef(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+ %result = tail call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
+ tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
+ ret i64 %result
+}
+
+; Test spilling the arguments of an anyregcc call.
+;
+; <rdar://problem/15487687> [JS] AnyRegCC argument ends up being spilled
+;
+; CHECK-LABEL: .long 13
+; CHECK-LABEL: .long L{{.*}}-_patchpoint_spillargs
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 5
+; Loc 0: Return value in a register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 1: Arg0 in a Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 2: Arg1 in a Register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; Loc 3: Arg2 spilled to RBP + offset
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 7
+; CHECK-NEXT: .long {{[0-9]+}}
+; Loc 4: Arg3 spilled to RBP + offset
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 7
+; CHECK-NEXT: .long {{[0-9]+}}
+define i64 @patchpoint_spillargs(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+ tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
+ %result = tail call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 13, i32 15, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+ ret i64 %result
+}
+
+declare void @llvm.experimental.patchpoint.void(i32, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i32, i32, i8*, i32, ...)
diff --git a/test/CodeGen/X86/atom-call-reg-indirect.ll b/test/CodeGen/X86/atom-call-reg-indirect.ll
index 933b98b..48f2d4c 100644
--- a/test/CodeGen/X86/atom-call-reg-indirect.ll
+++ b/test/CodeGen/X86/atom-call-reg-indirect.ll
@@ -2,6 +2,8 @@
; RUN: llc < %s -mcpu=core2 -mtriple=i686-linux | FileCheck -check-prefix=ATOM-NOT32 %s
; RUN: llc < %s -mcpu=atom -mtriple=x86_64-linux | FileCheck -check-prefix=ATOM64 %s
; RUN: llc < %s -mcpu=core2 -mtriple=x86_64-linux | FileCheck -check-prefix=ATOM-NOT64 %s
+; RUN: llc < %s -mcpu=slm -mtriple=i686-linux | FileCheck -check-prefix=SLM32 %s
+; RUN: llc < %s -mcpu=slm -mtriple=x86_64-linux | FileCheck -check-prefix=SLM64 %s
; fn_ptr.ll
@@ -20,6 +22,10 @@ entry:
;ATOM64: movq (%rcx), %rcx
;ATOM64: callq *%rcx
;ATOM-NOT64: callq *(%rcx)
+ ;SLM32: movl (%ecx), %ecx
+ ;SLM32: calll *%ecx
+ ;SLM64: movq (%rcx), %rcx
+ ;SLM64: callq *%rcx
tail call void %1(%class.A* %call)
ret i32 0
}
@@ -40,6 +46,10 @@ entry:
;ATOM64: movq (%rax), %rax
;ATOM64: callq *%rax
;ATOM-NOT64: callq *(%rax)
+ ;SLM32: movl (%eax), %eax
+ ;SLM32: calll *%eax
+ ;SLM64: movq (%rax), %rax
+ ;SLM64: callq *%rax
tail call void %1(i32 2)
ret i32 0
}
diff --git a/test/CodeGen/X86/atom-lea-addw-bug.ll b/test/CodeGen/X86/atom-lea-addw-bug.ll
new file mode 100644
index 0000000..5cda2df
--- /dev/null
+++ b/test/CodeGen/X86/atom-lea-addw-bug.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mcpu=atom | FileCheck %s
+
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+target triple = "x86_64-apple-darwin12.5.0"
+
+define i32 @DoLayout() {
+entry:
+ %tmp1 = load i16* undef, align 2
+ %tmp17 = load i16* null, align 2
+ %tmp19 = load i16* undef, align 2
+ %shl = shl i16 %tmp19, 1
+ %add55 = add i16 %tmp17, %tmp1
+ %add57 = add i16 %add55, %shl
+ %conv60 = zext i16 %add57 to i32
+ %add61 = add nsw i32 %conv60, 0
+ %conv63 = and i32 %add61, 65535
+ ret i32 %conv63
+; CHECK: addw
+}
diff --git a/test/CodeGen/X86/atom-sched.ll b/test/CodeGen/X86/atom-sched.ll
index 0d97e85..fd18472 100644
--- a/test/CodeGen/X86/atom-sched.ll
+++ b/test/CodeGen/X86/atom-sched.ll
@@ -1,4 +1,5 @@
; RUN: llc <%s -O2 -mcpu=atom -march=x86 -relocation-model=static | FileCheck -check-prefix=atom %s
+; RUN: llc <%s -O2 -mcpu=slm -march=x86 -relocation-model=static | FileCheck -check-prefix=slm %s
; RUN: llc <%s -O2 -mcpu=core2 -march=x86 -relocation-model=static | FileCheck %s
;
@@ -13,6 +14,9 @@ define void @func() nounwind uwtable {
; atom: imull
; atom-NOT: movl
; atom: imull
+; slm: imull
+; slm-NOT: movl
+; slm: imull
; CHECK: imull
; CHECK: movl
; CHECK: imull
diff --git a/test/CodeGen/X86/atomic-dagsched.ll b/test/CodeGen/X86/atomic-dagsched.ll
index 05e630b..aa05757 100644
--- a/test/CodeGen/X86/atomic-dagsched.ll
+++ b/test/CodeGen/X86/atomic-dagsched.ll
@@ -34,8 +34,8 @@ dim_0_vector_pre_head.i: ; preds = %loop
vector_kernel_entry.i: ; preds = %vector_kernel_entry.i, %dim_0_vector_pre_head.i
%asr.iv9 = phi i8* [ %scevgep10, %vector_kernel_entry.i ], [ %asr.iv6, %dim_0_vector_pre_head.i ]
%asr.iv = phi i64 [ %asr.iv.next, %vector_kernel_entry.i ], [ %vector.size.i, %dim_0_vector_pre_head.i ]
- %8 = bitcast i8* %ptrtoarg4 to i32 addrspace(1)*
- %asr.iv911 = bitcast i8* %asr.iv9 to <8 x i32> addrspace(1)*
+ %8 = addrspacecast i8* %ptrtoarg4 to i32 addrspace(1)*
+ %asr.iv911 = addrspacecast i8* %asr.iv9 to <8 x i32> addrspace(1)*
%9 = load <8 x i32> addrspace(1)* %asr.iv911, align 4
%extract8vector_func.i = extractelement <8 x i32> %9, i32 0
%extract9vector_func.i = extractelement <8 x i32> %9, i32 1
@@ -73,8 +73,8 @@ dim_0_pre_head.i: ; preds = %scalarIf.i
scalar_kernel_entry.i: ; preds = %scalar_kernel_entry.i, %dim_0_pre_head.i
%asr.iv12 = phi i64 [ %asr.iv.next13, %scalar_kernel_entry.i ], [ %22, %dim_0_pre_head.i ]
- %23 = bitcast i8* %asr.iv6 to i32 addrspace(1)*
- %24 = bitcast i8* %ptrtoarg4 to i32 addrspace(1)*
+ %23 = addrspacecast i8* %asr.iv6 to i32 addrspace(1)*
+ %24 = addrspacecast i8* %ptrtoarg4 to i32 addrspace(1)*
%scevgep16 = getelementptr i32 addrspace(1)* %23, i64 %asr.iv12
%25 = load i32 addrspace(1)* %scevgep16, align 4
%26 = atomicrmw min i32 addrspace(1)* %24, i32 %25 seq_cst
diff --git a/test/CodeGen/X86/avx-arith.ll b/test/CodeGen/X86/avx-arith.ll
index 4aa3370..a9da1ec 100644
--- a/test/CodeGen/X86/avx-arith.ll
+++ b/test/CodeGen/X86/avx-arith.ll
@@ -240,15 +240,15 @@ define <16 x i16> @vpmullw(<16 x i16> %i, <16 x i16> %j) nounwind readnone {
; CHECK-NEXT: vpmuludq %xmm
; CHECK-NEXT: vpsllq $32, %xmm
; CHECK-NEXT: vpaddq %xmm
-; CHECK-NEXT: vpmuludq %xmm
; CHECK-NEXT: vpsrlq $32, %xmm
; CHECK-NEXT: vpmuludq %xmm
; CHECK-NEXT: vpsllq $32, %xmm
+; CHECK-NEXT: vpaddq %xmm
+; CHECK-NEXT: vpmuludq %xmm
; CHECK-NEXT: vpsrlq $32, %xmm
; CHECK-NEXT: vpmuludq %xmm
; CHECK-NEXT: vpsllq $32, %xmm
; CHECK-NEXT: vpaddq %xmm
-; CHECK-NEXT: vpaddq %xmm
; CHECK-NEXT: vpsrlq $32, %xmm
; CHECK-NEXT: vpmuludq %xmm
; CHECK-NEXT: vpsllq $32, %xmm
@@ -269,4 +269,3 @@ define <4 x float> @int_sqrt_ss() {
%x2 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %x1) nounwind
ret <4 x float> %x2
}
-
diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll
index 64c4627..1fd9085 100644
--- a/test/CodeGen/X86/avx-basic.ll
+++ b/test/CodeGen/X86/avx-basic.ll
@@ -122,10 +122,10 @@ define <16 x i16> @build_vec_16x16(i16 %a) nounwind readonly {
ret <16 x i16> %res
}
-;;; Check that VMOVPQIto64rr generates the assembly string "vmovd". Previously
+;;; Check that VMOVPQIto64rr generates the assembly string "vmovq". Previously
;;; an incorrect mnemonic of "movd" was printed for this instruction.
; CHECK: VMOVPQIto64rr
-; CHECK: vmovd
+; CHECK: vmovq
define i64 @VMOVPQIto64rr(<2 x i64> %a) {
entry:
%vecext.i = extractelement <2 x i64> %a, i32 0
diff --git a/test/CodeGen/X86/avx-bitcast.ll b/test/CodeGen/X86/avx-bitcast.ll
index ecc71be..c9d828c 100644
--- a/test/CodeGen/X86/avx-bitcast.ll
+++ b/test/CodeGen/X86/avx-bitcast.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -O0 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
; CHECK: vmovsd (%
-; CHECK-NEXT: vmovd %xmm
+; CHECK-NEXT: vmovq %xmm
define i64 @bitcasti64tof64() {
%a = load double* undef
%b = bitcast double %a to i64
diff --git a/test/CodeGen/X86/avx-intel-ocl.ll b/test/CodeGen/X86/avx-intel-ocl.ll
index 0550720..7337815 100644
--- a/test/CodeGen/X86/avx-intel-ocl.ll
+++ b/test/CodeGen/X86/avx-intel-ocl.ll
@@ -32,7 +32,7 @@ declare i32 @func_int(i32, i32)
define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
%y = alloca <16 x float>, align 16
%x = fadd <16 x float> %a, %b
- %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
+ %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
%2 = load <16 x float>* %y, align 16
%3 = fadd <16 x float> %2, %1
ret <16 x float> %3
@@ -43,21 +43,21 @@ define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
; preserved ymm6-ymm15
; WIN64: testf16_regs
; WIN64: call
-; WIN64: vaddps {{%ymm[6-7]}}, %ymm0, %ymm0
-; WIN64: vaddps {{%ymm[6-7]}}, %ymm1, %ymm1
+; WIN64: vaddps {{%ymm[6-7]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
+; WIN64: vaddps {{%ymm[6-7]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
; WIN64: ret
; preserved ymm8-ymm15
; X64: testf16_regs
; X64: call
-; X64: vaddps {{%ymm[8-9]}}, %ymm0, %ymm0
-; X64: vaddps {{%ymm[8-9]}}, %ymm1, %ymm1
+; X64: vaddps {{%ymm[8-9]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
+; X64: vaddps {{%ymm[8-9]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
; X64: ret
define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
%y = alloca <16 x float>, align 16
%x = fadd <16 x float> %a, %b
- %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
+ %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
%2 = load <16 x float>* %y, align 16
%3 = fadd <16 x float> %1, %b
%4 = fadd <16 x float> %2, %3
@@ -166,4 +166,3 @@ entry:
%8 = shufflevector <8 x float> %3, <8 x float> %7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
ret <8 x float> %8
}
-
diff --git a/test/CodeGen/X86/avx-sext.ll b/test/CodeGen/X86/avx-sext.ll
index b9c7000..fb2287f 100644
--- a/test/CodeGen/X86/avx-sext.ll
+++ b/test/CodeGen/X86/avx-sext.ll
@@ -154,6 +154,17 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
ret <4 x i64> %extmask
}
+; AVX-LABEL: sext_16i8_to_16i16
+; AVX: vpmovsxbw
+; AVX: vmovhlps
+; AVX: vpmovsxbw
+; AVX: ret
+define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) {
+ %X = load <16 x i8>* %ptr
+ %Y = sext <16 x i8> %X to <16 x i16>
+ ret <16 x i16> %Y
+}
+
; AVX: sext_4i8_to_4i64
; AVX: vpslld $24
; AVX: vpsrad $24
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index a625601..0956361 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -81,7 +81,7 @@ entry:
define i32 @test9(<4 x i32> %a) nounwind {
; CHECK: test9
; CHECK: vpextrd
- %b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 undef, i32 4>
+ %b = shufflevector <4 x i32> %a, <4 x i32> undef, <8 x i32> <i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 undef, i32 4>
%r = extractelement <8 x i32> %b, i32 2
; CHECK: ret
ret i32 %r
@@ -251,8 +251,8 @@ define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind {
; CHECK: swap8doubles
; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
-; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
-; CHECK: vmovups {{[0-9]*}}(%rdi), %xmm{{[0-9]+}}
+; CHECK: vinsertf128 $1, {{[0-9]*}}(%rdi), %ymm{{[0-9]+}}
+; CHECK: vinsertf128 $1, {{[0-9]*}}(%rdi), %ymm{{[0-9]+}}
; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}}
; CHECK: vmovaps {{[0-9]*}}(%rsi), %ymm{{[0-9]+}}
; CHECK: vmovaps %xmm{{[0-9]+}}, {{[0-9]*}}(%rdi)
diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll
index 5c01c2c..5d07815 100644
--- a/test/CodeGen/X86/avx-splat.ll
+++ b/test/CodeGen/X86/avx-splat.ll
@@ -20,7 +20,7 @@ entry:
ret <16 x i16> %shuffle
}
-; CHECK: vmovd
+; CHECK: vmovq
; CHECK-NEXT: vmovlhps %xmm
; CHECK-NEXT: vinsertf128 $1
define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
diff --git a/test/CodeGen/X86/avx-trunc.ll b/test/CodeGen/X86/avx-trunc.ll
index d007736..58d0a35 100644
--- a/test/CodeGen/X86/avx-trunc.ll
+++ b/test/CodeGen/X86/avx-trunc.ll
@@ -12,4 +12,9 @@ define <8 x i16> @trunc_32_16(<8 x i32> %A) nounwind uwtable readnone ssp{
%B = trunc <8 x i32> %A to <8 x i16>
ret <8 x i16>%B
}
-
+define <16 x i8> @trunc_16_8(<16 x i16> %A) nounwind uwtable readnone ssp{
+; CHECK-LABEL: trunc_16_8
+; CHECK: pshufb
+ %B = trunc <16 x i16> %A to <16 x i8>
+ ret <16 x i8> %B
+}
diff --git a/test/CodeGen/X86/avx-zext.ll b/test/CodeGen/X86/avx-zext.ll
index e2b6c55..7511746 100644
--- a/test/CodeGen/X86/avx-zext.ll
+++ b/test/CodeGen/X86/avx-zext.ll
@@ -27,3 +27,15 @@ define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
%t = zext <8 x i8> %z to <8 x i32>
ret <8 x i32> %t
}
+
+; PR17654
+define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %z) {
+; CHECK-LABEL: zext_16i8_to_16i16:
+; CHECK: vpxor
+; CHECK: vpunpckhbw
+; CHECK: vpunpcklbw
+; CHECK: vinsertf128
+; CHECK: ret
+ %t = zext <16 x i8> %z to <16 x i16>
+ ret <16 x i16> %t
+}
diff --git a/test/CodeGen/X86/avx2-arith.ll b/test/CodeGen/X86/avx2-arith.ll
index 997fa19..72bdd9d 100644
--- a/test/CodeGen/X86/avx2-arith.ll
+++ b/test/CodeGen/X86/avx2-arith.ll
@@ -148,3 +148,21 @@ define <8 x i32> @mul_const9(<8 x i32> %x) {
%y = mul <8 x i32> %x, <i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i32> %y
}
+
+; CHECK: mul_const10
+; CHECK: vpmulld
+; CHECK: ret
+define <4 x i32> @mul_const10(<4 x i32> %x) {
+ ; %x * 0x01010101
+ %m = mul <4 x i32> %x, <i32 16843009, i32 16843009, i32 16843009, i32 16843009>
+ ret <4 x i32> %m
+}
+
+; CHECK: mul_const11
+; CHECK: vpmulld
+; CHECK: ret
+define <4 x i32> @mul_const11(<4 x i32> %x) {
+ ; %x * 0x80808080
+ %m = mul <4 x i32> %x, <i32 2155905152, i32 2155905152, i32 2155905152, i32 2155905152>
+ ret <4 x i32> %m
+}
diff --git a/test/CodeGen/X86/avx2-conversions.ll b/test/CodeGen/X86/avx2-conversions.ll
index 3ce08dc..f49718e 100644
--- a/test/CodeGen/X86/avx2-conversions.ll
+++ b/test/CodeGen/X86/avx2-conversions.ll
@@ -63,6 +63,34 @@ define <8 x i32> @zext_8i8_8i32(<8 x i8> %A) nounwind {
ret <8 x i32>%B
}
+; CHECK-LABEL: zext_16i8_16i16:
+; CHECK: vpmovzxbw
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <16 x i16> @zext_16i8_16i16(<16 x i8> %z) {
+ %t = zext <16 x i8> %z to <16 x i16>
+ ret <16 x i16> %t
+}
+
+; CHECK-LABEL: sext_16i8_16i16:
+; CHECK: vpmovsxbw
+; CHECK-NOT: vinsert
+; CHECK: ret
+define <16 x i16> @sext_16i8_16i16(<16 x i8> %z) {
+ %t = sext <16 x i8> %z to <16 x i16>
+ ret <16 x i16> %t
+}
+
+; CHECK-LABEL: trunc_16i16_16i8:
+; CHECK: vpshufb
+; CHECK: vpshufb
+; CHECK: vpor
+; CHECK: ret
+define <16 x i8> @trunc_16i16_16i8(<16 x i16> %z) {
+ %t = trunc <16 x i16> %z to <16 x i8>
+ ret <16 x i8> %t
+}
+
; CHECK: load_sext_test1
; CHECK: vpmovsxdq (%r{{[^,]*}}), %ymm{{.*}}
; CHECK: ret
diff --git a/test/CodeGen/X86/avx2-palignr.ll b/test/CodeGen/X86/avx2-palignr.ll
index 176e02c..83573dc 100644
--- a/test/CodeGen/X86/avx2-palignr.ll
+++ b/test/CodeGen/X86/avx2-palignr.ll
@@ -51,7 +51,7 @@ define <16 x i16> @test7(<16 x i16> %A, <16 x i16> %B) nounwind {
define <32 x i8> @test8(<32 x i8> %A, <32 x i8> %B) nounwind {
; CHECK-LABEL: test8:
-; CHECK: palignr $5
+; CHECK: vpalignr $5
%C = shufflevector <32 x i8> %A, <32 x i8> %B, <32 x i32> <i32 5, i32 6, i32 7, i32 undef, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52>
ret <32 x i8> %C
}
diff --git a/test/CodeGen/X86/avx2-vector-shifts.ll b/test/CodeGen/X86/avx2-vector-shifts.ll
index a978d93..5592e6c 100644
--- a/test/CodeGen/X86/avx2-vector-shifts.ll
+++ b/test/CodeGen/X86/avx2-vector-shifts.ll
@@ -121,7 +121,7 @@ entry:
}
; CHECK-LABEL: test_sraw_3:
-; CHECK: vpsraw $16, %ymm0, %ymm0
+; CHECK: vpsraw $15, %ymm0, %ymm0
; CHECK: ret
define <8 x i32> @test_srad_1(<8 x i32> %InVec) {
@@ -151,7 +151,7 @@ entry:
}
; CHECK-LABEL: test_srad_3:
-; CHECK: vpsrad $32, %ymm0, %ymm0
+; CHECK: vpsrad $31, %ymm0, %ymm0
; CHECK: ret
; SSE Logical Shift Right
diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll
new file mode 100644
index 0000000..e27600e
--- /dev/null
+++ b/test/CodeGen/X86/avx512-arith.ll
@@ -0,0 +1,271 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK-LABEL: addpd512
+; CHECK: vaddpd
+; CHECK: ret
+define <8 x double> @addpd512(<8 x double> %y, <8 x double> %x) {
+entry:
+ %add.i = fadd <8 x double> %x, %y
+ ret <8 x double> %add.i
+}
+
+; CHECK-LABEL: addpd512fold
+; CHECK: vaddpd LCP{{.*}}(%rip)
+; CHECK: ret
+define <8 x double> @addpd512fold(<8 x double> %y) {
+entry:
+ %add.i = fadd <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.800000e+00, double 2.300000e+00, double 1.200000e+00>
+ ret <8 x double> %add.i
+}
+
+; CHECK-LABEL: addps512
+; CHECK: vaddps
+; CHECK: ret
+define <16 x float> @addps512(<16 x float> %y, <16 x float> %x) {
+entry:
+ %add.i = fadd <16 x float> %x, %y
+ ret <16 x float> %add.i
+}
+
+; CHECK-LABEL: addps512fold
+; CHECK: vaddps LCP{{.*}}(%rip)
+; CHECK: ret
+define <16 x float> @addps512fold(<16 x float> %y) {
+entry:
+ %add.i = fadd <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 4.500000e+00, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000>
+ ret <16 x float> %add.i
+}
+
+; CHECK-LABEL: subpd512
+; CHECK: vsubpd
+; CHECK: ret
+define <8 x double> @subpd512(<8 x double> %y, <8 x double> %x) {
+entry:
+ %sub.i = fsub <8 x double> %x, %y
+ ret <8 x double> %sub.i
+}
+
+; CHECK-LABEL: @subpd512fold
+; CHECK: vsubpd (%
+; CHECK: ret
+define <8 x double> @subpd512fold(<8 x double> %y, <8 x double>* %x) {
+entry:
+ %tmp2 = load <8 x double>* %x, align 8
+ %sub.i = fsub <8 x double> %y, %tmp2
+ ret <8 x double> %sub.i
+}
+
+; CHECK-LABEL: @subps512
+; CHECK: vsubps
+; CHECK: ret
+define <16 x float> @subps512(<16 x float> %y, <16 x float> %x) {
+entry:
+ %sub.i = fsub <16 x float> %x, %y
+ ret <16 x float> %sub.i
+}
+
+; CHECK-LABEL: subps512fold
+; CHECK: vsubps (%
+; CHECK: ret
+define <16 x float> @subps512fold(<16 x float> %y, <16 x float>* %x) {
+entry:
+ %tmp2 = load <16 x float>* %x, align 4
+ %sub.i = fsub <16 x float> %y, %tmp2
+ ret <16 x float> %sub.i
+}
+
+; CHECK-LABEL: imulq512
+; CHECK: vpmuludq
+; CHECK: vpmuludq
+; CHECK: ret
+define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) {
+ %z = mul <8 x i64>%x, %y
+ ret <8 x i64>%z
+}
+
+; CHECK-LABEL: mulpd512
+; CHECK: vmulpd
+; CHECK: ret
+define <8 x double> @mulpd512(<8 x double> %y, <8 x double> %x) {
+entry:
+ %mul.i = fmul <8 x double> %x, %y
+ ret <8 x double> %mul.i
+}
+
+; CHECK-LABEL: mulpd512fold
+; CHECK: vmulpd LCP{{.*}}(%rip)
+; CHECK: ret
+define <8 x double> @mulpd512fold(<8 x double> %y) {
+entry:
+ %mul.i = fmul <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
+ ret <8 x double> %mul.i
+}
+
+; CHECK-LABEL: mulps512
+; CHECK: vmulps
+; CHECK: ret
+define <16 x float> @mulps512(<16 x float> %y, <16 x float> %x) {
+entry:
+ %mul.i = fmul <16 x float> %x, %y
+ ret <16 x float> %mul.i
+}
+
+; CHECK-LABEL: mulps512fold
+; CHECK: vmulps LCP{{.*}}(%rip)
+; CHECK: ret
+define <16 x float> @mulps512fold(<16 x float> %y) {
+entry:
+ %mul.i = fmul <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000>
+ ret <16 x float> %mul.i
+}
+
+; CHECK-LABEL: divpd512
+; CHECK: vdivpd
+; CHECK: ret
+define <8 x double> @divpd512(<8 x double> %y, <8 x double> %x) {
+entry:
+ %div.i = fdiv <8 x double> %x, %y
+ ret <8 x double> %div.i
+}
+
+; CHECK-LABEL: divpd512fold
+; CHECK: vdivpd LCP{{.*}}(%rip)
+; CHECK: ret
+define <8 x double> @divpd512fold(<8 x double> %y) {
+entry:
+ %div.i = fdiv <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
+ ret <8 x double> %div.i
+}
+
+; CHECK-LABEL: divps512
+; CHECK: vdivps
+; CHECK: ret
+define <16 x float> @divps512(<16 x float> %y, <16 x float> %x) {
+entry:
+ %div.i = fdiv <16 x float> %x, %y
+ ret <16 x float> %div.i
+}
+
+; CHECK-LABEL: divps512fold
+; CHECK: vdivps LCP{{.*}}(%rip)
+; CHECK: ret
+define <16 x float> @divps512fold(<16 x float> %y) {
+entry:
+ %div.i = fdiv <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 4.500000e+00, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 4.500000e+00, float 0x4002666660000000, float 0x3FF3333340000000>
+ ret <16 x float> %div.i
+}
+
+; CHECK-LABEL: vpaddq_test
+; CHECK: vpaddq %zmm
+; CHECK: ret
+define <8 x i64> @vpaddq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
+ %x = add <8 x i64> %i, %j
+ ret <8 x i64> %x
+}
+
+; CHECK-LABEL: vpaddd_test
+; CHECK: vpaddd %zmm
+; CHECK: ret
+define <16 x i32> @vpaddd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
+ %x = add <16 x i32> %i, %j
+ ret <16 x i32> %x
+}
+
+; CHECK-LABEL: vpsubq_test
+; CHECK: vpsubq %zmm
+; CHECK: ret
+define <8 x i64> @vpsubq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
+ %x = sub <8 x i64> %i, %j
+ ret <8 x i64> %x
+}
+
+; CHECK-LABEL: vpsubd_test
+; CHECK: vpsubd
+; CHECK: ret
+define <16 x i32> @vpsubd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
+ %x = sub <16 x i32> %i, %j
+ ret <16 x i32> %x
+}
+
+; CHECK-LABEL: vpmulld_test
+; CHECK: vpmulld %zmm
+; CHECK: ret
+define <16 x i32> @vpmulld_test(<16 x i32> %i, <16 x i32> %j) {
+ %x = mul <16 x i32> %i, %j
+ ret <16 x i32> %x
+}
+
+; CHECK-LABEL: sqrtA
+; CHECK: vsqrtssz
+; CHECK: ret
+declare float @sqrtf(float) readnone
+define float @sqrtA(float %a) nounwind uwtable readnone ssp {
+entry:
+ %conv1 = tail call float @sqrtf(float %a) nounwind readnone
+ ret float %conv1
+}
+
+; CHECK-LABEL: sqrtB
+; CHECK: vsqrtsdz
+; CHECK: ret
+declare double @sqrt(double) readnone
+define double @sqrtB(double %a) nounwind uwtable readnone ssp {
+entry:
+ %call = tail call double @sqrt(double %a) nounwind readnone
+ ret double %call
+}
+
+; CHECK-LABEL: sqrtC
+; CHECK: vsqrtssz
+; CHECK: ret
+declare float @llvm.sqrt.f32(float)
+define float @sqrtC(float %a) nounwind {
+ %b = call float @llvm.sqrt.f32(float %a)
+ ret float %b
+}
+
+; CHECK-LABEL: fadd_broadcast
+; CHECK: LCP{{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK: ret
+define <16 x float> @fadd_broadcast(<16 x float> %a) nounwind {
+ %b = fadd <16 x float> %a, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: addq_broadcast
+; CHECK: vpaddq LCP{{.*}}(%rip){1to8}, %zmm0, %zmm0
+; CHECK: ret
+define <8 x i64> @addq_broadcast(<8 x i64> %a) nounwind {
+ %b = add <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+ ret <8 x i64> %b
+}
+
+; CHECK-LABEL: orq_broadcast
+; CHECK: vporq LCP{{.*}}(%rip){1to8}, %zmm0, %zmm0
+; CHECK: ret
+define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind {
+ %b = or <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+ ret <8 x i64> %b
+}
+
+; CHECK-LABEL: andd512fold
+; CHECK: vpandd (%
+; CHECK: ret
+define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) {
+entry:
+ %a = load <16 x i32>* %x, align 4
+ %b = and <16 x i32> %y, %a
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: andqbrst
+; CHECK: vpandq (%rdi){1to8}, %zmm
+; CHECK: ret
+define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) {
+entry:
+ %a = load i64* %ap, align 8
+ %b = insertelement <8 x i64> undef, i64 %a, i32 0
+ %c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
+ %d = and <8 x i64> %p1, %c
+ ret <8 x i64>%d
+}
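+
+; A hedged note on the {1to8} operand checked above (assuming AVX-512
+; embedded-broadcast semantics): the scalar loaded from (%rdi) is replicated
+; across all eight lanes before the vpandq, so the IR splat idiom
+;   %b = insertelement <8 x i64> undef, i64 %a, i32 0
+;   %c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
+; folds into a single broadcast memory operand instead of a separate load.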
diff --git a/test/CodeGen/X86/avx512-build-vector.ll b/test/CodeGen/X86/avx512-build-vector.ll
new file mode 100644
index 0000000..bc4560b
--- /dev/null
+++ b/test/CodeGen/X86/avx512-build-vector.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK-LABEL: test1
+; CHECK: vpxord
+; CHECK: ret
+define <16 x i32> @test1(i32* %x) {
+ %y = load i32* %x, align 4
+ %res = insertelement <16 x i32>zeroinitializer, i32 %y, i32 4
+ ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test2
+; CHECK: vpaddd LCP{{.*}}(%rip){1to16}
+; CHECK: ret
+define <16 x i32> @test2(<16 x i32> %x) {
+ %res = add <16 x i32><i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %x
+ ret <16 x i32>%res
+}
\ No newline at end of file
diff --git a/test/CodeGen/X86/avx512-cmp.ll b/test/CodeGen/X86/avx512-cmp.ll
new file mode 100644
index 0000000..ba52745
--- /dev/null
+++ b/test/CodeGen/X86/avx512-cmp.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK: vucomisdz
+define double @test1(double %a, double %b) nounwind {
+ %tobool = fcmp une double %a, %b
+ br i1 %tobool, label %l1, label %l2
+
+l1:
+ %c = fsub double %a, %b
+ ret double %c
+l2:
+ %c1 = fadd double %a, %b
+ ret double %c1
+}
+
+; CHECK: vucomissz
+define float @test2(float %a, float %b) nounwind {
+ %tobool = fcmp olt float %a, %b
+ br i1 %tobool, label %l1, label %l2
+
+l1:
+ %c = fsub float %a, %b
+ ret float %c
+l2:
+ %c1 = fadd float %a, %b
+ ret float %c1
+}
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
new file mode 100644
index 0000000..ed68ff7
--- /dev/null
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -0,0 +1,217 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK-LABEL: sitof32
+; CHECK: vcvtdq2ps %zmm
+; CHECK: ret
+define <16 x float> @sitof32(<16 x i32> %a) nounwind {
+ %b = sitofp <16 x i32> %a to <16 x float>
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: fptosi00
+; CHECK: vcvttps2dq %zmm
+; CHECK: ret
+define <16 x i32> @fptosi00(<16 x float> %a) nounwind {
+ %b = fptosi <16 x float> %a to <16 x i32>
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: fptoui00
+; CHECK: vcvttps2udq
+; CHECK: ret
+define <16 x i32> @fptoui00(<16 x float> %a) nounwind {
+ %b = fptoui <16 x float> %a to <16 x i32>
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: fptoui01
+; CHECK: vcvttpd2udq
+; CHECK: ret
+define <8 x i32> @fptoui01(<8 x double> %a) nounwind {
+ %b = fptoui <8 x double> %a to <8 x i32>
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: sitof64
+; CHECK: vcvtdq2pd %ymm
+; CHECK: ret
+define <8 x double> @sitof64(<8 x i32> %a) {
+ %b = sitofp <8 x i32> %a to <8 x double>
+ ret <8 x double> %b
+}
+
+; CHECK-LABEL: fptosi01
+; CHECK: vcvttpd2dq %zmm
+; CHECK: ret
+define <8 x i32> @fptosi01(<8 x double> %a) {
+ %b = fptosi <8 x double> %a to <8 x i32>
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: fptrunc00
+; CHECK: vcvtpd2ps %zmm
+; CHECK-NEXT: vcvtpd2ps %zmm
+; CHECK-NEXT: vinsertf64x4 $1
+; CHECK: ret
+define <16 x float> @fptrunc00(<16 x double> %b) nounwind {
+ %a = fptrunc <16 x double> %b to <16 x float>
+ ret <16 x float> %a
+}
+
+; CHECK-LABEL: fpext00
+; CHECK: vcvtps2pd %ymm0, %zmm0
+; CHECK: ret
+define <8 x double> @fpext00(<8 x float> %b) nounwind {
+ %a = fpext <8 x float> %b to <8 x double>
+ ret <8 x double> %a
+}
+
+; CHECK-LABEL: funcA
+; CHECK: vcvtsi2sdqz (%
+; CHECK: ret
+define double @funcA(i64* nocapture %e) {
+entry:
+ %tmp1 = load i64* %e, align 8
+ %conv = sitofp i64 %tmp1 to double
+ ret double %conv
+}
+
+; CHECK-LABEL: funcB
+; CHECK: vcvtsi2sdlz (%
+; CHECK: ret
+define double @funcB(i32* %e) {
+entry:
+ %tmp1 = load i32* %e, align 4
+ %conv = sitofp i32 %tmp1 to double
+ ret double %conv
+}
+
+; CHECK-LABEL: funcC
+; CHECK: vcvtsi2sslz (%
+; CHECK: ret
+define float @funcC(i32* %e) {
+entry:
+ %tmp1 = load i32* %e, align 4
+ %conv = sitofp i32 %tmp1 to float
+ ret float %conv
+}
+
+; CHECK-LABEL: i64tof32
+; CHECK: vcvtsi2ssqz (%
+; CHECK: ret
+define float @i64tof32(i64* %e) {
+entry:
+ %tmp1 = load i64* %e, align 8
+ %conv = sitofp i64 %tmp1 to float
+ ret float %conv
+}
+
+; CHECK-LABEL: fpext
+; CHECK: vcvtss2sdz
+; CHECK: ret
+define void @fpext() {
+entry:
+ %f = alloca float, align 4
+ %d = alloca double, align 8
+ %tmp = load float* %f, align 4
+ %conv = fpext float %tmp to double
+ store double %conv, double* %d, align 8
+ ret void
+}
+
+; CHECK-LABEL: fpround_scalar
+; CHECK: vmovsdz
+; CHECK: vcvtsd2ssz
+; CHECK: vmovssz
+; CHECK: ret
+define void @fpround_scalar() nounwind uwtable {
+entry:
+ %f = alloca float, align 4
+ %d = alloca double, align 8
+ %tmp = load double* %d, align 8
+ %conv = fptrunc double %tmp to float
+ store float %conv, float* %f, align 4
+ ret void
+}
+
+; CHECK-LABEL: long_to_double
+; CHECK: vmovqz
+; CHECK: ret
+define double @long_to_double(i64 %x) {
+ %res = bitcast i64 %x to double
+ ret double %res
+}
+
+; CHECK-LABEL: double_to_long
+; CHECK: vmovqz
+; CHECK: ret
+define i64 @double_to_long(double %x) {
+ %res = bitcast double %x to i64
+ ret i64 %res
+}
+
+; CHECK-LABEL: int_to_float
+; CHECK: vmovdz
+; CHECK: ret
+define float @int_to_float(i32 %x) {
+ %res = bitcast i32 %x to float
+ ret float %res
+}
+
+; CHECK-LABEL: float_to_int
+; CHECK: vmovdz
+; CHECK: ret
+define i32 @float_to_int(float %x) {
+ %res = bitcast float %x to i32
+ ret i32 %res
+}
+
+; CHECK-LABEL: uitof64
+; CHECK: vcvtudq2pd
+; CHECK: vextracti64x4
+; CHECK: vcvtudq2pd
+; CHECK: ret
+define <16 x double> @uitof64(<16 x i32> %a) nounwind {
+ %b = uitofp <16 x i32> %a to <16 x double>
+ ret <16 x double> %b
+}
+
+; CHECK-LABEL: uitof32
+; CHECK: vcvtudq2ps
+; CHECK: ret
+define <16 x float> @uitof32(<16 x i32> %a) nounwind {
+ %b = uitofp <16 x i32> %a to <16 x float>
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: @fptosi02
+; CHECK: vcvttss2siz
+; CHECK: ret
+define i32 @fptosi02(float %a) nounwind {
+ %b = fptosi float %a to i32
+ ret i32 %b
+}
+
+; CHECK-LABEL: @fptoui02
+; CHECK: vcvttss2usiz
+; CHECK: ret
+define i32 @fptoui02(float %a) nounwind {
+ %b = fptoui float %a to i32
+ ret i32 %b
+}
+
+; CHECK-LABEL: @uitofp02
+; CHECK: vcvtusi2ss
+; CHECK: ret
+define float @uitofp02(i32 %a) nounwind {
+ %b = uitofp i32 %a to float
+ ret float %b
+}
+
+; CHECK-LABEL: @uitofp03
+; CHECK: vcvtusi2sd
+; CHECK: ret
+define double @uitofp03(i32 %a) nounwind {
+ %b = uitofp i32 %a to double
+ ret double %b
+}
diff --git a/test/CodeGen/X86/avx512-fma-intrinsics.ll b/test/CodeGen/X86/avx512-fma-intrinsics.ll
new file mode 100644
index 0000000..ce3d759
--- /dev/null
+++ b/test/CodeGen/X86/avx512-fma-intrinsics.ll
@@ -0,0 +1,97 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+define <16 x float> @test_x86_vfmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+ ; CHECK-LABEL: test_x86_vfmadd_ps_z
+ ; CHECK: vfmadd213ps %zmm
+ %res = call <16 x float> @llvm.x86.fma.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.fma.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+
+define <8 x double> @test_x86_vfmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+ ; CHECK-LABEL: test_x86_vfmadd_pd_z
+ ; CHECK: vfmadd213pd %zmm
+ %res = call <8 x double> @llvm.x86.fma.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.fma.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+
+define <16 x float> @test_x86_vfmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+ ; CHECK-LABEL: test_x86_vfmsubps_z
+ ; CHECK: vfmsub213ps %zmm
+ %res = call <16 x float> @llvm.x86.fma.vfmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.fma.vfmsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+
+define <8 x double> @test_x86_vfmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+ ; CHECK-LABEL: test_x86_vfmsubpd_z
+ ; CHECK: vfmsub213pd %zmm
+ %res = call <8 x double> @llvm.x86.fma.vfmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.fma.vfmsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+
+define <16 x float> @test_x86_vfnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+ ; CHECK-LABEL: test_x86_vfnmadd_ps_z
+ ; CHECK: vfnmadd213ps %zmm
+ %res = call <16 x float> @llvm.x86.fma.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.fma.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+
+define <8 x double> @test_x86_vfnmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+ ; CHECK-LABEL: test_x86_vfnmadd_pd_z
+ ; CHECK: vfnmadd213pd %zmm
+ %res = call <8 x double> @llvm.x86.fma.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.fma.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+
+define <16 x float> @test_x86_vfnmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+ ; CHECK-LABEL: test_x86_vfnmsubps_z
+ ; CHECK: vfnmsub213ps %zmm
+ %res = call <16 x float> @llvm.x86.fma.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.fma.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+
+define <8 x double> @test_x86_vfnmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+ ; CHECK-LABEL: test_x86_vfnmsubpd_z
+ ; CHECK: vfnmsub213pd %zmm
+ %res = call <8 x double> @llvm.x86.fma.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.fma.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+
+define <16 x float> @test_x86_vfmaddsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+ ; CHECK-LABEL: test_x86_vfmaddsubps_z
+ ; CHECK: vfmaddsub213ps %zmm
+ %res = call <16 x float> @llvm.x86.fma.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.fma.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+
+define <8 x double> @test_x86_vfmaddsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+ ; CHECK-LABEL: test_x86_vfmaddsubpd_z
+ ; CHECK: vfmaddsub213pd %zmm
+ %res = call <8 x double> @llvm.x86.fma.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.fma.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
+
+define <16 x float> @test_x86_vfmsubaddps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+ ; CHECK-LABEL: test_x86_vfmsubaddps_z
+ ; CHECK: vfmsubadd213ps %zmm
+ %res = call <16 x float> @llvm.x86.fma.vfmsubadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) nounwind
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.fma.vfmsubadd.ps.512(<16 x float>, <16 x float>, <16 x float>) nounwind readnone
+
+define <8 x double> @test_x86_vfmsubaddpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+ ; CHECK-LABEL: test_x86_vfmsubaddpd_z
+ ; CHECK: vfmsubadd213pd %zmm
+ %res = call <8 x double> @llvm.x86.fma.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) nounwind
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.fma.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>) nounwind readnone
diff --git a/test/CodeGen/X86/avx512-fma.ll b/test/CodeGen/X86/avx512-fma.ll
new file mode 100644
index 0000000..d6926e2
--- /dev/null
+++ b/test/CodeGen/X86/avx512-fma.ll
@@ -0,0 +1,83 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -fp-contract=fast | FileCheck %s
+
+; CHECK-LABEL: test_x86_fmadd_ps_z
+; CHECK: vfmadd213ps %zmm2, %zmm1, %zmm0
+; CHECK: ret
+define <16 x float> @test_x86_fmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+ %x = fmul <16 x float> %a0, %a1
+ %res = fadd <16 x float> %x, %a2
+ ret <16 x float> %res
+}
+
+; CHECK-LABEL: test_x86_fmsub_ps_z
+; CHECK: vfmsub213ps %zmm2, %zmm1, %zmm0
+; CHECK: ret
+define <16 x float> @test_x86_fmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+ %x = fmul <16 x float> %a0, %a1
+ %res = fsub <16 x float> %x, %a2
+ ret <16 x float> %res
+}
+
+; CHECK-LABEL: test_x86_fnmadd_ps_z
+; CHECK: vfnmadd213ps %zmm2, %zmm1, %zmm0
+; CHECK: ret
+define <16 x float> @test_x86_fnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+ %x = fmul <16 x float> %a0, %a1
+ %res = fsub <16 x float> %a2, %x
+ ret <16 x float> %res
+}
+
+; CHECK-LABEL: test_x86_fnmsub_ps_z
+; CHECK: vfnmsub213ps %zmm2, %zmm1, %zmm0
+; CHECK: ret
+define <16 x float> @test_x86_fnmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+ %x = fmul <16 x float> %a0, %a1
+ %y = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00,
+ float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00,
+ float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00,
+ float -0.000000e+00>, %x
+ %res = fsub <16 x float> %y, %a2
+ ret <16 x float> %res
+}
+
+; CHECK-LABEL: test_x86_fmadd_pd_z
+; CHECK: vfmadd213pd %zmm2, %zmm1, %zmm0
+; CHECK: ret
+define <8 x double> @test_x86_fmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+ %x = fmul <8 x double> %a0, %a1
+ %res = fadd <8 x double> %x, %a2
+ ret <8 x double> %res
+}
+
+; CHECK-LABEL: test_x86_fmsub_pd_z
+; CHECK: vfmsub213pd %zmm2, %zmm1, %zmm0
+; CHECK: ret
+define <8 x double> @test_x86_fmsub_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+ %x = fmul <8 x double> %a0, %a1
+ %res = fsub <8 x double> %x, %a2
+ ret <8 x double> %res
+}
+
+define double @test_x86_fmsub_sd_z(double %a0, double %a1, double %a2) {
+ %x = fmul double %a0, %a1
+ %res = fsub double %x, %a2
+ ret double %res
+}
+
+;CHECK-LABEL: test132_br
+;CHECK: vfmadd132ps LCP{{.*}}(%rip){1to16}
+;CHECK: ret
+define <16 x float> @test132_br(<16 x float> %a1, <16 x float> %a2) nounwind {
+ %b1 = fmul <16 x float> %a1, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
+ %b2 = fadd <16 x float> %b1, %a2
+ ret <16 x float> %b2
+}
+
+;CHECK-LABEL: test213_br
+;CHECK: vfmadd213ps LCP{{.*}}(%rip){1to16}
+;CHECK: ret
+define <16 x float> @test213_br(<16 x float> %a1, <16 x float> %a2) nounwind {
+ %b1 = fmul <16 x float> %a1, %a2
+ %b2 = fadd <16 x float> %b1, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
+ ret <16 x float> %b2
+}
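+
+; A hedged reading of the two broadcast forms checked above (assuming the
+; usual FMA 132/213 operand numbering, with the memory/broadcast operand
+; written first in AT&T syntax):
+;   vfmadd132ps mem{1to16}, %zmm1, %zmm0  ; zmm0 = zmm0 * mem + zmm1  (constant is a factor)
+;   vfmadd213ps mem{1to16}, %zmm1, %zmm0  ; zmm0 = zmm1 * zmm0 + mem  (constant is the addend)
+; which is why test132_br and test213_br select different forms for the same
+; broadcast constant.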
diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
new file mode 100644
index 0000000..0321e95
--- /dev/null
+++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -0,0 +1,225 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+declare <16 x float> @llvm.x86.avx512.gather.dps.mask.512 (<16 x float>, i16, <16 x i32>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.dps.mask.512 (i8*, i16, <16 x i32>, <16 x float>, i32)
+declare <8 x double> @llvm.x86.avx512.gather.dpd.mask.512 (<8 x double>, i8, <8 x i32>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.dpd.mask.512 (i8*, i8, <8 x i32>, <8 x double>, i32)
+
+declare <8 x float> @llvm.x86.avx512.gather.qps.mask.512 (<8 x float>, i8, <8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qps.mask.512 (i8*, i8, <8 x i64>, <8 x float>, i32)
+declare <8 x double> @llvm.x86.avx512.gather.qpd.mask.512 (<8 x double>, i8, <8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qpd.mask.512 (i8*, i8, <8 x i64>, <8 x double>, i32)
+
+;CHECK-LABEL: gather_mask_dps
+;CHECK: kmovw
+;CHECK: vgatherdps
+;CHECK: vpadd
+;CHECK: vscatterdps
+;CHECK: ret
+define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) {
+ %x = call <16 x float> @llvm.x86.avx512.gather.dps.mask.512 (<16 x float> %src, i16 %mask, <16 x i32>%ind, i8* %base, i32 4)
+ %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ call void @llvm.x86.avx512.scatter.dps.mask.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4)
+ ret void
+}
+
+;CHECK-LABEL: gather_mask_dpd
+;CHECK: kmovw
+;CHECK: vgatherdpd
+;CHECK: vpadd
+;CHECK: vscatterdpd
+;CHECK: ret
+define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
+ %x = call <8 x double> @llvm.x86.avx512.gather.dpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i32>%ind, i8* %base, i32 4)
+ %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ call void @llvm.x86.avx512.scatter.dpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4)
+ ret void
+}
+
+;CHECK-LABEL: gather_mask_qps
+;CHECK: kmovw
+;CHECK: vgatherqps
+;CHECK: vpadd
+;CHECK: vscatterqps
+;CHECK: ret
+define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf) {
+ %x = call <8 x float> @llvm.x86.avx512.gather.qps.mask.512 (<8 x float> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+ call void @llvm.x86.avx512.scatter.qps.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4)
+ ret void
+}
+
+;CHECK-LABEL: gather_mask_qpd
+;CHECK: kmovw
+;CHECK: vgatherqpd
+;CHECK: vpadd
+;CHECK: vscatterqpd
+;CHECK: ret
+define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
+ %x = call <8 x double> @llvm.x86.avx512.gather.qpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+ call void @llvm.x86.avx512.scatter.qpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4)
+ ret void
+}
+;;
+;; Integer Gather/Scatter
+;;
+declare <16 x i32> @llvm.x86.avx512.gather.dpi.mask.512 (<16 x i32>, i16, <16 x i32>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.dpi.mask.512 (i8*, i16, <16 x i32>, <16 x i32>, i32)
+declare <8 x i64> @llvm.x86.avx512.gather.dpq.mask.512 (<8 x i64>, i8, <8 x i32>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.dpq.mask.512 (i8*, i8, <8 x i32>, <8 x i64>, i32)
+
+declare <8 x i32> @llvm.x86.avx512.gather.qpi.mask.512 (<8 x i32>, i8, <8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qpi.mask.512 (i8*, i8, <8 x i64>, <8 x i32>, i32)
+declare <8 x i64> @llvm.x86.avx512.gather.qpq.mask.512 (<8 x i64>, i8, <8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qpq.mask.512 (i8*, i8, <8 x i64>, <8 x i64>, i32)
+
+;CHECK-LABEL: gather_mask_dd
+;CHECK: kmovw
+;CHECK: vpgatherdd
+;CHECK: vpadd
+;CHECK: vpscatterdd
+;CHECK: ret
+define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) {
+ %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.mask.512 (<16 x i32> %src, i16 %mask, <16 x i32>%ind, i8* %base, i32 4)
+ %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ call void @llvm.x86.avx512.scatter.dpi.mask.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4)
+ ret void
+}
+
+;CHECK-LABEL: gather_mask_qd
+;CHECK: kmovw
+;CHECK: vpgatherqd
+;CHECK: vpadd
+;CHECK: vpscatterqd
+;CHECK: ret
+define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf) {
+ %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.mask.512 (<8 x i32> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+ call void @llvm.x86.avx512.scatter.qpi.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4)
+ ret void
+}
+
+;CHECK-LABEL: gather_mask_qq
+;CHECK: kmovw
+;CHECK: vpgatherqq
+;CHECK: vpadd
+;CHECK: vpscatterqq
+;CHECK: ret
+define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
+ %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.mask.512 (<8 x i64> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+ call void @llvm.x86.avx512.scatter.qpq.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4)
+ ret void
+}
+
+;CHECK-LABEL: gather_mask_dq
+;CHECK: kmovw
+;CHECK: vpgatherdq
+;CHECK: vpadd
+;CHECK: vpscatterdq
+;CHECK: ret
+define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
+ %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.mask.512 (<8 x i64> %src, i8 %mask, <8 x i32>%ind, i8* %base, i32 4)
+ %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ call void @llvm.x86.avx512.scatter.dpq.mask.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4)
+ ret void
+}
+
+;; FP Intrinsics without masks
+
+declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x i32>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.dps.512 (i8*, <16 x i32>, <16 x float>, i32)
+declare <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qps.512 (i8*, <8 x i64>, <8 x float>, i32)
+declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, <8 x i64>, <8 x double>, i32)
+
+;CHECK-LABEL: gather_dps
+;CHECK: kxnorw
+;CHECK: vgatherdps
+;CHECK: vscatterdps
+;CHECK: ret
+define void @gather_dps(<16 x i32> %ind, i8* %base, i8* %stbuf) {
+ %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x i32>%ind, i8* %base, i32 4)
+ %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, <16 x i32>%ind2, <16 x float> %x, i32 4)
+ ret void
+}
+
+;CHECK-LABEL: gather_qps
+;CHECK: kxnorw
+;CHECK: vgatherqps
+;CHECK: vscatterqps
+;CHECK: ret
+define void @gather_qps(<8 x i64> %ind, i8* %base, i8* %stbuf) {
+ %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x i64>%ind, i8* %base, i32 4)
+ %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+ call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, <8 x i64>%ind2, <8 x float> %x, i32 4)
+ ret void
+}
+
+;CHECK-LABEL: gather_qpd
+;CHECK: kxnorw
+;CHECK: vgatherqpd
+;CHECK: vpadd
+;CHECK: vscatterqpd
+;CHECK: ret
+define void @gather_qpd(<8 x i64> %ind, i8* %base, i8* %stbuf) {
+ %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x i64>%ind, i8* %base, i32 4)
+ %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+ call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, <8 x i64>%ind2, <8 x double> %x, i32 4)
+ ret void
+}
+
+;; Integer Intrinsics without masks
+
+declare <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.dpi.512 (i8*, <16 x i32>, <16 x i32>, i32)
+declare <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i32>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.dpq.512 (i8*, <8 x i32>, <8 x i64>, i32)
+
+declare <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qpi.512 (i8*, <8 x i64>, <8 x i32>, i32)
+declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, i8*, i32)
+declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, <8 x i64>, <8 x i64>, i32)
+
+;CHECK-LABEL: gather_dpi
+;CHECK: kxnorw
+;CHECK: vpgatherdd
+;CHECK: vpscatterdd
+;CHECK: ret
+define void @gather_dpi(<16 x i32> %ind, i8* %base, i8* %stbuf) {
+ %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>%ind, i8* %base, i32 4)
+ %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ call void @llvm.x86.avx512.scatter.dpi.512 (i8* %stbuf, <16 x i32>%ind2, <16 x i32> %x, i32 4)
+ ret void
+}
+
+;CHECK-LABEL: gather_qpq
+;CHECK: vpxord %zmm
+;CHECK: kxnorw
+;CHECK: vpgatherqq
+;CHECK: vpadd
+;CHECK: vpscatterqq
+;CHECK: ret
+define void @gather_qpq(<8 x i64> %ind, i8* %base, i8* %stbuf) {
+ %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>%ind, i8* %base, i32 4)
+ %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+ call void @llvm.x86.avx512.scatter.qpq.512 (i8* %stbuf, <8 x i64>%ind2, <8 x i64> %x, i32 4)
+ ret void
+}
+
+;CHECK-LABEL: gather_qpi
+;CHECK: vpxor %ymm
+;CHECK: kxnorw
+;CHECK: vpgatherqd
+;CHECK: vpadd
+;CHECK: vpscatterqd
+;CHECK: ret
+define void @gather_qpi(<8 x i64> %ind, i8* %base, i8* %stbuf) {
+ %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i64>%ind, i8* %base, i32 4)
+ %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+ call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, <8 x i64>%ind2, <8 x i32> %x, i32 4)
+ ret void
+}
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index 189bdd7..3f06740 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-;CHECK: test1
+;CHECK-LABEL: test1:
;CHECK: vinsertps
;CHECK: vinsertf32x4
;CHECK: ret
@@ -11,7 +11,7 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
ret <16 x float> %rrr3
}
-;CHECK: test2
+;CHECK-LABEL: test2:
;CHECK: vinsertf32x4
;CHECK: vextractf32x4
;CHECK: vinsertf32x4
@@ -23,7 +23,7 @@ define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
ret <8 x double> %rrr3
}
-;CHECK: test3
+;CHECK-LABEL: test3:
;CHECK: vextractf32x4
;CHECK: vinsertf32x4
;CHECK: ret
@@ -33,7 +33,7 @@ define <16 x float> @test3(<16 x float> %x) nounwind {
ret <16 x float> %rrr2
}
-;CHECK: test4
+;CHECK-LABEL: test4:
;CHECK: vextracti32x4
;CHECK: vinserti32x4
;CHECK: ret
@@ -43,7 +43,7 @@ define <8 x i64> @test4(<8 x i64> %x) nounwind {
ret <8 x i64> %rrr2
}
-;CHECK: test5
+;CHECK-LABEL: test5:
;CHECK: vextractpsz
;CHECK: ret
define i32 @test5(<4 x float> %x) nounwind {
@@ -52,7 +52,7 @@ define i32 @test5(<4 x float> %x) nounwind {
ret i32 %ei
}
-;CHECK: test6
+;CHECK-LABEL: test6:
;CHECK: vextractpsz {{.*}}, (%rdi)
;CHECK: ret
define void @test6(<4 x float> %x, float* %out) nounwind {
@@ -61,3 +61,65 @@ define void @test6(<4 x float> %x, float* %out) nounwind {
ret void
}
+;CHECK-LABEL: test7
+;CHECK: vmovdz
+;CHECK: vpermps %zmm
+;CHECK: ret
+define float @test7(<16 x float> %x, i32 %ind) nounwind {
+ %e = extractelement <16 x float> %x, i32 %ind
+ ret float %e
+}
+
+;CHECK-LABEL: test8
+;CHECK: vmovqz
+;CHECK: vpermpd %zmm
+;CHECK: ret
+define double @test8(<8 x double> %x, i32 %ind) nounwind {
+ %e = extractelement <8 x double> %x, i32 %ind
+ ret double %e
+}
+
+;CHECK-LABEL: test9
+;CHECK: vmovd
+;CHECK: vpermps %ymm
+;CHECK: ret
+define float @test9(<8 x float> %x, i32 %ind) nounwind {
+ %e = extractelement <8 x float> %x, i32 %ind
+ ret float %e
+}
+
+;CHECK-LABEL: test10
+;CHECK: vmovdz
+;CHECK: vpermd %zmm
+;CHECK: vmovdz %xmm0, %eax
+;CHECK: ret
+define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
+ %e = extractelement <16 x i32> %x, i32 %ind
+ ret i32 %e
+}
+
+;CHECK-LABEL: test11
+;CHECK: movl $260
+;CHECK: bextrl
+;CHECK: movl $268
+;CHECK: bextrl
+;CHECK: ret
+define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) {
+ %cmp_res = icmp ult <16 x i32> %a, %b
+ %ia = extractelement <16 x i1> %cmp_res, i32 4
+ %ib = extractelement <16 x i1> %cmp_res, i32 12
+
+ br i1 %ia, label %A, label %B
+
+ A:
+ ret <16 x i32>%b
+ B:
+ %c = add <16 x i32>%b, %a
+ br i1 %ib, label %C, label %D
+ C:
+ %c1 = sub <16 x i32>%c, %a
+ ret <16 x i32>%c1
+ D:
+ %c2 = mul <16 x i32>%c, %a
+ ret <16 x i32>%c2
+}
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
new file mode 100644
index 0000000..5bdabf2
--- /dev/null
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -0,0 +1,374 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+declare i32 @llvm.x86.avx512.kortestz(i16, i16) nounwind readnone
+; CHECK: test_kortestz
+; CHECK: kortestw
+; CHECK: sete
+define i32 @test_kortestz(i16 %a0, i16 %a1) {
+ %res = call i32 @llvm.x86.avx512.kortestz(i16 %a0, i16 %a1)
+ ret i32 %res
+}
+
+declare i32 @llvm.x86.avx512.kortestc(i16, i16) nounwind readnone
+; CHECK: test_kortestc
+; CHECK: kortestw
+; CHECK: sbbl
+define i32 @test_kortestc(i16 %a0, i16 %a1) {
+ %res = call i32 @llvm.x86.avx512.kortestc(i16 %a0, i16 %a1)
+ ret i32 %res
+}
+
+define <16 x float> @test_rcp_ps_512(<16 x float> %a0) {
+ ; CHECK: vrcp14ps
+ %res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1]
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>) nounwind readnone
+
+define <8 x double> @test_rcp_pd_512(<8 x double> %a0) {
+ ; CHECK: vrcp14pd
+ %res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0) ; <<8 x double>> [#uses=1]
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>) nounwind readnone
+
+define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) {
+ ; CHECK: vrcp28ps
+ %res = call <16 x float> @llvm.x86.avx512.rcp28.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1]
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.rcp28.ps.512(<16 x float>) nounwind readnone
+
+define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) {
+ ; CHECK: vrcp28pd
+ %res = call <8 x double> @llvm.x86.avx512.rcp28.pd.512(<8 x double> %a0) ; <<8 x double>> [#uses=1]
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.rcp28.pd.512(<8 x double>) nounwind readnone
+
+define <8 x double> @test_rndscale_pd_512(<8 x double> %a0) {
+ ; CHECK: vrndscale
+ %res = call <8 x double> @llvm.x86.avx512.rndscale.pd.512(<8 x double> %a0, i32 7) ; <<8 x double>> [#uses=1]
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.rndscale.pd.512(<8 x double>, i32) nounwind readnone
+
+
+define <16 x float> @test_rndscale_ps_512(<16 x float> %a0) {
+ ; CHECK: vrndscale
+ %res = call <16 x float> @llvm.x86.avx512.rndscale.ps.512(<16 x float> %a0, i32 7) ; <<16 x float>> [#uses=1]
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.rndscale.ps.512(<16 x float>, i32) nounwind readnone
+
+
+define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) {
+ ; CHECK: vrsqrt14ps
+ %res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1]
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>) nounwind readnone
+
+define <16 x float> @test_rsqrt28_ps_512(<16 x float> %a0) {
+ ; CHECK: vrsqrt28ps
+ %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1]
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.rsqrt28.ps.512(<16 x float>) nounwind readnone
+
+define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) {
+ ; CHECK: vrsqrt14ss
+ %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>) nounwind readnone
+
+define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) {
+ ; CHECK: vrsqrt28ss
+ %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>) nounwind readnone
+
+define <4 x float> @test_rcp14_ss(<4 x float> %a0) {
+ ; CHECK: vrcp14ss
+ %res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>) nounwind readnone
+
+define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
+ ; CHECK: vrcp28ss
+ %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>) nounwind readnone
+
+define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
+ ; CHECK: vsqrtpd
+ %res = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0) ; <<8 x double>> [#uses=1]
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>) nounwind readnone
+
+define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) {
+ ; CHECK: vsqrtps
+ %res = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1]
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>) nounwind readnone
+
+define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: vsqrtssz
+ %res = call <4 x float> @llvm.x86.avx512.sqrt.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.sqrt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: vsqrtsdz
+ %res = call <2 x double> @llvm.x86.avx512.sqrt.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx512.sqrt.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define i64 @test_x86_sse2_cvtsd2si64(<2 x double> %a0) {
+ ; CHECK: vcvtsd2siz
+ %res = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0) ; <i64> [#uses=1]
+ ret i64 %res
+}
+declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
+
+define <2 x double> @test_x86_sse2_cvtsi642sd(<2 x double> %a0, i64 %a1) {
+ ; CHECK: vcvtsi2sdqz
+ %res = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone
+
+define <2 x double> @test_x86_avx512_cvtusi642sd(<2 x double> %a0, i64 %a1) {
+ ; CHECK: vcvtusi2sdqz
+ %res = call <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double> %a0, i64 %a1) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double>, i64) nounwind readnone
+
+define i64 @test_x86_sse2_cvttsd2si64(<2 x double> %a0) {
+ ; CHECK: vcvttsd2siz
+ %res = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0) ; <i64> [#uses=1]
+ ret i64 %res
+}
+declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
+
+
+define i64 @test_x86_sse_cvtss2si64(<4 x float> %a0) {
+ ; CHECK: vcvtss2siz
+ %res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0) ; <i64> [#uses=1]
+ ret i64 %res
+}
+declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_cvtsi642ss(<4 x float> %a0, i64 %a1) {
+ ; CHECK: vcvtsi2ssqz
+ %res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
+
+
+define i64 @test_x86_sse_cvttss2si64(<4 x float> %a0) {
+ ; CHECK: vcvttss2siz
+ %res = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0) ; <i64> [#uses=1]
+ ret i64 %res
+}
+declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
+
+define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) {
+ ; CHECK: vcvtsd2usiz
+ %res = call i64 @llvm.x86.avx512.cvtsd2usi64(<2 x double> %a0) ; <i64> [#uses=1]
+ ret i64 %res
+}
+declare i64 @llvm.x86.avx512.cvtsd2usi64(<2 x double>) nounwind readnone
+
+define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) {
+ ; CHECK: vcvtph2ps
+ %res = call <16 x float> @llvm.x86.avx512.vcvtph2ps.512(<16 x i16> %a0)
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.vcvtph2ps.512(<16 x i16>) nounwind readonly
+
+
+define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0) {
+ ; CHECK: vcvtps2ph
+ %res = call <16 x i16> @llvm.x86.avx512.vcvtps2ph.512(<16 x float> %a0, i32 0)
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx512.vcvtps2ph.512(<16 x float>, i32) nounwind readonly
+
+define <16 x float> @test_x86_vbroadcast_ss_512(i8* %a0) {
+ ; CHECK: vbroadcastss
+ %res = call <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(i8* %a0) ; <<16 x float>> [#uses=1]
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(i8*) nounwind readonly
+
+define <8 x double> @test_x86_vbroadcast_sd_512(i8* %a0) {
+ ; CHECK: vbroadcastsd
+ %res = call <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8* %a0) ; <<8 x double>> [#uses=1]
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8*) nounwind readonly
+
+define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0) {
+ ; CHECK: vbroadcastss
+ %res = call <16 x float> @llvm.x86.avx512.vbroadcast.ss.ps.512(<4 x float> %a0) ; <<16 x float>> [#uses=1]
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.vbroadcast.ss.ps.512(<4 x float>) nounwind readonly
+
+define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0) {
+ ; CHECK: vbroadcastsd
+ %res = call <8 x double> @llvm.x86.avx512.vbroadcast.sd.pd.512(<2 x double> %a0) ; <<8 x double>> [#uses=1]
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.pd.512(<2 x double>) nounwind readonly
+
+define <16 x i32> @test_x86_pbroadcastd_512(<4 x i32> %a0) {
+ ; CHECK: vpbroadcastd
+ %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %a0) ; <<16 x i32>> [#uses=1]
+ ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32>) nounwind readonly
+
+define <16 x i32> @test_x86_pbroadcastd_i32_512(i32 %a0) {
+ ; CHECK: vpbroadcastd
+ %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.i32.512(i32 %a0) ; <<16 x i32>> [#uses=1]
+ ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.pbroadcastd.i32.512(i32) nounwind readonly
+
+define <8 x i64> @test_x86_pbroadcastq_512(<2 x i64> %a0) {
+ ; CHECK: vpbroadcastq
+ %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %a0) ; <<8 x i64>> [#uses=1]
+ ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64>) nounwind readonly
+
+define <8 x i64> @test_x86_pbroadcastq_i64_512(i64 %a0) {
+ ; CHECK: vpbroadcastq
+ %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.i64.512(i64 %a0) ; <<8 x i64>> [#uses=1]
+ ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.pbroadcastq.i64.512(i64) nounwind readonly
+
+define <16 x i32> @test_x86_pmaxu_d(<16 x i32> %a0, <16 x i32> %a1) {
+ ; CHECK: vpmaxud
+ %res = call <16 x i32> @llvm.x86.avx512.pmaxu.d(<16 x i32> %a0, <16 x i32> %a1) ; <<16 x i32>> [#uses=1]
+ ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.pmaxu.d(<16 x i32>, <16 x i32>) nounwind readonly
+
+define <8 x i64> @test_x86_pmaxu_q(<8 x i64> %a0, <8 x i64> %a1) {
+ ; CHECK: vpmaxuq
+ %res = call <8 x i64> @llvm.x86.avx512.pmaxu.q(<8 x i64> %a0, <8 x i64> %a1) ; <<8 x i64>> [#uses=1]
+ ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.pmaxu.q(<8 x i64>, <8 x i64>) nounwind readonly
+
+define <16 x i32> @test_x86_pmaxs_d(<16 x i32> %a0, <16 x i32> %a1) {
+ ; CHECK: vpmaxsd
+ %res = call <16 x i32> @llvm.x86.avx512.pmaxs.d(<16 x i32> %a0, <16 x i32> %a1) ; <<16 x i32>> [#uses=1]
+ ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.pmaxs.d(<16 x i32>, <16 x i32>) nounwind readonly
+
+define <8 x i64> @test_x86_pmaxs_q(<8 x i64> %a0, <8 x i64> %a1) {
+ ; CHECK: vpmaxsq
+ %res = call <8 x i64> @llvm.x86.avx512.pmaxs.q(<8 x i64> %a0, <8 x i64> %a1) ; <<8 x i64>> [#uses=1]
+ ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.pmaxs.q(<8 x i64>, <8 x i64>) nounwind readonly
+
+define <16 x i32> @test_x86_pminu_d(<16 x i32> %a0, <16 x i32> %a1) {
+ ; CHECK: vpminud
+ %res = call <16 x i32> @llvm.x86.avx512.pminu.d(<16 x i32> %a0, <16 x i32> %a1) ; <<16 x i32>> [#uses=1]
+ ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.pminu.d(<16 x i32>, <16 x i32>) nounwind readonly
+
+define <8 x i64> @test_x86_pminu_q(<8 x i64> %a0, <8 x i64> %a1) {
+ ; CHECK: vpminuq
+ %res = call <8 x i64> @llvm.x86.avx512.pminu.q(<8 x i64> %a0, <8 x i64> %a1) ; <<8 x i64>> [#uses=1]
+ ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.pminu.q(<8 x i64>, <8 x i64>) nounwind readonly
+
+define <16 x i32> @test_x86_pmins_d(<16 x i32> %a0, <16 x i32> %a1) {
+ ; CHECK: vpminsd
+ %res = call <16 x i32> @llvm.x86.avx512.pmins.d(<16 x i32> %a0, <16 x i32> %a1) ; <<16 x i32>> [#uses=1]
+ ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.pmins.d(<16 x i32>, <16 x i32>) nounwind readonly
+
+define <8 x i64> @test_x86_pmins_q(<8 x i64> %a0, <8 x i64> %a1) {
+ ; CHECK: vpminsq
+ %res = call <8 x i64> @llvm.x86.avx512.pmins.q(<8 x i64> %a0, <8 x i64> %a1) ; <<8 x i64>> [#uses=1]
+ ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.pmins.q(<8 x i64>, <8 x i64>) nounwind readonly
+
+define <16 x i32> @test_conflict_d(<16 x i32> %a) {
+ ; CHECK: vpconflictd
+ %res = call <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32> %a)
+ ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32>) nounwind readonly
+
+define <16 x i32> @test_maskz_conflict_d(<16 x i32> %a, i16 %mask) {
+ ; CHECK: vpconflictd %zmm0, %zmm0 {%k1} {z}
+ %vmask = bitcast i16 %mask to <16 x i1>
+ %res = call <16 x i32> @llvm.x86.avx512.conflict.d.maskz.512(<16 x i1> %vmask, <16 x i32> %a)
+ ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.conflict.d.maskz.512(<16 x i1>,<16 x i32>) nounwind readonly
+
+define <8 x i64> @test_mask_conflict_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
+ ; CHECK: vpconflictq {{.*}} {%k1}
+ %vmask = bitcast i8 %mask to <8 x i1>
+ %res = call <8 x i64> @llvm.x86.avx512.conflict.q.mask.512(<8 x i64> %b, <8 x i1> %vmask, <8 x i64> %a)
+ ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.conflict.q.mask.512(<8 x i64>, <8 x i1>,<8 x i64>) nounwind readonly
+
+define <16 x float> @test_x86_mskblend_ps_512(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
+ ; CHECK: vblendmps
+ %m0 = bitcast i16 %a0 to <16 x i1>
+ %res = call <16 x float> @llvm.x86.avx512.mskblend.ps.512(<16 x i1> %m0, <16 x float> %a1, <16 x float> %a2) ; <<16 x float>> [#uses=1]
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mskblend.ps.512(<16 x i1> %a0, <16 x float> %a1, <16 x float> %a2) nounwind readonly
+
+define <8 x double> @test_x86_mskblend_pd_512(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
+ ; CHECK: vblendmpd
+ %m0 = bitcast i8 %a0 to <8 x i1>
+ %res = call <8 x double> @llvm.x86.avx512.mskblend.pd.512(<8 x i1> %m0, <8 x double> %a1, <8 x double> %a2) ; <<8 x double>> [#uses=1]
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.mskblend.pd.512(<8 x i1> %a0, <8 x double> %a1, <8 x double> %a2) nounwind readonly
+
+define <16 x i32> @test_x86_mskblend_d_512(i16 %a0, <16 x i32> %a1, <16 x i32> %a2) {
+ ; CHECK: vpblendmd
+ %m0 = bitcast i16 %a0 to <16 x i1>
+ %res = call <16 x i32> @llvm.x86.avx512.mskblend.d.512(<16 x i1> %m0, <16 x i32> %a1, <16 x i32> %a2) ; <<16 x i32>> [#uses=1]
+ ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.mskblend.d.512(<16 x i1> %a0, <16 x i32> %a1, <16 x i32> %a2) nounwind readonly
+
+define <8 x i64> @test_x86_mskblend_q_512(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
+ ; CHECK: vpblendmq
+ %m0 = bitcast i8 %a0 to <8 x i1>
+ %res = call <8 x i64> @llvm.x86.avx512.mskblend.q.512(<8 x i1> %m0, <8 x i64> %a1, <8 x i64> %a2) ; <<8 x i64>> [#uses=1]
+ ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.mskblend.q.512(<8 x i1> %a0, <8 x i64> %a1, <8 x i64> %a2) nounwind readonly
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index eec8873..ef5cb56 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -27,8 +27,8 @@ define i16 @mand16(i16 %x, i16 %y) {
%md = xor <16 x i1> %ma, %mb
%me = or <16 x i1> %mc, %md
%ret = bitcast <16 x i1> %me to i16
-; CHECK: kxorw
; CHECK: kandw
+; CHECK: kxorw
; CHECK: korw
ret i16 %ret
}
@@ -55,4 +55,3 @@ define i8 @shuf_test1(i16 %v) nounwind {
%mask1 = bitcast <8 x i1> %mask to i8
ret i8 %mask1
}
-
diff --git a/test/CodeGen/X86/avx512-mov.ll b/test/CodeGen/X86/avx512-mov.ll
new file mode 100644
index 0000000..91242b1
--- /dev/null
+++ b/test/CodeGen/X86/avx512-mov.ll
@@ -0,0 +1,155 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK-LABEL: @test1
+; CHECK: vmovdz %xmm0, %eax
+; CHECK: ret
+define i32 @test1(float %x) {
+ %res = bitcast float %x to i32
+ ret i32 %res
+}
+
+; CHECK-LABEL: @test2
+; CHECK: vmovdz %edi
+; CHECK: ret
+define <4 x i32> @test2(i32 %x) {
+ %res = insertelement <4 x i32>undef, i32 %x, i32 0
+ ret <4 x i32>%res
+}
+
+; CHECK-LABEL: @test3
+; CHECK: vmovqz %rdi
+; CHECK: ret
+define <2 x i64> @test3(i64 %x) {
+ %res = insertelement <2 x i64>undef, i64 %x, i32 0
+ ret <2 x i64>%res
+}
+
+; CHECK-LABEL: @test4
+; CHECK: vmovdz (%rdi)
+; CHECK: ret
+define <4 x i32> @test4(i32* %x) {
+ %y = load i32* %x
+ %res = insertelement <4 x i32>undef, i32 %y, i32 0
+ ret <4 x i32>%res
+}
+
+; CHECK-LABEL: @test5
+; CHECK: vmovssz %xmm0, (%rdi)
+; CHECK: ret
+define void @test5(float %x, float* %y) {
+ store float %x, float* %y, align 4
+ ret void
+}
+
+; CHECK-LABEL: @test6
+; CHECK: vmovsdz %xmm0, (%rdi)
+; CHECK: ret
+define void @test6(double %x, double* %y) {
+ store double %x, double* %y, align 8
+ ret void
+}
+
+; CHECK-LABEL: @test7
+; CHECK: vmovssz (%rdi), %xmm0
+; CHECK: ret
+define float @test7(i32* %x) {
+ %y = load i32* %x
+ %res = bitcast i32 %y to float
+ ret float %res
+}
+
+; CHECK-LABEL: @test8
+; CHECK: vmovdz %xmm0, %eax
+; CHECK: ret
+define i32 @test8(<4 x i32> %x) {
+ %res = extractelement <4 x i32> %x, i32 0
+ ret i32 %res
+}
+
+; CHECK-LABEL: @test9
+; CHECK: vmovqz %xmm0, %rax
+; CHECK: ret
+define i64 @test9(<2 x i64> %x) {
+ %res = extractelement <2 x i64> %x, i32 0
+ ret i64 %res
+}
+
+; CHECK-LABEL: @test10
+; CHECK: vmovdz (%rdi)
+; CHECK: ret
+define <4 x i32> @test10(i32* %x) {
+ %y = load i32* %x, align 4
+ %res = insertelement <4 x i32>zeroinitializer, i32 %y, i32 0
+ ret <4 x i32>%res
+}
+
+; CHECK-LABEL: @test11
+; CHECK: vmovssz (%rdi)
+; CHECK: ret
+define <4 x float> @test11(float* %x) {
+ %y = load float* %x, align 4
+ %res = insertelement <4 x float>zeroinitializer, float %y, i32 0
+ ret <4 x float>%res
+}
+
+; CHECK-LABEL: @test12
+; CHECK: vmovsdz (%rdi)
+; CHECK: ret
+define <2 x double> @test12(double* %x) {
+ %y = load double* %x, align 8
+ %res = insertelement <2 x double>zeroinitializer, double %y, i32 0
+ ret <2 x double>%res
+}
+
+; CHECK-LABEL: @test13
+; CHECK: vmovqz %rdi
+; CHECK: ret
+define <2 x i64> @test13(i64 %x) {
+ %res = insertelement <2 x i64>zeroinitializer, i64 %x, i32 0
+ ret <2 x i64>%res
+}
+
+; CHECK-LABEL: @test14
+; CHECK: vmovdz %edi
+; CHECK: ret
+define <4 x i32> @test14(i32 %x) {
+ %res = insertelement <4 x i32>zeroinitializer, i32 %x, i32 0
+ ret <4 x i32>%res
+}
+
+; CHECK-LABEL: @test15
+; CHECK: vmovdz (%rdi)
+; CHECK: ret
+define <4 x i32> @test15(i32* %x) {
+ %y = load i32* %x, align 4
+ %res = insertelement <4 x i32>zeroinitializer, i32 %y, i32 0
+ ret <4 x i32>%res
+}
+
+; CHECK-LABEL: test16
+; CHECK: vmovdqu32
+; CHECK: ret
+define <16 x i32> @test16(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <16 x i32>*
+ %res = load <16 x i32>* %vaddr, align 1
+ ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test17
+; CHECK: vmovdqa32
+; CHECK: ret
+define <16 x i32> @test17(i8 * %addr) {
+ %vaddr = bitcast i8* %addr to <16 x i32>*
+ %res = load <16 x i32>* %vaddr, align 64
+ ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test18
+; CHECK: vmovdqa64
+; CHECK: ret
+define void @test18(i8 * %addr, <8 x i64> %data) {
+ %vaddr = bitcast i8* %addr to <8 x i64>*
+ store <8 x i64>%data, <8 x i64>* %vaddr, align 64
+ ret void
+}
+
diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll
new file mode 100644
index 0000000..d2d6681
--- /dev/null
+++ b/test/CodeGen/X86/avx512-select.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK-LABEL: select00
+; CHECK: vmovaps
+; CHECK-NEXT: LBB
+define <16 x i32> @select00(i32 %a, <16 x i32> %b) nounwind {
+ %cmpres = icmp eq i32 %a, 255
+ %selres = select i1 %cmpres, <16 x i32> zeroinitializer, <16 x i32> %b
+ %res = xor <16 x i32> %b, %selres
+ ret <16 x i32> %res
+}
+
+; CHECK-LABEL: select01
+; CHECK: vmovaps
+; CHECK-NEXT: LBB
+define <8 x i64> @select01(i32 %a, <8 x i64> %b) nounwind {
+ %cmpres = icmp eq i32 %a, 255
+ %selres = select i1 %cmpres, <8 x i64> zeroinitializer, <8 x i64> %b
+ %res = xor <8 x i64> %b, %selres
+ ret <8 x i64> %res
+}
+
diff --git a/test/CodeGen/X86/avx512-shift.ll b/test/CodeGen/X86/avx512-shift.ll
new file mode 100644
index 0000000..8cdcf8a
--- /dev/null
+++ b/test/CodeGen/X86/avx512-shift.ll
@@ -0,0 +1,108 @@
+;RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+;CHECK-LABEL: shift_16_i32
+;CHECK: vpsrld
+;CHECK: vpslld
+;CHECK: vpsrad
+;CHECK: ret
+define <16 x i32> @shift_16_i32(<16 x i32> %a) {
+ %b = lshr <16 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %c = shl <16 x i32> %b, <i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
+ %d = ashr <16 x i32> %c, <i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12, i32 12>
+ ret <16 x i32> %d;
+}
+
+;CHECK-LABEL: shift_8_i64
+;CHECK: vpsrlq
+;CHECK: vpsllq
+;CHECK: vpsraq
+;CHECK: ret
+define <8 x i64> @shift_8_i64(<8 x i64> %a) {
+ %b = lshr <8 x i64> %a, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+ %c = shl <8 x i64> %b, <i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12>
+ %d = ashr <8 x i64> %c, <i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12, i64 12>
+ ret <8 x i64> %d;
+}
+
+; CHECK-LABEL: variable_shl4
+; CHECK: vpsllvq %zmm
+; CHECK: ret
+define <8 x i64> @variable_shl4(<8 x i64> %x, <8 x i64> %y) {
+ %k = shl <8 x i64> %x, %y
+ ret <8 x i64> %k
+}
+
+; CHECK-LABEL: variable_shl5
+; CHECK: vpsllvd %zmm
+; CHECK: ret
+define <16 x i32> @variable_shl5(<16 x i32> %x, <16 x i32> %y) {
+ %k = shl <16 x i32> %x, %y
+ ret <16 x i32> %k
+}
+
+; CHECK-LABEL: variable_srl0
+; CHECK: vpsrlvd
+; CHECK: ret
+define <16 x i32> @variable_srl0(<16 x i32> %x, <16 x i32> %y) {
+ %k = lshr <16 x i32> %x, %y
+ ret <16 x i32> %k
+}
+
+; CHECK-LABEL: variable_srl2
+; CHECK: psrlvq
+; CHECK: ret
+define <8 x i64> @variable_srl2(<8 x i64> %x, <8 x i64> %y) {
+ %k = lshr <8 x i64> %x, %y
+ ret <8 x i64> %k
+}
+
+; CHECK-LABEL: variable_sra1
+; CHECK: vpsravd
+; CHECK: ret
+define <16 x i32> @variable_sra1(<16 x i32> %x, <16 x i32> %y) {
+ %k = ashr <16 x i32> %x, %y
+ ret <16 x i32> %k
+}
+
+; CHECK-LABEL: variable_sra2
+; CHECK: vpsravq %zmm
+; CHECK: ret
+define <8 x i64> @variable_sra2(<8 x i64> %x, <8 x i64> %y) {
+ %k = ashr <8 x i64> %x, %y
+ ret <8 x i64> %k
+}
+
+; CHECK-LABEL: variable_sra01_load
+; CHECK: vpsravd (%
+; CHECK: ret
+define <16 x i32> @variable_sra01_load(<16 x i32> %x, <16 x i32>* %y) {
+ %y1 = load <16 x i32>* %y
+ %k = ashr <16 x i32> %x, %y1
+ ret <16 x i32> %k
+}
+
+; CHECK-LABEL: variable_shl1_load
+; CHECK: vpsllvd (%
+; CHECK: ret
+define <16 x i32> @variable_shl1_load(<16 x i32> %x, <16 x i32>* %y) {
+ %y1 = load <16 x i32>* %y
+ %k = shl <16 x i32> %x, %y1
+ ret <16 x i32> %k
+}
+; CHECK: variable_srl0_load
+; CHECK: vpsrlvd (%
+; CHECK: ret
+define <16 x i32> @variable_srl0_load(<16 x i32> %x, <16 x i32>* %y) {
+ %y1 = load <16 x i32>* %y
+ %k = lshr <16 x i32> %x, %y1
+ ret <16 x i32> %k
+}
+
+; CHECK: variable_srl3_load
+; CHECK: vpsrlvq (%
+; CHECK: ret
+define <8 x i64> @variable_srl3_load(<8 x i64> %x, <8 x i64>* %y) {
+ %y1 = load <8 x i64>* %y
+ %k = lshr <8 x i64> %x, %y1
+ ret <8 x i64> %k
+}
diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll
new file mode 100644
index 0000000..c9e0c2b
--- /dev/null
+++ b/test/CodeGen/X86/avx512-shuffle.ll
@@ -0,0 +1,226 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; CHECK: LCP
+; CHECK: .long 2
+; CHECK: .long 5
+; CHECK: .long 0
+; CHECK: .long 0
+; CHECK: .long 7
+; CHECK: .long 0
+; CHECK: .long 10
+; CHECK: .long 1
+; CHECK: .long 0
+; CHECK: .long 5
+; CHECK: .long 0
+; CHECK: .long 4
+; CHECK: .long 7
+; CHECK: .long 0
+; CHECK: .long 10
+; CHECK: .long 1
+; CHECK-LABEL: test1:
+; CHECK: vpermps
+; CHECK: ret
+define <16 x float> @test1(<16 x float> %a) nounwind {
+ %c = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
+ ret <16 x float> %c
+}
+
+; CHECK-LABEL: test2:
+; CHECK: vpermd
+; CHECK: ret
+define <16 x i32> @test2(<16 x i32> %a) nounwind {
+ %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
+ ret <16 x i32> %c
+}
+
+; CHECK-LABEL: test3:
+; CHECK: vpermq
+; CHECK: ret
+define <8 x i64> @test3(<8 x i64> %a) nounwind {
+ %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 5, i32 1, i32 undef, i32 7, i32 undef, i32 3, i32 1>
+ ret <8 x i64> %c
+}
+
+; CHECK-LABEL: test4:
+; CHECK: vpermpd
+; CHECK: ret
+define <8 x double> @test4(<8 x double> %a) nounwind {
+ %c = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x double> %c
+}
+
+; CHECK-LABEL: test5:
+; CHECK: vpermi2pd
+; CHECK: ret
+define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind {
+ %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
+ ret <8 x double> %c
+}
+
+; CHECK-LABEL: test6:
+; CHECK: vpermq $30
+; CHECK: ret
+define <8 x i64> @test6(<8 x i64> %a) nounwind {
+ %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
+ ret <8 x i64> %c
+}
+
+; CHECK-LABEL: test7:
+; CHECK: vpermi2q
+; CHECK: ret
+define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind {
+ %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
+ ret <8 x i64> %c
+}
+
+; CHECK-LABEL: test8:
+; CHECK: vpermi2d
+; CHECK: ret
+define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind {
+ %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
+ ret <16 x i32> %c
+}
+
+; CHECK-LABEL: test9:
+; CHECK: vpermi2ps
+; CHECK: ret
+define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind {
+ %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
+ ret <16 x float> %c
+}
+
+; CHECK-LABEL: test10:
+; CHECK: vpermi2ps (
+; CHECK: ret
+define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind {
+ %c = load <16 x float>* %b
+ %d = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
+ ret <16 x float> %d
+}
+
+; CHECK-LABEL: test11:
+; CHECK: vpermi2d (
+; CHECK: ret
+define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind {
+ %c = load <16 x i32>* %b
+ %d = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
+ ret <16 x i32> %d
+}
+
+; CHECK-LABEL: test12
+; CHECK: vmovlhpsz %xmm
+; CHECK: ret
+define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) nounwind {
+ %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ ret <4 x i32> %c
+}
+
+; CHECK-LABEL: test13
+; CHECK: vpermilps $-79, %zmm
+; CHECK: ret
+define <16 x float> @test13(<16 x float> %a) {
+ %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: test14
+; CHECK: vpermilpd $-53, %zmm
+; CHECK: ret
+define <8 x double> @test14(<8 x double> %a) {
+ %b = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32><i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 7, i32 7>
+ ret <8 x double> %b
+}
+
+; CHECK-LABEL: test15
+; CHECK: vpshufd $-79, %zmm
+; CHECK: ret
+define <16 x i32> @test15(<16 x i32> %a) {
+ %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+ ret <16 x i32> %b
+}
+; CHECK-LABEL: test16
+; CHECK: valignq $2, %zmm0, %zmm1
+; CHECK: ret
+define <8 x double> @test16(<8 x double> %a, <8 x double> %b) nounwind {
+ %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+ ret <8 x double> %c
+}
+
+; CHECK-LABEL: test17
+; CHECK: vshufpd $19, %zmm1, %zmm0
+; CHECK: ret
+define <8 x double> @test17(<8 x double> %a, <8 x double> %b) nounwind {
+ %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 9, i32 2, i32 10, i32 5, i32 undef, i32 undef, i32 undef>
+ ret <8 x double> %c
+}
+
+; CHECK-LABEL: test18
+; CHECK: vpunpckhdq %zmm
+; CHECK: ret
+define <16 x i32> @test18(<16 x i32> %a, <16 x i32> %c) {
+ %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15, i32 18, i32 26, i32 19, i32 27, i32 22, i32 30, i32 23, i32 31>
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: test19
+; CHECK: vpunpckldq %zmm
+; CHECK: ret
+define <16 x i32> @test19(<16 x i32> %a, <16 x i32> %c) {
+ %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13, i32 16, i32 24, i32 17, i32 25, i32 20, i32 28, i32 21, i32 29>
+ ret <16 x i32> %b
+}
+
+; CHECK-LABEL: test20
+; CHECK: vpunpckhqdq %zmm
+; CHECK: ret
+define <8 x i64> @test20(<8 x i64> %a, <8 x i64> %c) {
+ %b = shufflevector <8 x i64> %a, <8 x i64> %c, <8 x i32><i32 1, i32 5, i32 3, i32 7, i32 9, i32 13, i32 11, i32 15>
+ ret <8 x i64> %b
+}
+
+; CHECK-LABEL: test21
+; CHECK: vunpcklps %zmm
+; CHECK: ret
+define <16 x float> @test21(<16 x float> %a, <16 x float> %c) {
+ %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13, i32 16, i32 24, i32 17, i32 25, i32 20, i32 28, i32 21, i32 29>
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: test22
+; CHECK: vmovhlpsz %xmm
+; CHECK: ret
+define <4 x i32> @test22(<4 x i32> %a, <4 x i32> %b) nounwind {
+ %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+ ret <4 x i32> %c
+}
+
+; CHECK-LABEL: @test23
+; CHECK: vshufps $-112, %zmm
+; CHECK: ret
+define <16 x float> @test23(<16 x float> %a, <16 x float> %c) {
+ %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 0, i32 17, i32 18, i32 4, i32 4, i32 21, i32 22, i32 8, i32 8, i32 25, i32 26, i32 12, i32 12, i32 29, i32 30>
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: @test24
+; CHECK: vpermi2d
+; CHECK: ret
+define <16 x i32> @test24(<16 x i32> %a, <16 x i32> %b) nounwind {
+ %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i32> %c
+}
+
+; CHECK-LABEL: @test25
+; CHECK: vshufps $52
+; CHECK: ret
+define <16 x i32> @test25(<16 x i32> %a, <16 x i32> %b) nounwind {
+ %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 19, i32 undef, i32 4, i32 5, i32 23, i32 undef, i32 8, i32 9, i32 27, i32 undef, i32 12, i32 13, i32 undef, i32 undef>
+ ret <16 x i32> %c
+}
+
+; CHECK-LABEL: @test26
+; CHECK: vmovshdup
+; CHECK: ret
+define <16 x i32> @test26(<16 x i32> %a) nounwind {
+ %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 undef, i32 9, i32 9, i32 undef, i32 11, i32 13, i32 undef, i32 undef, i32 undef>
+ ret <16 x i32> %c
+}
\ No newline at end of file
diff --git a/test/CodeGen/X86/avx512-trunc-ext.ll b/test/CodeGen/X86/avx512-trunc-ext.ll
new file mode 100644
index 0000000..31db68c
--- /dev/null
+++ b/test/CodeGen/X86/avx512-trunc-ext.ll
@@ -0,0 +1,127 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK-LABEL: trunc_16x32_to_16x8
+; CHECK: vpmovdb
+; CHECK: ret
+define <16 x i8> @trunc_16x32_to_16x8(<16 x i32> %i) nounwind readnone {
+ %x = trunc <16 x i32> %i to <16 x i8>
+ ret <16 x i8> %x
+}
+
+; CHECK-LABEL: trunc_8x64_to_8x16
+; CHECK: vpmovqw
+; CHECK: ret
+define <8 x i16> @trunc_8x64_to_8x16(<8 x i64> %i) nounwind readnone {
+ %x = trunc <8 x i64> %i to <8 x i16>
+ ret <8 x i16> %x
+}
+
+
+; CHECK-LABEL: zext_16x8_to_16x32
+; CHECK: vpmovzxbd {{.*}}%zmm
+; CHECK: ret
+define <16 x i32> @zext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
+ %x = zext <16 x i8> %i to <16 x i32>
+ ret <16 x i32> %x
+}
+
+; CHECK-LABEL: sext_16x8_to_16x32
+; CHECK: vpmovsxbd {{.*}}%zmm
+; CHECK: ret
+define <16 x i32> @sext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
+ %x = sext <16 x i8> %i to <16 x i32>
+ ret <16 x i32> %x
+}
+
+
+; CHECK-LABEL: zext_16x16_to_16x32
+; CHECK: vpmovzxwd {{.*}}%zmm
+; CHECK: ret
+define <16 x i32> @zext_16x16_to_16x32(<16 x i16> %i) nounwind readnone {
+ %x = zext <16 x i16> %i to <16 x i32>
+ ret <16 x i32> %x
+}
+
+; CHECK-LABEL: zext_8x16_to_8x64
+; CHECK: vpmovzxwq
+; CHECK: ret
+define <8 x i64> @zext_8x16_to_8x64(<8 x i16> %i) nounwind readnone {
+ %x = zext <8 x i16> %i to <8 x i64>
+ ret <8 x i64> %x
+}
+
+;CHECK-LABEL: fptrunc_test
+;CHECK: vcvtpd2ps {{.*}}%zmm
+;CHECK: ret
+define <8 x float> @fptrunc_test(<8 x double> %a) nounwind readnone {
+ %b = fptrunc <8 x double> %a to <8 x float>
+ ret <8 x float> %b
+}
+
+;CHECK-LABEL: fpext_test
+;CHECK: vcvtps2pd {{.*}}%zmm
+;CHECK: ret
+define <8 x double> @fpext_test(<8 x float> %a) nounwind readnone {
+ %b = fpext <8 x float> %a to <8 x double>
+ ret <8 x double> %b
+}
+
+; CHECK-LABEL: zext_16i1_to_16xi32
+; CHECK: vpbroadcastd LCP{{.*}}(%rip), %zmm0 {%k1} {z}
+; CHECK: ret
+define <16 x i32> @zext_16i1_to_16xi32(i16 %b) {
+ %a = bitcast i16 %b to <16 x i1>
+ %c = zext <16 x i1> %a to <16 x i32>
+ ret <16 x i32> %c
+}
+
+; CHECK-LABEL: zext_8i1_to_8xi64
+; CHECK: vpbroadcastq LCP{{.*}}(%rip), %zmm0 {%k1} {z}
+; CHECK: ret
+define <8 x i64> @zext_8i1_to_8xi64(i8 %b) {
+ %a = bitcast i8 %b to <8 x i1>
+ %c = zext <8 x i1> %a to <8 x i64>
+ ret <8 x i64> %c
+}
+
+; CHECK-LABEL: trunc_16i8_to_16i1
+; CHECK: vpmovsxbd
+; CHECK: vpandd
+; CHECK: vptestmd
+; CHECK: ret
+define i16 @trunc_16i8_to_16i1(<16 x i8> %a) {
+ %mask_b = trunc <16 x i8>%a to <16 x i1>
+ %mask = bitcast <16 x i1> %mask_b to i16
+ ret i16 %mask
+}
+
+; CHECK-LABEL: trunc_16i32_to_16i1
+; CHECK: vpandd
+; CHECK: vptestmd
+; CHECK: ret
+define i16 @trunc_16i32_to_16i1(<16 x i32> %a) {
+ %mask_b = trunc <16 x i32>%a to <16 x i1>
+ %mask = bitcast <16 x i1> %mask_b to i16
+ ret i16 %mask
+}
+
+; CHECK-LABEL: trunc_8i16_to_8i1
+; CHECK: vpmovsxwq
+; CHECK: vpandq LCP{{.*}}(%rip){1to8}
+; CHECK: vptestmq
+; CHECK: ret
+define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
+ %mask_b = trunc <8 x i16>%a to <8 x i1>
+ %mask = bitcast <8 x i1> %mask_b to i8
+ ret i8 %mask
+}
+
+; CHECK: sext_8i1_8i32
+; CHECK: vpbroadcastq LCP{{.*}}(%rip), %zmm0 {%k1} {z}
+; CHECK: ret
+define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
+ %x = icmp slt <8 x i32> %a1, %a2
+ %x1 = xor <8 x i1>%x, <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+ %y = sext <8 x i1> %x1 to <8 x i32>
+ ret <8 x i32> %y
+}
diff --git a/test/CodeGen/X86/avx512-vbroadcast.ll b/test/CodeGen/X86/avx512-vbroadcast.ll
index 4f07f94..6f89d6c 100644
--- a/test/CodeGen/X86/avx512-vbroadcast.ll
+++ b/test/CodeGen/X86/avx512-vbroadcast.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-;CHECK: _inreg16xi32
+;CHECK-LABEL: _inreg16xi32:
;CHECK: vpbroadcastd {{.*}}, %zmm
;CHECK: ret
define <16 x i32> @_inreg16xi32(i32 %a) {
@@ -9,7 +9,7 @@ define <16 x i32> @_inreg16xi32(i32 %a) {
ret <16 x i32> %c
}
-;CHECK: _inreg8xi64
+;CHECK-LABEL: _inreg8xi64:
;CHECK: vpbroadcastq {{.*}}, %zmm
;CHECK: ret
define <8 x i64> @_inreg8xi64(i64 %a) {
@@ -18,7 +18,7 @@ define <8 x i64> @_inreg8xi64(i64 %a) {
ret <8 x i64> %c
}
-;CHECK: _inreg16xfloat
+;CHECK-LABEL: _inreg16xfloat:
;CHECK: vbroadcastssz {{.*}}, %zmm
;CHECK: ret
define <16 x float> @_inreg16xfloat(float %a) {
@@ -27,7 +27,7 @@ define <16 x float> @_inreg16xfloat(float %a) {
ret <16 x float> %c
}
-;CHECK: _inreg8xdouble
+;CHECK-LABEL: _inreg8xdouble:
;CHECK: vbroadcastsdz {{.*}}, %zmm
;CHECK: ret
define <8 x double> @_inreg8xdouble(double %a) {
@@ -35,3 +35,19 @@ define <8 x double> @_inreg8xdouble(double %a) {
%c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
ret <8 x double> %c
}
+
+;CHECK-LABEL: _xmm16xi32
+;CHECK: vpbroadcastd
+;CHECK: ret
+define <16 x i32> @_xmm16xi32(<16 x i32> %a) {
+ %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> zeroinitializer
+ ret <16 x i32> %b
+}
+
+;CHECK-LABEL: _xmm16xfloat
+;CHECK: vbroadcastssz
+;CHECK: ret
+define <16 x float> @_xmm16xfloat(<16 x float> %a) {
+ %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> zeroinitializer
+ ret <16 x float> %b
+}
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
new file mode 100644
index 0000000..6ca5bcc
--- /dev/null
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -0,0 +1,113 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK-LABEL: test1
+; CHECK: vcmpleps
+; CHECK: vmovups
+; CHECK: ret
+define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
+ %mask = fcmp ole <16 x float> %x, %y
+ %max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
+ ret <16 x float> %max
+}
+
+; CHECK-LABEL: test2
+; CHECK: vcmplepd
+; CHECK: vmovupd
+; CHECK: ret
+define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind {
+ %mask = fcmp ole <8 x double> %x, %y
+ %max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
+ ret <8 x double> %max
+}
+
+; CHECK-LABEL: test3
+; CHECK: vpcmpeqd (%rdi)
+; CHECK: vmovdqu32
+; CHECK: ret
+define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwind {
+ %y = load <16 x i32>* %yp, align 4
+ %mask = icmp eq <16 x i32> %x, %y
+ %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
+ ret <16 x i32> %max
+}
+
+; CHECK-LABEL: @test4_unsigned
+; CHECK: vpcmpnltud
+; CHECK: vmovdqu32
+; CHECK: ret
+define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y) nounwind {
+ %mask = icmp uge <16 x i32> %x, %y
+ %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y
+ ret <16 x i32> %max
+}
+
+; CHECK-LABEL: test5
+; CHECK: vpcmpeqq {{.*}}%k1
+; CHECK: vmovdqu64 {{.*}}%k1
+; CHECK: ret
+define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind {
+ %mask = icmp eq <8 x i64> %x, %y
+ %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
+ ret <8 x i64> %max
+}
+
+; CHECK-LABEL: test6_unsigned
+; CHECK: vpcmpnleuq {{.*}}%k1
+; CHECK: vmovdqu64 {{.*}}%k1
+; CHECK: ret
+define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y) nounwind {
+ %mask = icmp ugt <8 x i64> %x, %y
+ %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
+ ret <8 x i64> %max
+}
+
+; CHECK-LABEL: test7
+; CHECK: xor
+; CHECK: vcmpltps
+; CHECK: vblendvps
+; CHECK: ret
+define <4 x float> @test7(<4 x float> %a, <4 x float> %b) {
+ %mask = fcmp olt <4 x float> %a, zeroinitializer
+ %c = select <4 x i1>%mask, <4 x float>%a, <4 x float>%b
+ ret <4 x float>%c
+}
+
+; CHECK-LABEL: test8
+; CHECK: xor
+; CHECK: vcmpltpd
+; CHECK: vblendvpd
+; CHECK: ret
+define <2 x double> @test8(<2 x double> %a, <2 x double> %b) {
+ %mask = fcmp olt <2 x double> %a, zeroinitializer
+ %c = select <2 x i1>%mask, <2 x double>%a, <2 x double>%b
+ ret <2 x double>%c
+}
+
+; CHECK-LABEL: test9
+; CHECK: vpcmpeqd
+; CHECK: vpblendmd
+; CHECK: ret
+define <8 x i32> @test9(<8 x i32> %x, <8 x i32> %y) nounwind {
+ %mask = icmp eq <8 x i32> %x, %y
+ %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
+ ret <8 x i32> %max
+}
+
+; CHECK-LABEL: test10
+; CHECK: vcmpeqps
+; CHECK: vblendmps
+; CHECK: ret
+define <8 x float> @test10(<8 x float> %x, <8 x float> %y) nounwind {
+ %mask = fcmp oeq <8 x float> %x, %y
+ %max = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
+ ret <8 x float> %max
+}
+
+; CHECK-LABEL: test11_unsigned
+; CHECK: vpmaxud
+; CHECK: ret
+define <8 x i32> @test11_unsigned(<8 x i32> %x, <8 x i32> %y) nounwind {
+ %mask = icmp ugt <8 x i32> %x, %y
+ %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
+ ret <8 x i32> %max
+}
diff --git a/test/CodeGen/X86/bc-extract.ll b/test/CodeGen/X86/bc-extract.ll
index ceabcb7..a1c0f5a 100644
--- a/test/CodeGen/X86/bc-extract.ll
+++ b/test/CodeGen/X86/bc-extract.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse4.2 | FileCheck %s
define float @extractFloat1() nounwind {
diff --git a/test/CodeGen/X86/bitcast2.ll b/test/CodeGen/X86/bitcast2.ll
index 48922b5..12aa863 100644
--- a/test/CodeGen/X86/bitcast2.ll
+++ b/test/CodeGen/X86/bitcast2.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86-64 | grep movd | count 2
-; RUN: llc < %s -march=x86-64 | not grep rsp
+; RUN: llc < %s -march=x86-64 -mattr=-avx | grep movd | count 2
+; RUN: llc < %s -march=x86-64 -mattr=-avx | not grep rsp
define i64 @test1(double %A) {
%B = bitcast double %A to i64
diff --git a/test/CodeGen/X86/blend-msb.ll b/test/CodeGen/X86/blend-msb.ll
index fa775bd..4f2060f 100644
--- a/test/CodeGen/X86/blend-msb.ll
+++ b/test/CodeGen/X86/blend-msb.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse41 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
; In this test we check that sign-extend of the mask bit is performed by
diff --git a/test/CodeGen/X86/bmi.ll b/test/CodeGen/X86/bmi.ll
index 4eda888..242075a 100644
--- a/test/CodeGen/X86/bmi.ll
+++ b/test/CodeGen/X86/bmi.ll
@@ -111,6 +111,23 @@ define i32 @bextr32_load(i32* %x, i32 %y) nounwind readnone {
declare i32 @llvm.x86.bmi.bextr.32(i32, i32) nounwind readnone
+define i32 @bextr32b(i32 %x) nounwind uwtable readnone ssp {
+ %1 = lshr i32 %x, 4
+ %2 = and i32 %1, 4095
+ ret i32 %2
+; CHECK-LABEL: bextr32b:
+; CHECK: bextrl
+}
+
+define i32 @bextr32b_load(i32* %x) nounwind uwtable readnone ssp {
+ %1 = load i32* %x
+ %2 = lshr i32 %1, 4
+ %3 = and i32 %2, 4095
+ ret i32 %3
+; CHECK-LABEL: bextr32b_load:
+; CHECK: bextrl {{.*}}, ({{.*}}), {{.*}}
+}
+
define i64 @bextr64(i64 %x, i64 %y) nounwind readnone {
%tmp = tail call i64 @llvm.x86.bmi.bextr.64(i64 %x, i64 %y)
ret i64 %tmp
@@ -120,6 +137,14 @@ define i64 @bextr64(i64 %x, i64 %y) nounwind readnone {
declare i64 @llvm.x86.bmi.bextr.64(i64, i64) nounwind readnone
+define i64 @bextr64b(i64 %x) nounwind uwtable readnone ssp {
+ %1 = lshr i64 %x, 4
+ %2 = and i64 %1, 4095
+ ret i64 %2
+; CHECK-LABEL: bextr64b:
+; CHECK: bextrq
+}
+
define i32 @bzhi32(i32 %x, i32 %y) nounwind readnone {
%tmp = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %x, i32 %y)
ret i32 %tmp
@@ -146,6 +171,51 @@ define i64 @bzhi64(i64 %x, i64 %y) nounwind readnone {
declare i64 @llvm.x86.bmi.bzhi.64(i64, i64) nounwind readnone
+define i32 @bzhi32b(i32 %x, i8 zeroext %index) #0 {
+entry:
+ %conv = zext i8 %index to i32
+ %shl = shl i32 1, %conv
+ %sub = add nsw i32 %shl, -1
+ %and = and i32 %sub, %x
+ ret i32 %and
+; CHECK-LABEL: bzhi32b:
+; CHECK: bzhil
+}
+
+define i32 @bzhi32b_load(i32* %w, i8 zeroext %index) #0 {
+entry:
+ %x = load i32* %w
+ %conv = zext i8 %index to i32
+ %shl = shl i32 1, %conv
+ %sub = add nsw i32 %shl, -1
+ %and = and i32 %sub, %x
+ ret i32 %and
+; CHECK-LABEL: bzhi32b_load:
+; CHECK: bzhil {{.*}}, ({{.*}}), {{.*}}
+}
+
+define i32 @bzhi32c(i32 %x, i8 zeroext %index) #0 {
+entry:
+ %conv = zext i8 %index to i32
+ %shl = shl i32 1, %conv
+ %sub = add nsw i32 %shl, -1
+ %and = and i32 %x, %sub
+ ret i32 %and
+; CHECK-LABEL: bzhi32c:
+; CHECK: bzhil
+}
+
+define i64 @bzhi64b(i64 %x, i8 zeroext %index) #0 {
+entry:
+ %conv = zext i8 %index to i64
+ %shl = shl i64 1, %conv
+ %sub = add nsw i64 %shl, -1
+ %and = and i64 %x, %sub
+ ret i64 %and
+; CHECK-LABEL: bzhi64b:
+; CHECK: bzhiq
+}
+
define i32 @blsi32(i32 %x) nounwind readnone {
%tmp = sub i32 0, %x
%tmp2 = and i32 %x, %tmp
diff --git a/test/CodeGen/X86/bool-simplify.ll b/test/CodeGen/X86/bool-simplify.ll
index fa6f6e8..a0a1c36 100644
--- a/test/CodeGen/X86/bool-simplify.ll
+++ b/test/CodeGen/X86/bool-simplify.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse41,-avx,+rdrand,+rdseed | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse4.1,-avx,+rdrnd,+rdseed | FileCheck %s
define i32 @foo(<2 x i64> %c, i32 %a, i32 %b) {
%t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)
diff --git a/test/CodeGen/X86/break-anti-dependencies.ll b/test/CodeGen/X86/break-anti-dependencies.ll
index c942614..614d0ad 100644
--- a/test/CodeGen/X86/break-anti-dependencies.ll
+++ b/test/CodeGen/X86/break-anti-dependencies.ll
@@ -1,7 +1,7 @@
; Without list-burr scheduling we may not see the difference in codegen here.
; Use a subtarget that has post-RA scheduling enabled because the anti-dependency
; breaker requires liveness information to be kept.
-; RUN: llc < %s -march=x86-64 -mcpu=atom -post-RA-scheduler -pre-RA-sched=list-burr -break-anti-dependencies=none > %t
+; RUN: llc < %s -march=x86-64 -mcpu=atom -enable-misched=false -post-RA-scheduler -pre-RA-sched=list-burr -break-anti-dependencies=none > %t
; RUN: grep "%xmm0" %t | count 14
; RUN: not grep "%xmm1" %t
; RUN: llc < %s -march=x86-64 -mcpu=atom -post-RA-scheduler -break-anti-dependencies=critical > %t
diff --git a/test/CodeGen/X86/break-avx-dep.ll b/test/CodeGen/X86/break-avx-dep.ll
new file mode 100644
index 0000000..210bda1
--- /dev/null
+++ b/test/CodeGen/X86/break-avx-dep.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s
+;
+; rdar:15221834 False AVX register dependencies cause 5x slowdown on
+; flops-6. Make sure the unused register read by vcvtsi2sdq is zeroed
+; to avoid cyclic dependence on a write to the same register in a
+; previous iteration.
+
+; CHECK-LABEL: t1:
+; CHECK-LABEL: %loop
+; CHECK: vxorps %[[REG:xmm.]], %{{xmm.}}, %{{xmm.}}
+; CHECK: vcvtsi2sdq %{{r[0-9a-x]+}}, %[[REG]], %{{xmm.}}
+define i64 @t1(i64* nocapture %x, double* nocapture %y) nounwind {
+entry:
+ %vx = load i64* %x
+ br label %loop
+loop:
+ %i = phi i64 [ 1, %entry ], [ %inc, %loop ]
+ %s1 = phi i64 [ %vx, %entry ], [ %s2, %loop ]
+ %fi = sitofp i64 %i to double
+ %vy = load double* %y
+ %fipy = fadd double %fi, %vy
+ %iipy = fptosi double %fipy to i64
+ %s2 = add i64 %s1, %iipy
+ %inc = add nsw i64 %i, 1
+ %exitcond = icmp eq i64 %inc, 156250000
+ br i1 %exitcond, label %ret, label %loop
+ret:
+ ret i64 %s2
+}
diff --git a/test/CodeGen/X86/bswap.ll b/test/CodeGen/X86/bswap.ll
index 9e46592..e6a456c 100644
--- a/test/CodeGen/X86/bswap.ll
+++ b/test/CodeGen/X86/bswap.ll
@@ -1,6 +1,7 @@
; bswap should be constant folded when it is passed a constant argument
; RUN: llc < %s -march=x86 -mcpu=i686 | FileCheck %s
+; RUN: llc < %s -march=x86-64 | FileCheck %s --check-prefix=CHECK64
declare i16 @llvm.bswap.i16(i16)
@@ -11,6 +12,9 @@ declare i64 @llvm.bswap.i64(i64)
define i16 @W(i16 %A) {
; CHECK-LABEL: W:
; CHECK: rolw $8, %ax
+
+; CHECK64-LABEL: W:
+; CHECK64: rolw $8, %
%Z = call i16 @llvm.bswap.i16( i16 %A ) ; <i16> [#uses=1]
ret i16 %Z
}
@@ -18,6 +22,9 @@ define i16 @W(i16 %A) {
define i32 @X(i32 %A) {
; CHECK-LABEL: X:
; CHECK: bswapl %eax
+
+; CHECK64-LABEL: X:
+; CHECK64: bswapl %
%Z = call i32 @llvm.bswap.i32( i32 %A ) ; <i32> [#uses=1]
ret i32 %Z
}
@@ -26,6 +33,9 @@ define i64 @Y(i64 %A) {
; CHECK-LABEL: Y:
; CHECK: bswapl %eax
; CHECK: bswapl %edx
+
+; CHECK64-LABEL: Y:
+; CHECK64: bswapq %
%Z = call i64 @llvm.bswap.i64( i64 %A ) ; <i64> [#uses=1]
ret i64 %Z
}
@@ -33,9 +43,13 @@ define i64 @Y(i64 %A) {
; rdar://9164521
define i32 @test1(i32 %a) nounwind readnone {
entry:
-; CHECK: test1
-; CHECK: bswapl %eax
-; CHECK: shrl $16, %eax
+; CHECK-LABEL: test1:
+; CHECK: bswapl [[REG:%.*]]
+; CHECK: shrl $16, [[REG]]
+
+; CHECK64-LABEL: test1:
+; CHECK64: bswapl [[REG:%.*]]
+; CHECK64: shrl $16, [[REG]]
%and = lshr i32 %a, 8
%shr3 = and i32 %and, 255
%and2 = shl i32 %a, 8
@@ -46,9 +60,13 @@ entry:
define i32 @test2(i32 %a) nounwind readnone {
entry:
-; CHECK: test2
-; CHECK: bswapl %eax
-; CHECK: sarl $16, %eax
+; CHECK-LABEL: test2:
+; CHECK: bswapl [[REG:%.*]]
+; CHECK: sarl $16, [[REG]]
+
+; CHECK64-LABEL: test2:
+; CHECK64: bswapl [[REG:%.*]]
+; CHECK64: sarl $16, [[REG]]
%and = lshr i32 %a, 8
%shr4 = and i32 %and, 255
%and2 = shl i32 %a, 8
@@ -57,3 +75,80 @@ entry:
%conv3 = ashr exact i32 %sext, 16
ret i32 %conv3
}
+
+@var8 = global i8 0
+@var16 = global i16 0
+
+; The "shl" below can move bits into the high parts of the value, so the
+; operation is not a "bswap, shr" pair.
+
+; rdar://problem/14814049
+define i64 @not_bswap() {
+; CHECK-LABEL: not_bswap:
+; CHECK-NOT: bswapl
+; CHECK: ret
+
+; CHECK64-LABEL: not_bswap:
+; CHECK64-NOT: bswapq
+; CHECK64: ret
+ %init = load i16* @var16
+ %big = zext i16 %init to i64
+
+ %hishifted = lshr i64 %big, 8
+ %loshifted = shl i64 %big, 8
+
+ %notswapped = or i64 %hishifted, %loshifted
+
+ ret i64 %notswapped
+}
+
+; This time, the lshr (and subsequent or) is completely useless. While it's
+; technically correct to convert this into a "bswap, shr", it's suboptimal. A
+; simple shl works better.
+
+define i64 @not_useful_bswap() {
+; CHECK-LABEL: not_useful_bswap:
+; CHECK-NOT: bswapl
+; CHECK: ret
+
+; CHECK64-LABEL: not_useful_bswap:
+; CHECK64-NOT: bswapq
+; CHECK64: ret
+
+ %init = load i8* @var8
+ %big = zext i8 %init to i64
+
+ %hishifted = lshr i64 %big, 8
+ %loshifted = shl i64 %big, 8
+
+ %notswapped = or i64 %hishifted, %loshifted
+
+ ret i64 %notswapped
+}
+
+; Finally, it *is* OK to just mask off the shl if we know that the value is zero
+; beyond 16 bits anyway. This is a legitimate bswap.
+
+define i64 @finally_useful_bswap() {
+; CHECK-LABEL: finally_useful_bswap:
+; CHECK: bswapl [[REG:%.*]]
+; CHECK: shrl $16, [[REG]]
+; CHECK: ret
+
+; CHECK64-LABEL: finally_useful_bswap:
+; CHECK64: bswapq [[REG:%.*]]
+; CHECK64: shrq $48, [[REG]]
+; CHECK64: ret
+
+ %init = load i16* @var16
+ %big = zext i16 %init to i64
+
+ %hishifted = lshr i64 %big, 8
+ %lomasked = and i64 %big, 255
+ %loshifted = shl i64 %lomasked, 8
+
+ %swapped = or i64 %hishifted, %loshifted
+
+ ret i64 %swapped
+}
+
diff --git a/test/CodeGen/X86/bt.ll b/test/CodeGen/X86/bt.ll
index e28923b..f12a354 100644
--- a/test/CodeGen/X86/bt.ll
+++ b/test/CodeGen/X86/bt.ll
@@ -38,7 +38,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @test2b(i32 %x, i32 %n) nounwind {
entry:
; CHECK: test2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jb
%tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
%tmp3 = and i32 1, %tmp29
@@ -56,7 +56,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @atest2(i32 %x, i32 %n) nounwind {
entry:
; CHECK: atest2
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jb
%tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
%tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
@@ -74,7 +74,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @atest2b(i32 %x, i32 %n) nounwind {
entry:
; CHECK: atest2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
%tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
%tmp3 = and i32 1, %tmp29
%tmp4 = icmp eq i32 %tmp3, 0 ; <i1> [#uses=1]
@@ -91,7 +91,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @test3(i32 %x, i32 %n) nounwind {
entry:
; CHECK: test3
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jb
%tmp29 = shl i32 1, %n ; <i32> [#uses=1]
%tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
@@ -109,7 +109,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @test3b(i32 %x, i32 %n) nounwind {
entry:
; CHECK: test3b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jb
%tmp29 = shl i32 1, %n ; <i32> [#uses=1]
%tmp3 = and i32 %x, %tmp29
@@ -127,7 +127,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @testne2(i32 %x, i32 %n) nounwind {
entry:
; CHECK: testne2
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jae
%tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
%tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
@@ -145,7 +145,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @testne2b(i32 %x, i32 %n) nounwind {
entry:
; CHECK: testne2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jae
%tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
%tmp3 = and i32 1, %tmp29
@@ -163,7 +163,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @atestne2(i32 %x, i32 %n) nounwind {
entry:
; CHECK: atestne2
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jae
%tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
%tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
@@ -181,7 +181,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @atestne2b(i32 %x, i32 %n) nounwind {
entry:
; CHECK: atestne2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jae
%tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
%tmp3 = and i32 1, %tmp29
@@ -199,7 +199,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @testne3(i32 %x, i32 %n) nounwind {
entry:
; CHECK: testne3
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jae
%tmp29 = shl i32 1, %n ; <i32> [#uses=1]
%tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
@@ -217,7 +217,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @testne3b(i32 %x, i32 %n) nounwind {
entry:
; CHECK: testne3b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jae
%tmp29 = shl i32 1, %n ; <i32> [#uses=1]
%tmp3 = and i32 %x, %tmp29
@@ -235,7 +235,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @query2(i32 %x, i32 %n) nounwind {
entry:
; CHECK: query2
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jae
%tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
%tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
@@ -253,7 +253,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @query2b(i32 %x, i32 %n) nounwind {
entry:
; CHECK: query2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jae
%tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
%tmp3 = and i32 1, %tmp29
@@ -271,7 +271,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @aquery2(i32 %x, i32 %n) nounwind {
entry:
; CHECK: aquery2
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jae
%tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
%tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
@@ -289,7 +289,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @aquery2b(i32 %x, i32 %n) nounwind {
entry:
; CHECK: aquery2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jae
%tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
%tmp3 = and i32 1, %tmp29
@@ -307,7 +307,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @query3(i32 %x, i32 %n) nounwind {
entry:
; CHECK: query3
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jae
%tmp29 = shl i32 1, %n ; <i32> [#uses=1]
%tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
@@ -325,7 +325,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @query3b(i32 %x, i32 %n) nounwind {
entry:
; CHECK: query3b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jae
%tmp29 = shl i32 1, %n ; <i32> [#uses=1]
%tmp3 = and i32 %x, %tmp29
@@ -343,7 +343,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @query3x(i32 %x, i32 %n) nounwind {
entry:
; CHECK: query3x
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jae
%tmp29 = shl i32 1, %n ; <i32> [#uses=1]
%tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
@@ -361,7 +361,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @query3bx(i32 %x, i32 %n) nounwind {
entry:
; CHECK: query3bx
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jae
%tmp29 = shl i32 1, %n ; <i32> [#uses=1]
%tmp3 = and i32 %x, %tmp29
@@ -379,7 +379,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @queryne2(i32 %x, i32 %n) nounwind {
entry:
; CHECK: queryne2
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jb
%tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
%tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
@@ -397,7 +397,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @queryne2b(i32 %x, i32 %n) nounwind {
entry:
; CHECK: queryne2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jb
%tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
%tmp3 = and i32 1, %tmp29
@@ -415,7 +415,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @aqueryne2(i32 %x, i32 %n) nounwind {
entry:
; CHECK: aqueryne2
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jb
%tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
%tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
@@ -433,7 +433,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @aqueryne2b(i32 %x, i32 %n) nounwind {
entry:
; CHECK: aqueryne2b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jb
%tmp29 = ashr i32 %x, %n ; <i32> [#uses=1]
%tmp3 = and i32 1, %tmp29
@@ -451,7 +451,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @queryne3(i32 %x, i32 %n) nounwind {
entry:
; CHECK: queryne3
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jb
%tmp29 = shl i32 1, %n ; <i32> [#uses=1]
%tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
@@ -469,7 +469,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @queryne3b(i32 %x, i32 %n) nounwind {
entry:
; CHECK: queryne3b
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jb
%tmp29 = shl i32 1, %n ; <i32> [#uses=1]
%tmp3 = and i32 %x, %tmp29
@@ -487,7 +487,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @queryne3x(i32 %x, i32 %n) nounwind {
entry:
; CHECK: queryne3x
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jb
%tmp29 = shl i32 1, %n ; <i32> [#uses=1]
%tmp3 = and i32 %tmp29, %x ; <i32> [#uses=1]
@@ -505,7 +505,7 @@ UnifiedReturnBlock: ; preds = %entry
define void @queryne3bx(i32 %x, i32 %n) nounwind {
entry:
; CHECK: queryne3bx
-; CHECK: btl %eax, %ecx
+; CHECK: btl %e{{..}}, %e{{..}}
; CHECK: jb
%tmp29 = shl i32 1, %n ; <i32> [#uses=1]
%tmp3 = and i32 %x, %tmp29
diff --git a/test/CodeGen/X86/byval7.ll b/test/CodeGen/X86/byval7.ll
index 8a96e41..42751d7 100644
--- a/test/CodeGen/X86/byval7.ll
+++ b/test/CodeGen/X86/byval7.ll
@@ -7,14 +7,14 @@
define i32 @main() nounwind {
entry:
; CHECK-LABEL: main:
-; CHECK: movl $1, (%esp)
; CHECK: leal 16(%esp), %edi
; CHECK: leal 160(%esp), %esi
; CHECK: rep;movsl
+; CHECK: movl $1, (%esp)
%s = alloca %struct.S ; <%struct.S*> [#uses=2]
%tmp15 = getelementptr %struct.S* %s, i32 0, i32 0 ; <<2 x i64>*> [#uses=1]
store <2 x i64> < i64 8589934595, i64 1 >, <2 x i64>* %tmp15, align 16
- call void @t( i32 1, %struct.S* byval %s ) nounwind
+ call void @t( i32 1, %struct.S* byval %s ) nounwind
ret i32 0
}
diff --git a/test/CodeGen/X86/chain_order.ll b/test/CodeGen/X86/chain_order.ll
index 8c1c864..c88726e 100644
--- a/test/CodeGen/X86/chain_order.ll
+++ b/test/CodeGen/X86/chain_order.ll
@@ -3,8 +3,8 @@
;CHECK-LABEL: cftx020:
;CHECK: vmovsd (%rdi), %xmm{{.*}}
;CHECK: vmovsd 16(%rdi), %xmm{{.*}}
-;CHECK: vmovhpd 8(%rdi), %xmm{{.*}}
;CHECK: vmovsd 24(%rdi), %xmm{{.*}}
+;CHECK: vmovhpd 8(%rdi), %xmm{{.*}}
;CHECK: vmovupd %xmm{{.*}}, (%rdi)
;CHECK: vmovupd %xmm{{.*}}, 16(%rdi)
;CHECK: ret
@@ -35,4 +35,3 @@ entry:
store <2 x double> %14, <2 x double>* %15, align 8
ret void
}
-
diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll
index 92c0445..215b862 100644
--- a/test/CodeGen/X86/cmov.ll
+++ b/test/CodeGen/X86/cmov.ll
@@ -4,8 +4,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
define i32 @test1(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone {
entry:
; CHECK-LABEL: test1:
-; CHECK: movl $12, %eax
-; CHECK-NEXT: btl
+; CHECK: btl
+; CHECK-NEXT: movl $12, %eax
; CHECK-NEXT: cmovael (%rcx), %eax
; CHECK-NEXT: ret
@@ -19,8 +19,8 @@ entry:
define i32 @test2(i32 %x, i32 %n, i32 %w, i32* %vp) nounwind readnone {
entry:
; CHECK-LABEL: test2:
-; CHECK: movl $12, %eax
-; CHECK-NEXT: btl
+; CHECK: btl
+; CHECK-NEXT: movl $12, %eax
; CHECK-NEXT: cmovbl (%rcx), %eax
; CHECK-NEXT: ret
@@ -92,7 +92,7 @@ bb.i.i.i: ; preds = %entry
; CHECK: testb
; CHECK-NOT: xor
; CHECK: setne
-; CHECK-NEXT: testb
+; CHECK: testb
func_4.exit.i: ; preds = %bb.i.i.i, %entry
%.not.i = xor i1 %2, true ; <i1> [#uses=1]
diff --git a/test/CodeGen/X86/coalesce-implicitdef.ll b/test/CodeGen/X86/coalesce-implicitdef.ll
index 19cd08c..9be0452 100644
--- a/test/CodeGen/X86/coalesce-implicitdef.ll
+++ b/test/CodeGen/X86/coalesce-implicitdef.ll
@@ -26,7 +26,7 @@ for.cond: ; preds = %for.inc34, %entry
br i1 %tobool, label %for.end36, label %for.body
for.body: ; preds = %for.cond
- store i32 0, i32* @c, align 4, !tbaa !0
+ store i32 0, i32* @c, align 4
br label %for.body2
for.body2: ; preds = %for.body, %for.inc
@@ -35,7 +35,7 @@ for.body2: ; preds = %for.body, %for.inc
br i1 %tobool3, label %if.then10, label %if.then
if.then: ; preds = %for.body2
- store i32 0, i32* %i, align 4, !tbaa !0
+ store i32 0, i32* %i, align 4
br label %for.body6
for.body6: ; preds = %if.then, %for.body6
@@ -43,7 +43,7 @@ for.body6: ; preds = %if.then, %for.body6
br i1 true, label %for.body6, label %for.inc
if.then10: ; preds = %for.body2
- store i32 1, i32* @b, align 4, !tbaa !0
+ store i32 1, i32* @b, align 4
ret void
for.inc: ; preds = %for.body6
@@ -66,30 +66,30 @@ while.end: ; preds = %while.cond
for.inc27.backedge: ; preds = %while.end, %if.then22
%inc28 = add nsw i32 %0, 1
- store i32 %inc28, i32* @b, align 4, !tbaa !0
+ store i32 %inc28, i32* @b, align 4
%tobool17 = icmp eq i32 %inc28, 0
br i1 %tobool17, label %for.inc27.if.end30.loopexit56_crit_edge, label %while.condthread-pre-split
if.then22: ; preds = %while.end
- %1 = load i16* %p2.1, align 2, !tbaa !3
+ %1 = load i16* %p2.1, align 2
%tobool23 = icmp eq i16 %1, 0
br i1 %tobool23, label %for.inc27.backedge, label %label.loopexit
label.loopexit: ; preds = %if.then22
- store i32 %inc20, i32* @a, align 4, !tbaa !0
+ store i32 %inc20, i32* @a, align 4
%inc2858 = add nsw i32 %0, 1
- store i32 %inc2858, i32* @b, align 4, !tbaa !0
+ store i32 %inc2858, i32* @b, align 4
%tobool1759 = icmp eq i32 %inc2858, 0
br i1 %tobool1759, label %if.end30, label %while.condthread-pre-split
for.inc27.if.end30.loopexit56_crit_edge: ; preds = %for.inc27.backedge
- store i32 %inc20, i32* @a, align 4, !tbaa !0
+ store i32 %inc20, i32* @a, align 4
br label %if.end30
if.end30: ; preds = %for.inc27.if.end30.loopexit56_crit_edge, %label.loopexit, %label.preheader, %for.inc
%i.0.load46 = phi i32 [ 0, %for.inc ], [ %i.0.load4669, %label.preheader ], [ %i.0.load4669, %label.loopexit ], [ %i.0.load4669, %for.inc27.if.end30.loopexit56_crit_edge ]
%pi.4 = phi i32* [ %i, %for.inc ], [ %pi.3.ph, %label.preheader ], [ %pi.3.ph, %label.loopexit ], [ %pi.3.ph, %for.inc27.if.end30.loopexit56_crit_edge ]
- %2 = load i32* %pi.4, align 4, !tbaa !0
+ %2 = load i32* %pi.4, align 4
%tobool31 = icmp eq i32 %2, 0
br i1 %tobool31, label %for.inc34, label %label.preheader
@@ -100,31 +100,26 @@ for.inc34: ; preds = %if.end30
for.end36: ; preds = %for.cond
store i32 1, i32* %i, align 4
- %3 = load i32* @c, align 4, !tbaa !0
+ %3 = load i32* @c, align 4
%tobool37 = icmp eq i32 %3, 0
br i1 %tobool37, label %label.preheader, label %land.rhs
land.rhs: ; preds = %for.end36
- store i32 0, i32* @a, align 4, !tbaa !0
+ store i32 0, i32* @a, align 4
br label %label.preheader
label.preheader: ; preds = %for.end36, %if.end30, %land.rhs
%i.0.load4669 = phi i32 [ 1, %land.rhs ], [ %i.0.load46, %if.end30 ], [ 1, %for.end36 ]
%pi.3.ph = phi i32* [ %pi.0, %land.rhs ], [ %pi.4, %if.end30 ], [ %pi.0, %for.end36 ]
- %4 = load i32* @b, align 4, !tbaa !0
+ %4 = load i32* @b, align 4
%inc285863 = add nsw i32 %4, 1
- store i32 %inc285863, i32* @b, align 4, !tbaa !0
+ store i32 %inc285863, i32* @b, align 4
%tobool175964 = icmp eq i32 %inc285863, 0
br i1 %tobool175964, label %if.end30, label %while.condthread-pre-split.lr.ph.lr.ph
while.condthread-pre-split.lr.ph.lr.ph: ; preds = %label.preheader
- %.pr50 = load i32* @d, align 4, !tbaa !0
+ %.pr50 = load i32* @d, align 4
%tobool19 = icmp eq i32 %.pr50, 0
- %a.promoted.pre = load i32* @a, align 4, !tbaa !0
+ %a.promoted.pre = load i32* @a, align 4
br label %while.condthread-pre-split
}
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"short", metadata !1}
diff --git a/test/CodeGen/X86/coff-feat00.ll b/test/CodeGen/X86/coff-feat00.ll
new file mode 100644
index 0000000..1dcd427
--- /dev/null
+++ b/test/CodeGen/X86/coff-feat00.ll
@@ -0,0 +1,7 @@
+; RUN: llc -O0 -mtriple=i386-pc-win32 -filetype=asm -o - %s | FileCheck %s
+
+define i32 @foo() {
+ ret i32 0
+}
+
+; CHECK: @feat.00 = 1
diff --git a/test/CodeGen/X86/commute-two-addr.ll b/test/CodeGen/X86/commute-two-addr.ll
index eb44e08..656c385 100644
--- a/test/CodeGen/X86/commute-two-addr.ll
+++ b/test/CodeGen/X86/commute-two-addr.ll
@@ -38,10 +38,10 @@ define i32 @t2(i32 %X, i32 %Y) nounwind {
define %0 @t3(i32 %lb, i8 zeroext %has_lb, i8 zeroext %lb_inclusive, i32 %ub, i8 zeroext %has_ub, i8 zeroext %ub_inclusive) nounwind {
entry:
; DARWIN-LABEL: t3:
-; DARWIN: shll $16
; DARWIN: shlq $32, %rcx
+; DARWIN-NEXT: orq %rcx, %rax
+; DARWIN-NEXT: shll $8
; DARWIN-NOT: leaq
-; DARWIN: orq %rcx, %rax
%tmp21 = zext i32 %lb to i64
%tmp23 = zext i32 %ub to i64
%tmp24 = shl i64 %tmp23, 32
diff --git a/test/CodeGen/X86/compact-unwind.ll b/test/CodeGen/X86/compact-unwind.ll
index 8c4fa27..9d3a125 100644
--- a/test/CodeGen/X86/compact-unwind.ll
+++ b/test/CodeGen/X86/compact-unwind.ll
@@ -1,18 +1,29 @@
-; RUN: llc < %s -disable-cfi -disable-fp-elim -mtriple x86_64-apple-darwin11 | FileCheck %s
+; RUN: llc < %s -disable-fp-elim -mtriple x86_64-apple-darwin11 -mcpu corei7 | FileCheck -check-prefix=ASM %s
+; RUN: llc < %s -disable-fp-elim -mtriple x86_64-apple-darwin11 -mcpu corei7 -filetype=obj -o - \
+; RUN: | llvm-objdump -triple x86_64-apple-darwin11 -s - \
+; RUN: | FileCheck -check-prefix=CU %s
+; RUN: llc < %s -disable-fp-elim -mtriple x86_64-apple-darwin11 -mcpu corei7 \
+; RUN: | llvm-mc -triple x86_64-apple-darwin11 -filetype=obj -o - \
+; RUN: | llvm-objdump -triple x86_64-apple-darwin11 -s - \
+; RUN: | FileCheck -check-prefix=FROM-ASM %s
%ty = type { i8* }
@gv = external global i32
; This is aligning the stack with a push of a random register.
-; CHECK: pushq %rax
+; ASM: pushq %rax
; Even though we can't encode %rax into the compact unwind, We still want to be
; able to generate a compact unwind encoding in this particular case.
-;
-; CHECK: __LD,__compact_unwind
-; CHECK: _foo ## Range Start
-; CHECK: 16842753 ## Compact Unwind Encoding: 0x1010001
+
+; CU: Contents of section __compact_unwind:
+; CU-NEXT: 0020 00000000 00000000 1e000000 01000101
+; CU-NEXT: 0030 00000000 00000000 00000000 00000000
+
+; FROM-ASM: Contents of section __compact_unwind:
+; FROM-ASM-NEXT: 0020 00000000 00000000 1e000000 01000101
+; FROM-ASM-NEXT: 0030 00000000 00000000 00000000 00000000
define i8* @foo(i64 %size) {
%addr = alloca i64, align 8
diff --git a/test/CodeGen/X86/crash-nosse.ll b/test/CodeGen/X86/crash-nosse.ll
index 7a15a47..b1e01f9 100644
--- a/test/CodeGen/X86/crash-nosse.ll
+++ b/test/CodeGen/X86/crash-nosse.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=corei7 -mattr=-sse2,-sse41 -verify-machineinstrs
+; RUN: llc < %s -mcpu=corei7 -mattr=-sse2,-sse4.1 -verify-machineinstrs
target triple = "x86_64-unknown-linux-gnu"
; PR10503
diff --git a/test/CodeGen/X86/crash.ll b/test/CodeGen/X86/crash.ll
index b0a0e24..051150e 100644
--- a/test/CodeGen/X86/crash.ll
+++ b/test/CodeGen/X86/crash.ll
@@ -204,7 +204,7 @@ entry:
; <rdar://problem/9187792>
define fastcc void @func_61() nounwind sspreq {
entry:
- %t1 = tail call i64 @llvm.objectsize.i64(i8* undef, i1 false)
+ %t1 = tail call i64 @llvm.objectsize.i64.p0i8(i8* undef, i1 false)
%t2 = icmp eq i64 %t1, -1
br i1 %t2, label %bb2, label %bb1
@@ -215,7 +215,7 @@ bb2:
ret void
}
-declare i64 @llvm.objectsize.i64(i8*, i1) nounwind readnone
+declare i64 @llvm.objectsize.i64.p0i8(i8*, i1) nounwind readnone
; PR10277
; This test has dead code elimination caused by remat during spilling.
diff --git a/test/CodeGen/X86/dagcombine-shifts.ll b/test/CodeGen/X86/dagcombine-shifts.ll
new file mode 100644
index 0000000..905cf05
--- /dev/null
+++ b/test/CodeGen/X86/dagcombine-shifts.ll
@@ -0,0 +1,209 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s
+
+; fold (shl (zext (lshr (A, X))), X) -> (zext (shl (lshr (A, X)), X))
+
+; Canonicalize the sequence shl/zext/lshr, performing the zero-extend
+; as the last instruction of the sequence.
+; This will help DAGCombiner to identify and then fold the sequence
+; of shifts into a single AND.
+; This transformation is profitable if the shift amounts are the same
+; and if there is only one use of the zext.
+
+define i16 @fun1(i8 zeroext %v) {
+entry:
+ %shr = lshr i8 %v, 4
+ %ext = zext i8 %shr to i16
+ %shl = shl i16 %ext, 4
+ ret i16 %shl
+}
+
+; CHECK-LABEL: @fun1
+; CHECK: and
+; CHECK-NOT: shr
+; CHECK-NOT: shl
+; CHECK: ret
+
+define i32 @fun2(i8 zeroext %v) {
+entry:
+ %shr = lshr i8 %v, 4
+ %ext = zext i8 %shr to i32
+ %shl = shl i32 %ext, 4
+ ret i32 %shl
+}
+
+; CHECK-LABEL: @fun2
+; CHECK: and
+; CHECK-NOT: shr
+; CHECK-NOT: shl
+; CHECK: ret
+
+define i32 @fun3(i16 zeroext %v) {
+entry:
+ %shr = lshr i16 %v, 4
+ %ext = zext i16 %shr to i32
+ %shl = shl i32 %ext, 4
+ ret i32 %shl
+}
+
+; CHECK-LABEL: @fun3
+; CHECK: and
+; CHECK-NOT: shr
+; CHECK-NOT: shl
+; CHECK: ret
+
+define i64 @fun4(i8 zeroext %v) {
+entry:
+ %shr = lshr i8 %v, 4
+ %ext = zext i8 %shr to i64
+ %shl = shl i64 %ext, 4
+ ret i64 %shl
+}
+
+; CHECK-LABEL: @fun4
+; CHECK: and
+; CHECK-NOT: shr
+; CHECK-NOT: shl
+; CHECK: ret
+
+define i64 @fun5(i16 zeroext %v) {
+entry:
+ %shr = lshr i16 %v, 4
+ %ext = zext i16 %shr to i64
+ %shl = shl i64 %ext, 4
+ ret i64 %shl
+}
+
+; CHECK-LABEL: @fun5
+; CHECK: and
+; CHECK-NOT: shr
+; CHECK-NOT: shl
+; CHECK: ret
+
+define i64 @fun6(i32 zeroext %v) {
+entry:
+ %shr = lshr i32 %v, 4
+ %ext = zext i32 %shr to i64
+ %shl = shl i64 %ext, 4
+ ret i64 %shl
+}
+
+; CHECK-LABEL: @fun6
+; CHECK: and
+; CHECK-NOT: shr
+; CHECK-NOT: shl
+; CHECK: ret
+
+; Don't fold the pattern if we use arithmetic shifts.
+
+define i64 @fun7(i8 zeroext %v) {
+entry:
+ %shr = ashr i8 %v, 4
+ %ext = zext i8 %shr to i64
+ %shl = shl i64 %ext, 4
+ ret i64 %shl
+}
+
+; CHECK-LABEL: @fun7
+; CHECK: sar
+; CHECK: shl
+; CHECK: ret
+
+define i64 @fun8(i16 zeroext %v) {
+entry:
+ %shr = ashr i16 %v, 4
+ %ext = zext i16 %shr to i64
+ %shl = shl i64 %ext, 4
+ ret i64 %shl
+}
+
+; CHECK-LABEL: @fun8
+; CHECK: sar
+; CHECK: shl
+; CHECK: ret
+
+define i64 @fun9(i32 zeroext %v) {
+entry:
+ %shr = ashr i32 %v, 4
+ %ext = zext i32 %shr to i64
+ %shl = shl i64 %ext, 4
+ ret i64 %shl
+}
+
+; CHECK-LABEL: @fun9
+; CHECK: sar
+; CHECK: shl
+; CHECK: ret
+
+; Don't fold the pattern if there is more than one use of the
+; value that feeds the shift left.
+
+define i64 @fun10(i8 zeroext %v) {
+entry:
+ %shr = lshr i8 %v, 4
+ %ext = zext i8 %shr to i64
+ %shl = shl i64 %ext, 4
+ %add = add i64 %shl, %ext
+ ret i64 %add
+}
+
+; CHECK-LABEL: @fun10
+; CHECK: shr
+; CHECK: shl
+; CHECK: ret
+
+define i64 @fun11(i16 zeroext %v) {
+entry:
+ %shr = lshr i16 %v, 4
+ %ext = zext i16 %shr to i64
+ %shl = shl i64 %ext, 4
+ %add = add i64 %shl, %ext
+ ret i64 %add
+}
+
+; CHECK-LABEL: @fun11
+; CHECK: shr
+; CHECK: shl
+; CHECK: ret
+
+define i64 @fun12(i32 zeroext %v) {
+entry:
+ %shr = lshr i32 %v, 4
+ %ext = zext i32 %shr to i64
+ %shl = shl i64 %ext, 4
+ %add = add i64 %shl, %ext
+ ret i64 %add
+}
+
+; CHECK-LABEL: @fun12
+; CHECK: shr
+; CHECK: shl
+; CHECK: ret
+
+; PR17380
+; Make sure that the combined dags are legal if we run the DAGCombiner after
+; Legalization has taken place. The add instruction is redundant and increases
+; the number of uses of the zext by one. This prevents the transformation from
+; firing before the dags are legalized and optimized.
+; Once the add is removed, the number of uses drops to one and the dags are
+; canonicalized. After Legalization we also need to make sure that the value
+; type for the shift count is legal.
+; Also verify that we correctly fold the shl-shr sequence into an
+; AND with a bitmask.
+
+define void @g(i32 %a) {
+ %b = lshr i32 %a, 2
+ %c = zext i32 %b to i64
+ %d = add i64 %c, 1
+ %e = shl i64 %c, 2
+ tail call void @f(i64 %e)
+ ret void
+}
+
+; CHECK-LABEL: @g
+; CHECK-NOT: shr
+; CHECK-NOT: shl
+; CHECK: and
+; CHECK-NEXT: jmp
+
+declare void @f(i64)
+
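To make the intent of these tests concrete: when the shift amounts match and the zext has a single use, the lshr/zext/shl chain collapses into a zero-extend plus a single AND. A rough sketch of what @fun1 folds to (the @fun1_folded name is hypothetical):

define i16 @fun1_folded(i8 zeroext %v) {
entry:
  ; Bits [3:0] are dropped by the lshr and re-zeroed by the shl, so the
  ; whole sequence reduces to the low-nibble mask: (zext %v) & 0xF0.
  %ext = zext i8 %v to i16
  %and = and i16 %ext, 240
  ret i16 %and
}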
diff --git a/test/CodeGen/X86/dagcombine_unsafe_math.ll b/test/CodeGen/X86/dagcombine-unsafe-math.ll
index 592cf1b..f06d9f1 100644
--- a/test/CodeGen/X86/dagcombine_unsafe_math.ll
+++ b/test/CodeGen/X86/dagcombine-unsafe-math.ll
@@ -43,7 +43,7 @@ define float @test4(float %x, float %y) {
; rdar://13445387
; "x + x + x => 3.0 * x" should be disabled after legalization because
-; Instruction-Selection dosen't know how to handle "3.0"
+; Instruction-Selection doesn't know how to handle "3.0"
;
define float @test5() {
%mul.i.i151 = fmul <4 x float> zeroinitializer, zeroinitializer
diff --git a/test/CodeGen/X86/dbg-at-specficiation.ll b/test/CodeGen/X86/dbg-at-specficiation.ll
deleted file mode 100644
index a6eebcb..0000000
--- a/test/CodeGen/X86/dbg-at-specficiation.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; RUN: llc < %s | FileCheck %s
-; Radar 10147769
-; Do not unnecessarily use AT_specification DIE.
-; CHECK-NOT: AT_specification
-
-@a = common global [10 x i32] zeroinitializer, align 16
-
-!llvm.dbg.cu = !{!0}
-
-!0 = metadata !{i32 720913, metadata !11, i32 12, metadata !"clang version 3.0 (trunk 140253)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !3, null, i32 0} ; [ DW_TAG_compile_unit ]
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !5}
-!5 = metadata !{i32 720948, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 1, metadata !7, i32 0, i32 1, [10 x i32]* @a, null} ; [ DW_TAG_variable ]
-!6 = metadata !{i32 720937, metadata !11} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720897, null, metadata !"", null, i32 0, i64 320, i64 32, i32 0, i32 0, metadata !8, metadata !9, i32 0, i32 0} ; [ DW_TAG_array_type ]
-!8 = metadata !{i32 720932, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!9 = metadata !{metadata !10}
-!10 = metadata !{i32 720929, i64 0, i64 10} ; [ DW_TAG_subrange_type ]
-!11 = metadata !{metadata !"x.c", metadata !"/private/tmp"}
diff --git a/test/CodeGen/X86/dbg-byval-parameter.ll b/test/CodeGen/X86/dbg-byval-parameter.ll
deleted file mode 100644
index ef9e03c..0000000
--- a/test/CodeGen/X86/dbg-byval-parameter.ll
+++ /dev/null
@@ -1,50 +0,0 @@
-; RUN: llc -march=x86 -asm-verbose < %s | grep DW_TAG_formal_parameter
-
-
-%struct.Pt = type { double, double }
-%struct.Rect = type { %struct.Pt, %struct.Pt }
-
-define double @foo(%struct.Rect* byval %my_r0) nounwind ssp {
-entry:
- %retval = alloca double ; <double*> [#uses=2]
- %0 = alloca double ; <double*> [#uses=2]
- %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
- call void @llvm.dbg.declare(metadata !{%struct.Rect* %my_r0}, metadata !0), !dbg !15
- %1 = getelementptr inbounds %struct.Rect* %my_r0, i32 0, i32 0, !dbg !16 ; <%struct.Pt*> [#uses=1]
- %2 = getelementptr inbounds %struct.Pt* %1, i32 0, i32 0, !dbg !16 ; <double*> [#uses=1]
- %3 = load double* %2, align 8, !dbg !16 ; <double> [#uses=1]
- store double %3, double* %0, align 8, !dbg !16
- %4 = load double* %0, align 8, !dbg !16 ; <double> [#uses=1]
- store double %4, double* %retval, align 8, !dbg !16
- br label %return, !dbg !16
-
-return: ; preds = %entry
- %retval1 = load double* %retval, !dbg !16 ; <double> [#uses=1]
- ret double %retval1, !dbg !16
-}
-
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
-
-!llvm.dbg.cu = !{!3}
-
-!0 = metadata !{i32 786689, metadata !1, metadata !"my_r0", metadata !2, i32 11, metadata !7, i32 0, null} ; [ DW_TAG_arg_variable ]
-!1 = metadata !{i32 786478, metadata !19, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 11, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, double (%struct.Rect*)* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786449, metadata !19, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 false, metadata !"", i32 0, metadata !20, metadata !20, metadata !18, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !19, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null} ; [ DW_TAG_subroutine_type ]
-!5 = metadata !{metadata !6, metadata !7}
-!6 = metadata !{i32 786468, metadata !19, metadata !2, metadata !"double", i32 0, i64 64, i64 64, i64 0, i32 0, i32 4} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Rect", i32 6, i64 256, i64 64, i64 0, i32 0, null, metadata !8, i32 0, null} ; [ DW_TAG_structure_type ]
-!8 = metadata !{metadata !9, metadata !14}
-!9 = metadata !{i32 786445, metadata !19, metadata !7, metadata !"P1", i32 7, i64 128, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_member ]
-!10 = metadata !{i32 786451, metadata !19, metadata !2, metadata !"Pt", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !11, i32 0, null} ; [ DW_TAG_structure_type ]
-!11 = metadata !{metadata !12, metadata !13}
-!12 = metadata !{i32 786445, metadata !19, metadata !10, metadata !"x", i32 2, i64 64, i64 64, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ]
-!13 = metadata !{i32 786445, metadata !19, metadata !10, metadata !"y", i32 3, i64 64, i64 64, i64 64, i32 0, metadata !6} ; [ DW_TAG_member ]
-!14 = metadata !{i32 786445, metadata !19, metadata !7, metadata !"P2", i32 8, i64 128, i64 64, i64 128, i32 0, metadata !10} ; [ DW_TAG_member ]
-!15 = metadata !{i32 11, i32 0, metadata !1, null}
-!16 = metadata !{i32 12, i32 0, metadata !17, null}
-!17 = metadata !{i32 786443, metadata !19, metadata !1, i32 11, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!18 = metadata !{metadata !1}
-!19 = metadata !{metadata !"b2.c", metadata !"/tmp/"}
-!20 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/dbg-const-int.ll b/test/CodeGen/X86/dbg-const-int.ll
deleted file mode 100644
index fc4ff6d..0000000
--- a/test/CodeGen/X86/dbg-const-int.ll
+++ /dev/null
@@ -1,31 +0,0 @@
-; RUN: llc < %s - | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-macosx10.6.7"
-; Radar 9511391
-
-;CHECK: .byte 4 ## DW_AT_const_value
-define i32 @foo() nounwind uwtable readnone optsize ssp {
-entry:
- tail call void @llvm.dbg.value(metadata !8, i64 0, metadata !6), !dbg !9
- ret i32 42, !dbg !10
-}
-
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
-
-!llvm.dbg.cu = !{!0}
-
-!0 = metadata !{i32 786449, metadata !13, i32 12, metadata !"clang version 3.0 (trunk 132191)", i1 true, metadata !"", i32 0, metadata !14, metadata !14, metadata !11, null, null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !13, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 true, i32 ()* @foo, null, null, metadata !12, i32 0} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !13} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !13, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786688, metadata !7, metadata !"i", metadata !2, i32 2, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!7 = metadata !{i32 786443, metadata !13, metadata !1, i32 1, i32 11, i32 0} ; [ DW_TAG_lexical_block ]
-!8 = metadata !{i32 42}
-!9 = metadata !{i32 2, i32 12, metadata !7, null}
-!10 = metadata !{i32 3, i32 2, metadata !7, null}
-!11 = metadata !{metadata !1}
-!12 = metadata !{metadata !6}
-!13 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
-!14 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/dbg-const.ll b/test/CodeGen/X86/dbg-const.ll
deleted file mode 100644
index b37eb0a..0000000
--- a/test/CodeGen/X86/dbg-const.ll
+++ /dev/null
@@ -1,49 +0,0 @@
-; RUN: llc < %s - | FileCheck %s
-;
-; FIXME: A potentially more interesting test case would be:
-; %call = @bar()
-; dbg.value j=0
-; %call2 = @bar()
-; dbg.value j=%call
-;
-; We cannot current handle the above sequence because codegenprepare
-; hoists the second dbg.value above %call2, which then appears to
-; conflict with j=0. It does this because SelectionDAG cannot handle
-; global debug values.
-
-target triple = "x86_64-apple-darwin10.0.0"
-
-;CHECK: ## DW_OP_constu
-;CHECK-NEXT: .byte 42
-define i32 @foobar() nounwind readonly noinline ssp {
-entry:
- tail call void @llvm.dbg.value(metadata !8, i64 0, metadata !6), !dbg !9
- %call = tail call i32 @bar(), !dbg !11
- tail call void @llvm.dbg.value(metadata !{i32 %call}, i64 0, metadata !6), !dbg !11
- %call2 = tail call i32 @bar(), !dbg !11
- %add = add nsw i32 %call2, %call, !dbg !12
- ret i32 %add, !dbg !10
-}
-
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
-declare i32 @bar() nounwind readnone
-
-!llvm.dbg.cu = !{!2}
-
-!0 = metadata !{i32 786478, metadata !15, metadata !1, metadata !"foobar", metadata !"foobar", metadata !"foobar", i32 12, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, i32 ()* @foobar, null, null, metadata !14, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !15} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !15, i32 12, metadata !"clang version 2.9 (trunk 114183)", i1 true, metadata !"", i32 0, metadata !16, metadata !16, metadata !13, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !15, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null}
-!4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, metadata !15, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}
-!6 = metadata !{i32 786688, metadata !7, metadata !"j", metadata !1, i32 15, metadata !5, i32 0, null}
-!7 = metadata !{i32 786443, metadata !15, metadata !0, i32 12, i32 52, i32 0} ; [ DW_TAG_lexical_block ]
-!8 = metadata !{i32 42}
-!9 = metadata !{i32 15, i32 12, metadata !7, null}
-!10 = metadata !{i32 23, i32 3, metadata !7, null}
-!11 = metadata !{i32 17, i32 3, metadata !7, null}
-!12 = metadata !{i32 18, i32 3, metadata !7, null}
-!13 = metadata !{metadata !0}
-!14 = metadata !{metadata !6}
-!15 = metadata !{metadata !"mu.c", metadata !"/private/tmp"}
-!16 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/dbg-declare-arg.ll b/test/CodeGen/X86/dbg-declare-arg.ll
deleted file mode 100644
index 55b4238..0000000
--- a/test/CodeGen/X86/dbg-declare-arg.ll
+++ /dev/null
@@ -1,125 +0,0 @@
-; RUN: llc -O0 -fast-isel=false < %s | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-macosx10.6.7"
-;Radar 9321650
-
-;CHECK: ##DEBUG_VALUE: my_a
-
-%class.A = type { i32, i32, i32, i32 }
-
-define void @_Z3fooi(%class.A* sret %agg.result, i32 %i) ssp {
-entry:
- %i.addr = alloca i32, align 4
- %j = alloca i32, align 4
- %nrvo = alloca i1
- %cleanup.dest.slot = alloca i32
- store i32 %i, i32* %i.addr, align 4
- call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !26), !dbg !27
- call void @llvm.dbg.declare(metadata !{i32* %j}, metadata !28), !dbg !30
- store i32 0, i32* %j, align 4, !dbg !31
- %tmp = load i32* %i.addr, align 4, !dbg !32
- %cmp = icmp eq i32 %tmp, 42, !dbg !32
- br i1 %cmp, label %if.then, label %if.end, !dbg !32
-
-if.then: ; preds = %entry
- %tmp1 = load i32* %i.addr, align 4, !dbg !33
- %add = add nsw i32 %tmp1, 1, !dbg !33
- store i32 %add, i32* %j, align 4, !dbg !33
- br label %if.end, !dbg !35
-
-if.end: ; preds = %if.then, %entry
- store i1 false, i1* %nrvo, !dbg !36
- call void @llvm.dbg.declare(metadata !{%class.A* %agg.result}, metadata !37), !dbg !39
- %tmp2 = load i32* %j, align 4, !dbg !40
- %x = getelementptr inbounds %class.A* %agg.result, i32 0, i32 0, !dbg !40
- store i32 %tmp2, i32* %x, align 4, !dbg !40
- store i1 true, i1* %nrvo, !dbg !41
- store i32 1, i32* %cleanup.dest.slot
- %nrvo.val = load i1* %nrvo, !dbg !42
- br i1 %nrvo.val, label %nrvo.skipdtor, label %nrvo.unused, !dbg !42
-
-nrvo.unused: ; preds = %if.end
- call void @_ZN1AD1Ev(%class.A* %agg.result), !dbg !42
- br label %nrvo.skipdtor, !dbg !42
-
-nrvo.skipdtor: ; preds = %nrvo.unused, %if.end
- ret void, !dbg !42
-}
-
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
-
-define linkonce_odr void @_ZN1AD1Ev(%class.A* %this) unnamed_addr ssp align 2 {
-entry:
- %this.addr = alloca %class.A*, align 8
- store %class.A* %this, %class.A** %this.addr, align 8
- call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !43), !dbg !44
- %this1 = load %class.A** %this.addr
- call void @_ZN1AD2Ev(%class.A* %this1)
- ret void, !dbg !45
-}
-
-define linkonce_odr void @_ZN1AD2Ev(%class.A* %this) unnamed_addr nounwind ssp align 2 {
-entry:
- %this.addr = alloca %class.A*, align 8
- store %class.A* %this, %class.A** %this.addr, align 8
- call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !46), !dbg !47
- %this1 = load %class.A** %this.addr
- %x = getelementptr inbounds %class.A* %this1, i32 0, i32 0, !dbg !48
- store i32 1, i32* %x, align 4, !dbg !48
- ret void, !dbg !48
-}
-
-!llvm.dbg.cu = !{!2}
-
-!0 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"~A", metadata !"~A", metadata !"", i32 2, metadata !11, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 589826, metadata !51, metadata !2, metadata !"A", i32 2, i64 128, i64 32, i32 0, i32 0, null, metadata !4, i32 0, null, null} ; [ DW_TAG_class_type ]
-!2 = metadata !{i32 786449, metadata !51, i32 4, metadata !"clang version 3.0 (trunk 130127)", i1 false, metadata !"", i32 0, metadata !24, metadata !24, metadata !50, null, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786473, metadata !51} ; [ DW_TAG_file_type ]
-!4 = metadata !{metadata !5, metadata !7, metadata !8, metadata !9, metadata !0, metadata !10, metadata !14}
-!5 = metadata !{i32 786445, metadata !51, metadata !3, metadata !"x", i32 2, i64 32, i64 32, i64 0, i32 0, metadata !6} ; [ DW_TAG_member ]
-!6 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786445, metadata !51, metadata !3, metadata !"y", i32 2, i64 32, i64 32, i64 32, i32 0, metadata !6} ; [ DW_TAG_member ]
-!8 = metadata !{i32 786445, metadata !51, metadata !3, metadata !"z", i32 2, i64 32, i64 32, i64 64, i32 0, metadata !6} ; [ DW_TAG_member ]
-!9 = metadata !{i32 786445, metadata !51, metadata !3, metadata !"o", i32 2, i64 32, i64 32, i64 96, i32 0, metadata !6} ; [ DW_TAG_member ]
-!10 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"A", metadata !"A", metadata !"", i32 2, metadata !11, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!11 = metadata !{i32 786453, metadata !51, metadata !3, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!12 = metadata !{null, metadata !13}
-!13 = metadata !{i32 786447, metadata !2, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !1} ; [ DW_TAG_pointer_type ]
-!14 = metadata !{i32 786478, metadata !51, metadata !1, metadata !"A", metadata !"A", metadata !"", i32 2, metadata !15, i1 false, i1 false, i32 0, i32 0, null, i32 320, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!15 = metadata !{i32 786453, metadata !51, metadata !3, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !16, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!16 = metadata !{null, metadata !13, metadata !17}
-!17 = metadata !{i32 589840, null, metadata !2, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !18} ; [ DW_TAG_reference_type ]
-!18 = metadata !{i32 786470, metadata !2, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !1} ; [ DW_TAG_const_type ]
-!19 = metadata !{i32 786478, metadata !51, metadata !3, metadata !"foo", metadata !"foo", metadata !"_Z3fooi", i32 4, metadata !20, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void (%class.A*, i32)* @_Z3fooi, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!20 = metadata !{i32 786453, metadata !51, metadata !3, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !21, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!21 = metadata !{metadata !1}
-!22 = metadata !{i32 786478, metadata !51, metadata !3, metadata !"~A", metadata !"~A", metadata !"_ZN1AD1Ev", i32 2, metadata !23, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void (%class.A*)* @_ZN1AD1Ev, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!23 = metadata !{i32 786453, metadata !51, metadata !3, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !24, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!24 = metadata !{null}
-!25 = metadata !{i32 786478, metadata !51, metadata !3, metadata !"~A", metadata !"~A", metadata !"_ZN1AD2Ev", i32 2, metadata !23, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void (%class.A*)* @_ZN1AD2Ev, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!26 = metadata !{i32 786689, metadata !19, metadata !"i", metadata !3, i32 16777220, metadata !6, i32 0, null} ; [ DW_TAG_arg_variable ]
-!27 = metadata !{i32 4, i32 11, metadata !19, null}
-!28 = metadata !{i32 786688, metadata !29, metadata !"j", metadata !3, i32 5, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
-!29 = metadata !{i32 786443, metadata !51, metadata !19, i32 4, i32 14, i32 0} ; [ DW_TAG_lexical_block ]
-!30 = metadata !{i32 5, i32 7, metadata !29, null}
-!31 = metadata !{i32 5, i32 12, metadata !29, null}
-!32 = metadata !{i32 6, i32 3, metadata !29, null}
-!33 = metadata !{i32 7, i32 5, metadata !34, null}
-!34 = metadata !{i32 786443, metadata !51, metadata !29, i32 6, i32 16, i32 1} ; [ DW_TAG_lexical_block ]
-!35 = metadata !{i32 8, i32 3, metadata !34, null}
-!36 = metadata !{i32 9, i32 9, metadata !29, null}
-!37 = metadata !{i32 786688, metadata !29, metadata !"my_a", metadata !3, i32 9, metadata !38, i32 0, null} ; [ DW_TAG_auto_variable ]
-!38 = metadata !{i32 589840, metadata !2, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !1} ; [ DW_TAG_reference_type ]
-!39 = metadata !{i32 9, i32 5, metadata !29, null}
-!40 = metadata !{i32 10, i32 3, metadata !29, null}
-!41 = metadata !{i32 11, i32 3, metadata !29, null}
-!42 = metadata !{i32 12, i32 1, metadata !29, null}
-!43 = metadata !{i32 786689, metadata !22, metadata !"this", metadata !3, i32 16777218, metadata !13, i32 64, null} ; [ DW_TAG_arg_variable ]
-!44 = metadata !{i32 2, i32 47, metadata !22, null}
-!45 = metadata !{i32 2, i32 61, metadata !22, null}
-!46 = metadata !{i32 786689, metadata !25, metadata !"this", metadata !3, i32 16777218, metadata !13, i32 64, null} ; [ DW_TAG_arg_variable ]
-!47 = metadata !{i32 2, i32 47, metadata !25, null}
-!48 = metadata !{i32 2, i32 54, metadata !49, null}
-!49 = metadata !{i32 786443, metadata !51, metadata !25, i32 2, i32 52, i32 2} ; [ DW_TAG_lexical_block ]
-!50 = metadata !{metadata !0, metadata !10, metadata !14, metadata !19, metadata !22, metadata !25}
-!51 = metadata !{metadata !"a.cc", metadata !"/private/tmp"}
diff --git a/test/CodeGen/X86/dbg-declare.ll b/test/CodeGen/X86/dbg-declare.ll
deleted file mode 100644
index d74e270..0000000
--- a/test/CodeGen/X86/dbg-declare.ll
+++ /dev/null
@@ -1,56 +0,0 @@
-; RUN: llc < %s -O0 -mtriple x86_64-apple-darwin
-; <rdar://problem/11134152>
-
-define i32 @foo(i32* %x) nounwind uwtable ssp {
-entry:
- %x.addr = alloca i32*, align 8
- %saved_stack = alloca i8*
- %cleanup.dest.slot = alloca i32
- store i32* %x, i32** %x.addr, align 8
- call void @llvm.dbg.declare(metadata !{i32** %x.addr}, metadata !14), !dbg !15
- %0 = load i32** %x.addr, align 8, !dbg !16
- %1 = load i32* %0, align 4, !dbg !16
- %2 = zext i32 %1 to i64, !dbg !16
- %3 = call i8* @llvm.stacksave(), !dbg !16
- store i8* %3, i8** %saved_stack, !dbg !16
- %vla = alloca i8, i64 %2, align 16, !dbg !16
- call void @llvm.dbg.declare(metadata !{i8* %vla}, metadata !18), !dbg !23
- store i32 1, i32* %cleanup.dest.slot
- %4 = load i8** %saved_stack, !dbg !24
- call void @llvm.stackrestore(i8* %4), !dbg !24
- ret i32 0, !dbg !25
-}
-
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
-
-declare i8* @llvm.stacksave() nounwind
-
-declare void @llvm.stackrestore(i8*) nounwind
-
-!llvm.dbg.cu = !{!0}
-
-!0 = metadata !{i32 786449, metadata !26, i32 12, metadata !"clang version 3.1 (trunk 153698)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 0}
-!3 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, metadata !26, metadata !0, metadata !"foo", metadata !"foo", metadata !"", i32 6, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32*)* @foo, null, null, metadata !12, i32 0} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 786473, metadata !26} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!8 = metadata !{metadata !9, metadata !10}
-!9 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
-!11 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !9} ; [ DW_TAG_const_type ]
-!12 = metadata !{metadata !13}
-!13 = metadata !{i32 786468} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786689, metadata !5, metadata !"x", metadata !6, i32 16777221, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
-!15 = metadata !{i32 5, i32 21, metadata !5, null}
-!16 = metadata !{i32 7, i32 13, metadata !17, null}
-!17 = metadata !{i32 786443, metadata !26, metadata !5, i32 6, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
-!18 = metadata !{i32 786688, metadata !17, metadata !"a", metadata !6, i32 7, metadata !19, i32 0, i32 0} ; [ DW_TAG_auto_variable ]
-!19 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 0, i64 8, i32 0, i32 0, metadata !20, metadata !21, i32 0, i32 0} ; [ DW_TAG_array_type ]
-!20 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!21 = metadata !{metadata !22}
-!22 = metadata !{i32 786465, i64 0, i64 -1} ; [ DW_TAG_subrange_type ]
-!23 = metadata !{i32 7, i32 8, metadata !17, null}
-!24 = metadata !{i32 9, i32 1, metadata !17, null}
-!25 = metadata !{i32 8, i32 3, metadata !17, null}
-!26 = metadata !{metadata !"20020104-2.c", metadata !"/Volumes/Sandbox/llvm"}
diff --git a/test/CodeGen/X86/dbg-file-name.ll b/test/CodeGen/X86/dbg-file-name.ll
deleted file mode 100644
index 797b4b5..0000000
--- a/test/CodeGen/X86/dbg-file-name.ll
+++ /dev/null
@@ -1,22 +0,0 @@
-; RUN: llc -enable-dwarf-directory -mtriple x86_64-apple-darwin10.0.0 < %s | FileCheck %s
-
-; Radar 8884898
-; CHECK: file 1 "simple.c"
-
-declare i32 @printf(i8*, ...) nounwind
-
-define i32 @main() nounwind {
- ret i32 0
-}
-
-!llvm.dbg.cu = !{!2}
-
-!1 = metadata !{i32 786473, metadata !10} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !10, i32 1, metadata !"LLVM build 00", i1 true, metadata !"", i32 0, metadata !11, metadata !11, metadata !9, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!5 = metadata !{i32 786468, metadata !10, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !10, metadata !1, metadata !"main", metadata !"main", metadata !"main", i32 9, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @main, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 786453, metadata !10, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, null} ; [ DW_TAG_subroutine_type ]
-!8 = metadata !{metadata !5}
-!9 = metadata !{metadata !6}
-!10 = metadata !{metadata !"simple.c", metadata !"/Users/manav/one/two"}
-!11 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/dbg-i128-const.ll b/test/CodeGen/X86/dbg-i128-const.ll
deleted file mode 100644
index f413909..0000000
--- a/test/CodeGen/X86/dbg-i128-const.ll
+++ /dev/null
@@ -1,32 +0,0 @@
-; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s
-
-; CHECK: DW_AT_const_value
-; CHECK-NEXT: 42
-
-define i128 @__foo(i128 %a, i128 %b) nounwind {
-entry:
- tail call void @llvm.dbg.value(metadata !0, i64 0, metadata !1), !dbg !11
- %add = add i128 %a, %b, !dbg !11
- ret i128 %add, !dbg !11
-}
-
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
-
-!llvm.dbg.cu = !{!5}
-
-!0 = metadata !{i128 42 }
-!1 = metadata !{i32 786688, metadata !2, metadata !"MAX", metadata !4, i32 29, metadata !8, i32 0, null} ; [ DW_TAG_auto_variable ]
-!2 = metadata !{i32 786443, metadata !13, metadata !3, i32 26, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!3 = metadata !{i32 786478, metadata !13, metadata !4, metadata !"__foo", metadata !"__foo", metadata !"__foo", i32 26, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i1 false, i32 0, i128 (i128, i128)* @__foo, null, null, null, i32 26} ; [ DW_TAG_subprogram ]
-!4 = metadata !{i32 786473, metadata !13} ; [ DW_TAG_file_type ]
-!5 = metadata !{i32 786449, metadata !13, i32 1, metadata !"clang", i1 true, metadata !"", i32 0, metadata !15, metadata !15, metadata !12, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!6 = metadata !{i32 786453, metadata !13, metadata !4, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !7, i32 0, null} ; [ DW_TAG_subroutine_type ]
-!7 = metadata !{metadata !8, metadata !8, metadata !8}
-!8 = metadata !{i32 786454, metadata !14, metadata !4, metadata !"ti_int", i32 78, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_typedef ]
-!9 = metadata !{i32 786473, metadata !14} ; [ DW_TAG_file_type ]
-!10 = metadata !{i32 786468, metadata !13, metadata !4, metadata !"", i32 0, i64 128, i64 128, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!11 = metadata !{i32 29, i32 0, metadata !2, null}
-!12 = metadata !{metadata !3}
-!13 = metadata !{metadata !"foo.c", metadata !"/tmp"}
-!14 = metadata !{metadata !"myint.h", metadata !"/tmp"}
-!15 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/dbg-large-unsigned-const.ll b/test/CodeGen/X86/dbg-large-unsigned-const.ll
deleted file mode 100644
index c5cbf06..0000000
--- a/test/CodeGen/X86/dbg-large-unsigned-const.ll
+++ /dev/null
@@ -1,60 +0,0 @@
-; RUN: llc -filetype=obj %s -o /dev/null
-; Hanle large unsigned constant values.
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
-target triple = "i386-apple-macosx10.7.0"
-
-define zeroext i1 @_Z3iseRKxS0_(i64* nocapture %LHS, i64* nocapture %RHS) nounwind readonly optsize ssp {
-entry:
- tail call void @llvm.dbg.value(metadata !{i64* %LHS}, i64 0, metadata !7), !dbg !13
- tail call void @llvm.dbg.value(metadata !{i64* %RHS}, i64 0, metadata !11), !dbg !14
- %tmp1 = load i64* %LHS, align 4, !dbg !15
- %tmp3 = load i64* %RHS, align 4, !dbg !15
- %cmp = icmp eq i64 %tmp1, %tmp3, !dbg !15
- ret i1 %cmp, !dbg !15
-}
-
-define zeroext i1 @_Z2fnx(i64 %a) nounwind readnone optsize ssp {
-entry:
- tail call void @llvm.dbg.value(metadata !{i64 %a}, i64 0, metadata !12), !dbg !20
- tail call void @llvm.dbg.value(metadata !{i64 %a}, i64 0, metadata !12), !dbg !20
- tail call void @llvm.dbg.value(metadata !{i64 %a}, i64 0, metadata !21), !dbg !24
- tail call void @llvm.dbg.value(metadata !25, i64 0, metadata !26), !dbg !27
- %cmp.i = icmp eq i64 %a, 9223372036854775807, !dbg !28
- ret i1 %cmp.i, !dbg !22
-}
-
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
-
-!llvm.dbg.cu = !{!0}
-!29 = metadata !{metadata !1, metadata !6}
-!30 = metadata !{metadata !7, metadata !11}
-!31 = metadata !{metadata !12}
-
-!0 = metadata !{i32 786449, metadata !32, i32 4, metadata !"clang version 3.0 (trunk 135593)", i1 true, metadata !"", i32 0, metadata !33, metadata !33, metadata !29, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !32, null, metadata !"ise", metadata !"ise", metadata !"_Z3iseRKxS0_", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i1 (i64*, i64*)* @_Z3iseRKxS0_, null, null, metadata !30, i32 2} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !32} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !32, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !0, metadata !"bool", i32 0, i64 8, i64 8, i64 0, i32 0, i32 2} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !32, null, metadata !"fn", metadata !"fn", metadata !"_Z2fnx", i32 6, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i1 (i64)* @_Z2fnx, null, null, metadata !31, i32 6} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 786689, metadata !1, metadata !"LHS", metadata !2, i32 16777218, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
-!8 = metadata !{i32 786448, metadata !0, null, null, i32 0, i64 32, i64 32, i64 0, i32 0, metadata !9} ; [ DW_TAG_reference_type ]
-!9 = metadata !{i32 786470, metadata !0, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_const_type ]
-!10 = metadata !{i32 786468, null, metadata !0, metadata !"long long int", i32 0, i64 64, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!11 = metadata !{i32 786689, metadata !1, metadata !"RHS", metadata !2, i32 33554434, metadata !8, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
-!12 = metadata !{i32 786689, metadata !6, metadata !"a", metadata !2, i32 16777222, metadata !10, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
-!13 = metadata !{i32 2, i32 27, metadata !1, null}
-!14 = metadata !{i32 2, i32 49, metadata !1, null}
-!15 = metadata !{i32 3, i32 3, metadata !16, null}
-!16 = metadata !{i32 786443, metadata !32, metadata !1, i32 2, i32 54, i32 0} ; [ DW_TAG_lexical_block ]
-!20 = metadata !{i32 6, i32 19, metadata !6, null}
-!21 = metadata !{i32 786689, metadata !1, metadata !"LHS", metadata !2, i32 16777218, metadata !8, i32 0, metadata !22} ; [ DW_TAG_arg_variable ]
-!22 = metadata !{i32 7, i32 10, metadata !23, null}
-!23 = metadata !{i32 786443, metadata !32, metadata !6, i32 6, i32 22, i32 1} ; [ DW_TAG_lexical_block ]
-!24 = metadata !{i32 2, i32 27, metadata !1, metadata !22}
-!25 = metadata !{i64 9223372036854775807}
-!26 = metadata !{i32 786689, metadata !1, metadata !"RHS", metadata !2, i32 33554434, metadata !8, i32 0, metadata !22} ; [ DW_TAG_arg_variable ]
-!27 = metadata !{i32 2, i32 49, metadata !1, metadata !22}
-!28 = metadata !{i32 3, i32 3, metadata !16, metadata !22}
-!32 = metadata !{metadata !"lli.cc", metadata !"/private/tmp"}
-!33 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/dbg-merge-loc-entry.ll b/test/CodeGen/X86/dbg-merge-loc-entry.ll
deleted file mode 100644
index ccf4808..0000000
--- a/test/CodeGen/X86/dbg-merge-loc-entry.ll
+++ /dev/null
@@ -1,73 +0,0 @@
-; RUN: llc < %s -o %t -filetype=obj
-; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s
-; RUN: llc < %s -o %t -filetype=obj -regalloc=basic
-; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-darwin8"
-
-;CHECK: DW_AT_location{{.*}}(<0x01> 55 )
-
-%0 = type { i64, i1 }
-
-@__clz_tab = external constant [256 x i8]
-
-define hidden i128 @__divti3(i128 %u, i128 %v) nounwind readnone {
-entry:
- tail call void @llvm.dbg.value(metadata !{i128 %u}, i64 0, metadata !14), !dbg !15
- tail call void @llvm.dbg.value(metadata !16, i64 0, metadata !17), !dbg !21
- br i1 undef, label %bb2, label %bb4, !dbg !22
-
-bb2: ; preds = %entry
- br label %bb4, !dbg !23
-
-bb4: ; preds = %bb2, %entry
- br i1 undef, label %__udivmodti4.exit, label %bb82.i, !dbg !24
-
-bb82.i: ; preds = %bb4
- unreachable
-
-__udivmodti4.exit: ; preds = %bb4
- ret i128 undef, !dbg !27
-}
-
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
-
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
-
-declare %0 @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
-
-!llvm.dbg.cu = !{!2}
-
-!0 = metadata !{i32 786478, metadata !29, metadata !1, metadata !"__udivmodti4", metadata !"__udivmodti4", metadata !"", i32 879, metadata !3, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, null, null, i32 879} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !29} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !29, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !31, metadata !31, metadata !28, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !29, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null} ; [ DW_TAG_subroutine_type ]
-!4 = metadata !{metadata !5, metadata !5, metadata !5, metadata !8}
-!5 = metadata !{i32 786454, metadata !30, metadata !6, metadata !"UTItype", i32 166, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_typedef ]
-!6 = metadata !{i32 786473, metadata !30} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786468, metadata !29, metadata !1, metadata !"", i32 0, i64 128, i64 128, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 786447, metadata !29, metadata !1, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !5} ; [ DW_TAG_pointer_type ]
-!9 = metadata !{i32 786478, metadata !29, metadata !1, metadata !"__divti3", metadata !"__divti3", metadata !"__divti3", i32 1094, metadata !10, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i128 (i128, i128)* @__divti3, null, null, null, i32 1094} ; [ DW_TAG_subprogram ]
-!10 = metadata !{i32 786453, metadata !29, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !11, i32 0, null} ; [ DW_TAG_subroutine_type ]
-!11 = metadata !{metadata !12, metadata !12, metadata !12}
-!12 = metadata !{i32 786454, metadata !30, metadata !6, metadata !"TItype", i32 160, i64 0, i64 0, i64 0, i32 0, metadata !13} ; [ DW_TAG_typedef ]
-!13 = metadata !{i32 786468, metadata !29, metadata !1, metadata !"", i32 0, i64 128, i64 128, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!14 = metadata !{i32 786689, metadata !9, metadata !"u", metadata !1, i32 1093, metadata !12, i32 0, null} ; [ DW_TAG_arg_variable ]
-!15 = metadata !{i32 1093, i32 0, metadata !9, null}
-!16 = metadata !{i64 0}
-!17 = metadata !{i32 786688, metadata !18, metadata !"c", metadata !1, i32 1095, metadata !19, i32 0, null} ; [ DW_TAG_auto_variable ]
-!18 = metadata !{i32 786443, metadata !29, metadata !9, i32 1094, i32 0, i32 13} ; [ DW_TAG_lexical_block ]
-!19 = metadata !{i32 786454, metadata !30, metadata !6, metadata !"word_type", i32 424, i64 0, i64 0, i64 0, i32 0, metadata !20} ; [ DW_TAG_typedef ]
-!20 = metadata !{i32 786468, metadata !29, metadata !1, metadata !"long int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!21 = metadata !{i32 1095, i32 0, metadata !18, null}
-!22 = metadata !{i32 1103, i32 0, metadata !18, null}
-!23 = metadata !{i32 1104, i32 0, metadata !18, null}
-!24 = metadata !{i32 1003, i32 0, metadata !25, metadata !26}
-!25 = metadata !{i32 786443, metadata !29, metadata !0, i32 879, i32 0, i32 0} ; [ DW_TAG_lexical_block ]
-!26 = metadata !{i32 1107, i32 0, metadata !18, null}
-!27 = metadata !{i32 1111, i32 0, metadata !18, null}
-!28 = metadata !{metadata !0, metadata !9}
-!29 = metadata !{metadata !"foobar.c", metadata !"/tmp"}
-!30 = metadata !{metadata !"foobar.h", metadata !"/tmp"}
-!31 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/dbg-prolog-end.ll b/test/CodeGen/X86/dbg-prolog-end.ll
deleted file mode 100644
index c8d8499..0000000
--- a/test/CodeGen/X86/dbg-prolog-end.ll
+++ /dev/null
@@ -1,57 +0,0 @@
-; RUN: llc -O0 < %s | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-macosx10.6.7"
-
-;CHECK: .loc 1 2 11 prologue_end
-define i32 @foo(i32 %i) nounwind ssp {
-entry:
- %i.addr = alloca i32, align 4
- %j = alloca i32, align 4
- store i32 %i, i32* %i.addr, align 4
- call void @llvm.dbg.declare(metadata !{i32* %i.addr}, metadata !7), !dbg !8
- call void @llvm.dbg.declare(metadata !{i32* %j}, metadata !9), !dbg !11
- store i32 2, i32* %j, align 4, !dbg !12
- %tmp = load i32* %j, align 4, !dbg !13
- %inc = add nsw i32 %tmp, 1, !dbg !13
- store i32 %inc, i32* %j, align 4, !dbg !13
- %tmp1 = load i32* %j, align 4, !dbg !14
- %tmp2 = load i32* %i.addr, align 4, !dbg !14
- %add = add nsw i32 %tmp1, %tmp2, !dbg !14
- store i32 %add, i32* %j, align 4, !dbg !14
- %tmp3 = load i32* %j, align 4, !dbg !15
- ret i32 %tmp3, !dbg !15
-}
-
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
-
-define i32 @main() nounwind ssp {
-entry:
- %retval = alloca i32, align 4
- store i32 0, i32* %retval
- %call = call i32 @foo(i32 21), !dbg !16
- ret i32 %call, !dbg !16
-}
-
-!llvm.dbg.cu = !{!0}
-!18 = metadata !{metadata !1, metadata !6}
-
-!0 = metadata !{i32 786449, metadata !19, i32 12, metadata !"clang version 3.0 (trunk 131100)", i1 false, metadata !"", i32 0, metadata !20, metadata !20, metadata !18, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !19, metadata !2, metadata !"foo", metadata !"foo", metadata !"", i32 1, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, i32 (i32)* @foo, null, null, null, i32 1} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !19, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !19, metadata !2, metadata !"main", metadata !"main", metadata !"", i32 7, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, i32 ()* @main, null, null, null, i32 7} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 786689, metadata !1, metadata !"i", metadata !2, i32 16777217, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!8 = metadata !{i32 1, i32 13, metadata !1, null}
-!9 = metadata !{i32 786688, metadata !10, metadata !"j", metadata !2, i32 2, metadata !5, i32 0, null} ; [ DW_TAG_auto_variable ]
-!10 = metadata !{i32 786443, metadata !19, metadata !1, i32 1, i32 16, i32 0} ; [ DW_TAG_lexical_block ]
-!11 = metadata !{i32 2, i32 6, metadata !10, null}
-!12 = metadata !{i32 2, i32 11, metadata !10, null}
-!13 = metadata !{i32 3, i32 2, metadata !10, null}
-!14 = metadata !{i32 4, i32 2, metadata !10, null}
-!15 = metadata !{i32 5, i32 2, metadata !10, null}
-!16 = metadata !{i32 8, i32 2, metadata !17, null}
-!17 = metadata !{i32 786443, metadata !19, metadata !6, i32 7, i32 12, i32 1} ; [ DW_TAG_lexical_block ]
-!19 = metadata !{metadata !"/tmp/a.c", metadata !"/private/tmp"}
-!20 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/dbg-subrange.ll b/test/CodeGen/X86/dbg-subrange.ll
deleted file mode 100644
index ffb5f2d..0000000
--- a/test/CodeGen/X86/dbg-subrange.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; RUN: llc -O0 < %s | FileCheck %s
-; Radar 10464995
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.7.2"
-
-@s = common global [4294967296 x i8] zeroinitializer, align 16
-;CHECK: .long 4294967295
-
-define void @bar() nounwind uwtable ssp {
-entry:
- store i8 97, i8* getelementptr inbounds ([4294967296 x i8]* @s, i32 0, i64 0), align 1, !dbg !18
- ret void, !dbg !20
-}
-
-!llvm.dbg.cu = !{!0}
-
-!0 = metadata !{i32 786449, metadata !21, i32 12, metadata !"clang version 3.1 (trunk 144833)", i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !11, metadata !11, metadata !""} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 0}
-!3 = metadata !{metadata !5}
-!5 = metadata !{i32 720942, metadata !21, metadata !6, metadata !"bar", metadata !"bar", metadata !"", i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void ()* @bar, null, null, metadata !9, i32 0} ; [ DW_TAG_subprogram ]
-!6 = metadata !{i32 720937, metadata !21} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!8 = metadata !{null}
-!9 = metadata !{metadata !10}
-!10 = metadata !{i32 720932} ; [ DW_TAG_base_type ]
-!11 = metadata !{metadata !13}
-!13 = metadata !{i32 720948, i32 0, null, metadata !"s", metadata !"s", metadata !"", metadata !6, i32 2, metadata !14, i32 0, i32 1, [4294967296 x i8]* @s, null} ; [ DW_TAG_variable ]
-!14 = metadata !{i32 720897, null, metadata !"", null, i32 0, i64 34359738368, i64 8, i32 0, i32 0, metadata !15, metadata !16, i32 0, i32 0} ; [ DW_TAG_array_type ]
-!15 = metadata !{i32 720932, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!16 = metadata !{metadata !17}
-!17 = metadata !{i32 720929, i64 0, i64 4294967296} ; [ DW_TAG_subrange_type ]
-!18 = metadata !{i32 5, i32 3, metadata !19, null}
-!19 = metadata !{i32 786443, metadata !21, metadata !5, i32 4, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
-!20 = metadata !{i32 6, i32 1, metadata !19, null}
-!21 = metadata !{metadata !"small.c", metadata !"/private/tmp"}
diff --git a/test/CodeGen/X86/dbg-value-dag-combine.ll b/test/CodeGen/X86/dbg-value-dag-combine.ll
deleted file mode 100644
index e281493..0000000
--- a/test/CodeGen/X86/dbg-value-dag-combine.ll
+++ /dev/null
@@ -1,47 +0,0 @@
-; RUN: llc < %s | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-darwin10.0.0"
-; PR 9817
-
-
-declare <4 x i32> @__amdil_get_global_id_int()
-declare void @llvm.dbg.value(metadata , i64 , metadata )
-define void @__OpenCL_test_kernel(i32 addrspace(1)* %ip) nounwind {
-entry:
- call void @llvm.dbg.value(metadata !{i32 addrspace(1)* %ip}, i64 0, metadata
-!7), !dbg !8
- %0 = call <4 x i32> @__amdil_get_global_id_int() nounwind
- %1 = extractelement <4 x i32> %0, i32 0
- call void @llvm.dbg.value(metadata !{i32 %1}, i64 0, metadata !9), !dbg !11
- call void @llvm.dbg.value(metadata !12, i64 0, metadata !13), !dbg !14
- %tmp2 = load i32 addrspace(1)* %ip, align 4, !dbg !15
- %tmp3 = add i32 0, %tmp2, !dbg !15
-; CHECK: ##DEBUG_VALUE: idx <- EAX{{$}}
- call void @llvm.dbg.value(metadata !{i32 %tmp3}, i64 0, metadata !13), !dbg
-!15
- %arrayidx = getelementptr i32 addrspace(1)* %ip, i32 %1, !dbg !16
- store i32 %tmp3, i32 addrspace(1)* %arrayidx, align 4, !dbg !16
- ret void, !dbg !17
-}
-!llvm.dbg.cu = !{!2}
-
-!0 = metadata !{i32 786478, metadata !19, metadata !1, metadata !"__OpenCL_test_kernel", metadata !"__OpenCL_test_kernel", metadata !"__OpenCL_test_kernel", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !19, i32 1, metadata !"clc", i1 false, metadata !"", i32 0, metadata !12, metadata !12, metadata !18, null, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !19, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!4 = metadata !{null, metadata !5}
-!5 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !6} ; [ DW_TAG_pointer_type ]
-!6 = metadata !{i32 786468, null, metadata !2, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!7 = metadata !{i32 786689, metadata !0, metadata !"ip", metadata !1, i32 1, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!8 = metadata !{i32 1, i32 42, metadata !0, null}
-!9 = metadata !{i32 786688, metadata !10, metadata !"gid", metadata !1, i32 3, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
-!10 = metadata !{i32 786443, metadata !19, metadata !0, i32 2, i32 1, i32 0} ; [ DW_TAG_lexical_block ]
-!11 = metadata !{i32 3, i32 41, metadata !10, null}
-!12 = metadata !{i32 0}
-!13 = metadata !{i32 786688, metadata !10, metadata !"idx", metadata !1, i32 4, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
-!14 = metadata !{i32 4, i32 20, metadata !10, null}
-!15 = metadata !{i32 5, i32 15, metadata !10, null}
-!16 = metadata !{i32 6, i32 18, metadata !10, null}
-!17 = metadata !{i32 7, i32 1, metadata !0, null}
-!18 = metadata !{metadata !0}
-!19 = metadata !{metadata !"OCL6368.tmp.cl", metadata !"E:\5CUsers\5Cmvillmow.AMD\5CAppData\5CLocal\5CTemp"}
diff --git a/test/CodeGen/X86/dbg-value-isel.ll b/test/CodeGen/X86/dbg-value-isel.ll
deleted file mode 100644
index 0013385..0000000
--- a/test/CodeGen/X86/dbg-value-isel.ll
+++ /dev/null
@@ -1,104 +0,0 @@
-; RUN: llc < %s | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-darwin10.0.0"
-; PR 9879
-
-; CHECK: ##DEBUG_VALUE: tid <-
-%0 = type { i8*, i8*, i8*, i8*, i32 }
-
-@sgv = internal addrspace(2) constant [1 x i8] zeroinitializer
-@fgv = internal addrspace(2) constant [1 x i8] zeroinitializer
-@lvgv = internal constant [0 x i8*] zeroinitializer
-@llvm.global.annotations = appending global [1 x %0] [%0 { i8* bitcast (void (i32 addrspace(1)*)* @__OpenCL_nbt02_kernel to i8*), i8* bitcast ([1 x i8] addrspace(2)* @sgv to i8*), i8* bitcast ([1 x i8] addrspace(2)* @fgv to i8*), i8* bitcast ([0 x i8*]* @lvgv to i8*), i32 0 }], section "llvm.metadata"
-
-define void @__OpenCL_nbt02_kernel(i32 addrspace(1)* %ip) nounwind {
-entry:
- call void @llvm.dbg.value(metadata !{i32 addrspace(1)* %ip}, i64 0, metadata !8), !dbg !9
- %0 = call <4 x i32> @__amdil_get_local_id_int() nounwind
- %1 = extractelement <4 x i32> %0, i32 0
- br label %2
-
-; <label>:2 ; preds = %entry
- %3 = phi i32 [ %1, %entry ]
- br label %4
-
-; <label>:4 ; preds = %2
- %5 = phi i32 [ %3, %2 ]
- br label %get_local_id.exit
-
-get_local_id.exit: ; preds = %4
- %6 = phi i32 [ %5, %4 ]
- call void @llvm.dbg.value(metadata !{i32 %6}, i64 0, metadata !10), !dbg !12
- %7 = call <4 x i32> @__amdil_get_global_id_int() nounwind, !dbg !12
- %8 = extractelement <4 x i32> %7, i32 0, !dbg !12
- br label %9
-
-; <label>:9 ; preds = %get_local_id.exit
- %10 = phi i32 [ %8, %get_local_id.exit ]
- br label %11
-
-; <label>:11 ; preds = %9
- %12 = phi i32 [ %10, %9 ]
- br label %get_global_id.exit
-
-get_global_id.exit: ; preds = %11
- %13 = phi i32 [ %12, %11 ]
- call void @llvm.dbg.value(metadata !{i32 %13}, i64 0, metadata !13), !dbg !14
- %14 = call <4 x i32> @__amdil_get_local_size_int() nounwind
- %15 = extractelement <4 x i32> %14, i32 0
- br label %16
-
-; <label>:16 ; preds = %get_global_id.exit
- %17 = phi i32 [ %15, %get_global_id.exit ]
- br label %18
-
-; <label>:18 ; preds = %16
- %19 = phi i32 [ %17, %16 ]
- br label %get_local_size.exit
-
-get_local_size.exit: ; preds = %18
- %20 = phi i32 [ %19, %18 ]
- call void @llvm.dbg.value(metadata !{i32 %20}, i64 0, metadata !15), !dbg !16
- %tmp5 = add i32 %6, %13, !dbg !17
- %tmp7 = add i32 %tmp5, %20, !dbg !17
- store i32 %tmp7, i32 addrspace(1)* %ip, align 4, !dbg !17
- br label %return, !dbg !17
-
-return: ; preds = %get_local_size.exit
- ret void, !dbg !18
-}
-
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
-
-declare <4 x i32> @__amdil_get_local_size_int() nounwind
-
-declare <4 x i32> @__amdil_get_local_id_int() nounwind
-
-declare <4 x i32> @__amdil_get_global_id_int() nounwind
-
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
-
-!llvm.dbg.cu = !{!2}
-
-!0 = metadata !{i32 786478, metadata !20, metadata !1, metadata !"__OpenCL_nbt02_kernel", metadata !"__OpenCL_nbt02_kernel", metadata !"__OpenCL_nbt02_kernel", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 0, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !20, i32 1, metadata !"clc", i1 false, metadata !"", i32 0, metadata !21, metadata !21, metadata !19, null, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !20, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!4 = metadata !{null, metadata !5}
-!5 = metadata !{i32 786447, null, metadata !2, metadata !"", i32 0, i64 32, i64 32, i64 0, i32 0, metadata !6} ; [ DW_TAG_pointer_type ]
-!6 = metadata !{i32 589846, metadata !20, metadata !2, metadata !"uint", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_typedef ]
-!7 = metadata !{i32 786468, null, metadata !2, metadata !"unsigned int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ]
-!8 = metadata !{i32 786689, metadata !0, metadata !"ip", metadata !1, i32 1, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!9 = metadata !{i32 1, i32 32, metadata !0, null}
-!10 = metadata !{i32 786688, metadata !11, metadata !"tid", metadata !1, i32 3, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
-!11 = metadata !{i32 786443, metadata !0, i32 2, i32 1, metadata !1, i32 1} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{i32 5, i32 24, metadata !11, null}
-!13 = metadata !{i32 786688, metadata !11, metadata !"gid", metadata !1, i32 3, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
-!14 = metadata !{i32 6, i32 25, metadata !11, null}
-!15 = metadata !{i32 786688, metadata !11, metadata !"lsz", metadata !1, i32 3, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ]
-!16 = metadata !{i32 7, i32 26, metadata !11, null}
-!17 = metadata !{i32 9, i32 24, metadata !11, null}
-!18 = metadata !{i32 10, i32 1, metadata !0, null}
-!19 = metadata !{metadata !0}
-!20 = metadata !{metadata !"OCLlLwTXZ.cl", metadata !"/tmp"}
-!21 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/dbg-value-location.ll b/test/CodeGen/X86/dbg-value-location.ll
deleted file mode 100644
index f896e58..0000000
--- a/test/CodeGen/X86/dbg-value-location.ll
+++ /dev/null
@@ -1,75 +0,0 @@
-; RUN: llc < %s | FileCheck %s
-; RUN: llc < %s -regalloc=basic | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-darwin10.0.0"
-;Radar 8950491
-
-;CHECK: .long Lset5
-;CHECK-NEXT: ## DW_AT_decl_file
-;CHECK-NEXT: ## DW_AT_decl_line
-;CHECK-NEXT: ## DW_AT_type
-;CHECK-NEXT: ## DW_AT_location
-
-@dfm = external global i32, align 4
-
-declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
-
-define i32 @foo(i32 %dev, i64 %cmd, i8* %data, i32 %data2) nounwind optsize ssp {
-entry:
- call void @llvm.dbg.value(metadata !{i32 %dev}, i64 0, metadata !12), !dbg !13
- %tmp.i = load i32* @dfm, align 4, !dbg !14
- %cmp.i = icmp eq i32 %tmp.i, 0, !dbg !14
- br i1 %cmp.i, label %if.else, label %if.end.i, !dbg !14
-
-if.end.i: ; preds = %entry
- switch i64 %cmd, label %if.then [
- i64 2147772420, label %bb.i
- i64 536897538, label %bb116.i
- ], !dbg !22
-
-bb.i: ; preds = %if.end.i
- unreachable
-
-bb116.i: ; preds = %if.end.i
- unreachable
-
-if.then: ; preds = %if.end.i
- ret i32 undef, !dbg !23
-
-if.else: ; preds = %entry
- ret i32 0
-}
-
-declare hidden fastcc i32 @bar(i32, i32* nocapture) nounwind optsize ssp
-declare hidden fastcc i32 @bar2(i32) nounwind optsize ssp
-declare hidden fastcc i32 @bar3(i32) nounwind optsize ssp
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
-
-!llvm.dbg.cu = !{!2}
-
-!0 = metadata !{i32 786478, metadata !26, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 19510, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32, i64, i8*, i32)* @foo, null, null, null, i32 19510} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !26} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !27, i32 12, metadata !"clang version 2.9 (trunk 124753)", i1 true, metadata !"", i32 0, metadata !28, metadata !28, metadata !24, null, null, null} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !26, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786478, metadata !26, metadata !1, metadata !"bar3", metadata !"bar3", metadata !"", i32 14827, metadata !3, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32)* @bar3, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!7 = metadata !{i32 786478, metadata !26, metadata !1, metadata !"bar2", metadata !"bar2", metadata !"", i32 15397, metadata !3, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32)* @bar2, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!8 = metadata !{i32 786478, metadata !26, metadata !1, metadata !"bar", metadata !"bar", metadata !"", i32 12382, metadata !9, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, i32 (i32, i32*)* @bar, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!9 = metadata !{i32 786453, metadata !26, metadata !1, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !10, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!10 = metadata !{metadata !11}
-!11 = metadata !{i32 786468, null, metadata !2, metadata !"unsigned char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 8} ; [ DW_TAG_base_type ]
-!12 = metadata !{i32 786689, metadata !0, metadata !"var", metadata !1, i32 19509, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!13 = metadata !{i32 19509, i32 20, metadata !0, null}
-!14 = metadata !{i32 18091, i32 2, metadata !15, metadata !17}
-!15 = metadata !{i32 786443, metadata !26, metadata !16, i32 18086, i32 1, i32 748} ; [ DW_TAG_lexical_block ]
-!16 = metadata !{i32 786478, metadata !26, metadata !1, metadata !"foo_bar", metadata !"foo_bar", metadata !"", i32 18086, metadata !3, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!17 = metadata !{i32 19514, i32 2, metadata !18, null}
-!18 = metadata !{i32 786443, metadata !26, metadata !0, i32 19510, i32 1, i32 99} ; [ DW_TAG_lexical_block ]
-!22 = metadata !{i32 18094, i32 2, metadata !15, metadata !17}
-!23 = metadata !{i32 19524, i32 1, metadata !18, null}
-!24 = metadata !{metadata !0, metadata !6, metadata !7, metadata !8}
-!25 = metadata !{i32 786473, metadata !27} ; [ DW_TAG_file_type ]
-!26 = metadata !{metadata !"/tmp/f.c", metadata !"/tmp"}
-!27 = metadata !{metadata !"f.i", metadata !"/tmp"}
-!28 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/dbg-value-terminator.ll b/test/CodeGen/X86/dbg-value-terminator.ll
deleted file mode 100644
index e8d70de..0000000
--- a/test/CodeGen/X86/dbg-value-terminator.ll
+++ /dev/null
@@ -1,131 +0,0 @@
-; RUN: llc -mtriple=x86_64-apple-macosx < %s -verify-machineinstrs | FileCheck %s
-;
-; PR16143: MachineOperand::setIsKill(bool): Assertion
-;
-; verify-machineinstrs should ensure that DEBUG_VALUEs go before the
-; terminator.
-;
-; CHECK-LABEL: test:
-; CHECK: ##DEBUG_VALUE: i
-%a = type { i32, i32 }
-
-define hidden fastcc %a* @test() #1 {
-entry:
- %0 = icmp eq %a* undef, null, !dbg !1
- br i1 %0, label %"14", label %return, !dbg !1
-
-"14": ; preds = %"8"
- br i1 undef, label %"25", label %"21", !dbg !1
-
-"21": ; preds = %"14"
- br i1 undef, label %may_unswitch_on.exit, label %"6.i", !dbg !1
-
-"6.i": ; preds = %"21"
- br i1 undef, label %"10.i", label %may_unswitch_on.exit, !dbg !1
-
-"10.i": ; preds = %"6.i"
- br i1 undef, label %may_unswitch_on.exit, label %"12.i", !dbg !1
-
-"12.i": ; preds = %"10.i"
- br i1 undef, label %"4.i.i", label %"3.i.i", !dbg !1
-
-"3.i.i": ; preds = %"12.i"
- br i1 undef, label %"4.i.i", label %VEC_edge_base_index.exit.i, !dbg !1
-
-"4.i.i": ; preds = %"3.i.i", %"12.i"
- unreachable, !dbg !1
-
-VEC_edge_base_index.exit.i: ; preds = %"3.i.i"
- br i1 undef, label %may_unswitch_on.exit, label %"16.i", !dbg !1
-
-"16.i": ; preds = %VEC_edge_base_index.exit.i
- br i1 undef, label %"4.i6.i", label %"3.i5.i", !dbg !1
-
-"3.i5.i": ; preds = %"16.i"
- br i1 undef, label %VEC_edge_base_index.exit7.i, label %"4.i6.i", !dbg !1
-
-"4.i6.i": ; preds = %"3.i5.i", %"16.i"
- unreachable, !dbg !1
-
-VEC_edge_base_index.exit7.i: ; preds = %"3.i5.i"
- br i1 undef, label %may_unswitch_on.exit, label %"21.i", !dbg !1
-
-"21.i": ; preds = %VEC_edge_base_index.exit7.i
- br i1 undef, label %may_unswitch_on.exit, label %"23.i", !dbg !1
-
-"23.i": ; preds = %"21.i"
- br i1 undef, label %may_unswitch_on.exit, label %"26.i", !dbg !1
-
-"26.i": ; preds = %"34.i", %"23.i"
- %1 = icmp eq i32 undef, 9, !dbg !1
- br i1 %1, label %"34.i", label %"28.i", !dbg !1
-
-"28.i": ; preds = %"26.i"
- unreachable
-
-"34.i": ; preds = %"26.i"
- br i1 undef, label %"26.i", label %"36.i", !dbg !1
-
-"36.i": ; preds = %"34.i"
- br i1 undef, label %"37.i", label %"38.i", !dbg !1
-
-"37.i": ; preds = %"36.i"
- br label %"38.i", !dbg !1
-
-"38.i": ; preds = %"37.i", %"36.i"
- br i1 undef, label %"39.i", label %"45.i", !dbg !1
-
-"39.i": ; preds = %"38.i"
- br i1 undef, label %"41.i", label %may_unswitch_on.exit, !dbg !1
-
-"41.i": ; preds = %"39.i"
- br i1 undef, label %may_unswitch_on.exit, label %"42.i", !dbg !1
-
-"42.i": ; preds = %"41.i"
- br i1 undef, label %may_unswitch_on.exit, label %"44.i", !dbg !1
-
-"44.i": ; preds = %"42.i"
- %2 = load %a** undef, align 8, !dbg !1
- %3 = bitcast %a* %2 to %a*, !dbg !1
- call void @llvm.dbg.value(metadata !{%a* %3}, i64 0, metadata !6), !dbg !12
- br label %may_unswitch_on.exit, !dbg !1
-
-"45.i": ; preds = %"38.i"
- unreachable
-
-may_unswitch_on.exit: ; preds = %"44.i", %"42.i", %"41.i", %"39.i", %"23.i", %"21.i", %VEC_edge_base_index.exit7.i, %VEC_edge_base_index.exit.i, %"10.i", %"6.i", %"21"
- %4 = phi %a* [ %3, %"44.i" ], [ null, %"6.i" ], [ null, %"10.i" ], [ null, %VEC_edge_base_index.exit7.i ], [ null, %VEC_edge_base_index.exit.i ], [ null, %"21.i" ], [ null, %"23.i" ], [ null, %"39.i" ], [ null, %"42.i" ], [ null, %"41.i" ], [ null, %"21" ]
- br label %return
-
-"25": ; preds = %"14"
- unreachable
-
-"return":
- %result = phi %a* [ null, %entry ], [ %4, %may_unswitch_on.exit ]
- ret %a* %result, !dbg !1
-}
-
-attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind uwtable }
-
-declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
-
-!llvm.dbg.cu = !{!0}
-
-!0 = metadata !{i32 786449, metadata !20, i32 12, metadata !"Apple clang version", i1 true, metadata !"", i32 0, metadata !21, metadata !21, metadata !18, null, null, null} ; [ DW_TAG_compile_unit ]
-!1 = metadata !{i32 786478, metadata !20, metadata !2, metadata !"foo", metadata !"", metadata !"", i32 2, metadata !3, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 true, %a* ()* @test, null, null, metadata !19, i32 0} ; [ DW_TAG_subprogram ]
-!2 = metadata !{i32 786473, metadata !20} ; [ DW_TAG_file_type ]
-!3 = metadata !{i32 786453, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !4, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
-!4 = metadata !{metadata !5}
-!5 = metadata !{i32 786468, null, metadata !0, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
-!6 = metadata !{i32 786689, metadata !1, metadata !"i", metadata !2, i32 16777218, metadata !5, i32 0, null} ; [ DW_TAG_arg_variable ]
-!7 = metadata !{i32 786689, metadata !1, metadata !"c", metadata !2, i32 33554434, metadata !8, i32 0, null} ; [ DW_TAG_arg_variable ]
-!8 = metadata !{i32 786447, null, metadata !0, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !9} ; [ DW_TAG_pointer_type ]
-!9 = metadata !{i32 786468, null, metadata !0, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ]
-!10 = metadata !{i32 786688, metadata !11, metadata !"a", metadata !2, i32 3, metadata !9, i32 0, null} ; [ DW_TAG_auto_variable ]
-!11 = metadata !{i32 786443, metadata !20, metadata !1, i32 2, i32 25, i32 0} ; [ DW_TAG_lexical_block ]
-!12 = metadata !{i32 2, i32 13, metadata !1, null}
-!18 = metadata !{metadata !1}
-!19 = metadata !{metadata !6, metadata !7, metadata !10}
-!20 = metadata !{metadata !"a.c", metadata !"/private/tmp"}
-!21 = metadata !{i32 0}
diff --git a/test/CodeGen/X86/dwarf-comp-dir.ll b/test/CodeGen/X86/dwarf-comp-dir.ll
index b746dec..3b4a868 100644
--- a/test/CodeGen/X86/dwarf-comp-dir.ll
+++ b/test/CodeGen/X86/dwarf-comp-dir.ll
@@ -5,6 +5,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
target triple = "x86_64-unknown-linux-gnu"
!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!5}
!0 = metadata !{i32 720913, metadata !4, i32 12, metadata !"clang version 3.1 (trunk 143523)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ]
!2 = metadata !{i32 0}
@@ -15,3 +16,4 @@ target triple = "x86_64-unknown-linux-gnu"
; Dir Mod Time File Len File Name
; ---- ---------- ---------- ---------------------------
; CHECK: file_names[ 1] 0 0x00000000 0x00000000 empty.c
+!5 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/dyn_alloca_aligned.ll b/test/CodeGen/X86/dyn_alloca_aligned.ll
new file mode 100644
index 0000000..993f4d2
--- /dev/null
+++ b/test/CodeGen/X86/dyn_alloca_aligned.ll
@@ -0,0 +1,9 @@
+; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s
+define i32 @A(i32 %Size) {
+; CHECK: subq %rcx, %rax
+; CHECK: andq $-128, %rax
+; CHECK: movq %rax, %rsp
+ %A = alloca i8, i32 %Size, align 128
+ %A_addr = ptrtoint i8* %A to i32
+ ret i32 %A_addr
+}
diff --git a/test/CodeGen/X86/fast-isel-mem.ll b/test/CodeGen/X86/fast-isel-mem.ll
index 7fcef03..cd2dc1d 100644
--- a/test/CodeGen/X86/fast-isel-mem.ll
+++ b/test/CodeGen/X86/fast-isel-mem.ll
@@ -40,7 +40,7 @@ entry:
; CHECK: movl L_LotsStuff$non_lazy_ptr, %ecx
; ATOM: _t:
-; ATOM: movl L_LotsStuff$non_lazy_ptr, %ecx
-; ATOM: movl $0, %eax
+; ATOM: movl L_LotsStuff$non_lazy_ptr, %e{{..}}
+; ATOM: movl $0, %e{{..}}
}
diff --git a/test/CodeGen/X86/fastcc.ll b/test/CodeGen/X86/fastcc.ll
index 705ab7b..a362f8d 100644
--- a/test/CodeGen/X86/fastcc.ll
+++ b/test/CodeGen/X86/fastcc.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 -post-RA-scheduler=false | FileCheck %s
-; CHECK: movsd %xmm0, 8(%esp)
-; CHECK: xorl %ecx, %ecx
+; CHECK: movsd %xmm{{[0-9]}}, 8(%esp)
+; CHECK: xorl %eax, %eax
@d = external global double ; <double*> [#uses=1]
@c = external global double ; <double*> [#uses=1]
diff --git a/test/CodeGen/X86/fastisel-gep-promote-before-add.ll b/test/CodeGen/X86/fastisel-gep-promote-before-add.ll
new file mode 100644
index 0000000..f87a34c
--- /dev/null
+++ b/test/CodeGen/X86/fastisel-gep-promote-before-add.ll
@@ -0,0 +1,37 @@
+; fastisel should not fold add with non-pointer bitwidth
+; sext(a) + sext(b) != sext(a + b)
+; RUN: llc -mtriple=x86_64-apple-darwin %s -O0 -o - | FileCheck %s
+
+define zeroext i8 @gep_promotion(i8* %ptr) nounwind uwtable ssp {
+entry:
+ %ptr.addr = alloca i8*, align 8
+ %add = add i8 64, 64 ; 0x40 + 0x40
+ %0 = load i8** %ptr.addr, align 8
+
+ ; CHECK-LABEL: _gep_promotion:
+ ; CHECK: movzbl ({{.*}})
+ %arrayidx = getelementptr inbounds i8* %0, i8 %add
+
+ %1 = load i8* %arrayidx, align 1
+ ret i8 %1
+}
+
+define zeroext i8 @gep_promotion_nonconst(i8 %i, i8* %ptr) nounwind uwtable ssp {
+entry:
+ %i.addr = alloca i8, align 4
+ %ptr.addr = alloca i8*, align 8
+ store i8 %i, i8* %i.addr, align 4
+ store i8* %ptr, i8** %ptr.addr, align 8
+ %0 = load i8* %i.addr, align 4
+ ; CHECK-LABEL: _gep_promotion_nonconst:
+ ; CHECK: movzbl ({{.*}})
+ %xor = xor i8 %0, -128 ; %0 ^ 0x80
+ %add = add i8 %xor, -127 ; %xor + 0x81
+ %1 = load i8** %ptr.addr, align 8
+
+ %arrayidx = getelementptr inbounds i8* %1, i8 %add
+
+ %2 = load i8* %arrayidx, align 1
+ ret i8 %2
+}
+
diff --git a/test/CodeGen/X86/floor-soft-float.ll b/test/CodeGen/X86/floor-soft-float.ll
index 8e7ee09..5644509 100644
--- a/test/CodeGen/X86/floor-soft-float.ll
+++ b/test/CodeGen/X86/floor-soft-float.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse41,-avx -soft-float=0 | FileCheck %s --check-prefix=CHECK-HARD-FLOAT
-; RUN: llc < %s -march=x86-64 -mattr=+sse41,-avx -soft-float=1 | FileCheck %s --check-prefix=CHECK-SOFT-FLOAT
+; RUN: llc < %s -march=x86-64 -mattr=+sse4.1,-avx -soft-float=0 | FileCheck %s --check-prefix=CHECK-HARD-FLOAT
+; RUN: llc < %s -march=x86-64 -mattr=+sse4.1,-avx -soft-float=1 | FileCheck %s --check-prefix=CHECK-SOFT-FLOAT
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/fold-load-vec.ll b/test/CodeGen/X86/fold-load-vec.ll
index 47100be..e85d8f7 100644
--- a/test/CodeGen/X86/fold-load-vec.ll
+++ b/test/CodeGen/X86/fold-load-vec.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse41 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
; rdar://12721174
; We should not fold movss into pshufd since pshufd expects m128 while movss
diff --git a/test/CodeGen/X86/fold-load.ll b/test/CodeGen/X86/fold-load.ll
index 495acd9..dde0a2d 100644
--- a/test/CodeGen/X86/fold-load.ll
+++ b/test/CodeGen/X86/fold-load.ll
@@ -38,10 +38,10 @@ L:
store i16 %A, i16* %Q
ret i32 %D
-
+
; CHECK-LABEL: test2:
; CHECK: movl 4(%esp), %eax
-; CHECK-NEXT: movzwl (%eax), %ecx
+; CHECK-NEXT: movzwl (%eax), %e{{..}}
}
@@ -49,10 +49,10 @@ L:
; xor in exit block will be CSE'ed and load will be folded to xor in entry.
define i1 @test3(i32* %P, i32* %Q) nounwind {
; CHECK-LABEL: test3:
-; CHECK: movl 8(%esp), %eax
-; CHECK: xorl (%eax),
+; CHECK: movl 8(%esp), %e
+; CHECK: movl 4(%esp), %e
+; CHECK: xorl (%e
; CHECK: j
-; CHECK-NOT: xor
entry:
%0 = load i32* %P, align 4
%1 = load i32* %Q, align 4
diff --git a/test/CodeGen/X86/fold-pcmpeqd-2.ll b/test/CodeGen/X86/fold-pcmpeqd-2.ll
index 0a3afb7..60a6844 100644
--- a/test/CodeGen/X86/fold-pcmpeqd-2.ll
+++ b/test/CodeGen/X86/fold-pcmpeqd-2.ll
@@ -54,22 +54,27 @@ forbody: ; preds = %forcond
%mul310 = fmul <4 x float> %bitcast204.i104, zeroinitializer ; <<4 x float>> [#uses=2]
%mul313 = fmul <4 x float> %bitcast204.i, zeroinitializer ; <<4 x float>> [#uses=1]
%cmpunord.i11 = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> zeroinitializer, <4 x float> zeroinitializer, i8 3) nounwind ; <<4 x float>> [#uses=1]
+ %tmp83 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul310, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]
+ %bitcast.i3 = bitcast <4 x float> %mul310 to <4 x i32> ; <<4 x i32>> [#uses=1]
+ %andps.i5 = and <4 x i32> %bitcast.i3, zeroinitializer ; <<4 x i32>> [#uses=1]
+
+ call void null(<4 x float> %mul313, <4 x float> %cmpunord.i11, <4 x float> %tmp83, <4 x float> zeroinitializer, %struct.__ImageExecInfo* null, <4 x i32> zeroinitializer) nounwind
+
+ %tmp84 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul313, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]
+
%bitcast6.i13 = bitcast <4 x float> %cmpunord.i11 to <4 x i32> ; <<4 x i32>> [#uses=2]
%andps.i14 = add <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %bitcast6.i13 ; <<4 x i32>> [#uses=1]
%not.i16 = xor <4 x i32> %bitcast6.i13, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1]
%andnps.i17 = add <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %not.i16 ; <<4 x i32>> [#uses=1]
%orps.i18 = or <4 x i32> %andnps.i17, %andps.i14 ; <<4 x i32>> [#uses=1]
%bitcast17.i19 = bitcast <4 x i32> %orps.i18 to <4 x float> ; <<4 x float>> [#uses=1]
- %tmp83 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul310, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]
- %bitcast.i3 = bitcast <4 x float> %mul310 to <4 x i32> ; <<4 x i32>> [#uses=1]
- %andps.i5 = and <4 x i32> %bitcast.i3, zeroinitializer ; <<4 x i32>> [#uses=1]
+
%bitcast11.i6 = bitcast <4 x float> %tmp83 to <4 x i32> ; <<4 x i32>> [#uses=1]
%not.i7 = xor <4 x i32> zeroinitializer, < i32 -1, i32 -1, i32 -1, i32 -1 > ; <<4 x i32>> [#uses=1]
%andnps.i8 = and <4 x i32> %bitcast11.i6, %not.i7 ; <<4 x i32>> [#uses=1]
- call void null(<4 x float> %mul313, <4 x float> %cmpunord.i11, <4 x float> %tmp83, <4 x float> zeroinitializer, %struct.__ImageExecInfo* null, <4 x i32> zeroinitializer) nounwind
%orps.i9 = or <4 x i32> %andnps.i8, %andps.i5 ; <<4 x i32>> [#uses=1]
%bitcast17.i10 = bitcast <4 x i32> %orps.i9 to <4 x float> ; <<4 x float>> [#uses=1]
- %tmp84 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %mul313, <4 x float> zeroinitializer) nounwind ; <<4 x float>> [#uses=1]
+
%bitcast6.i = bitcast <4 x float> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=2]
%andps.i = and <4 x i32> zeroinitializer, %bitcast6.i ; <<4 x i32>> [#uses=1]
%bitcast11.i = bitcast <4 x float> %tmp84 to <4 x i32> ; <<4 x i32>> [#uses=1]
diff --git a/test/CodeGen/X86/fp-elim.ll b/test/CodeGen/X86/fp-elim.ll
index 583388c..2c50bd1 100644
--- a/test/CodeGen/X86/fp-elim.ll
+++ b/test/CodeGen/X86/fp-elim.ll
@@ -4,7 +4,7 @@
; Implement -momit-leaf-frame-pointer
; rdar://7886181
-define i32 @t1() "no-frame-pointer-elim-non-leaf"="false" nounwind readnone {
+define i32 @t1() nounwind readnone {
entry:
; FP-ELIM-LABEL: t1:
; FP-ELIM-NEXT: movl
@@ -17,7 +17,7 @@ entry:
ret i32 10
}
-define void @t2() "no-frame-pointer-elim-non-leaf"="false" nounwind {
+define void @t2() nounwind {
entry:
; FP-ELIM-LABEL: t2:
; FP-ELIM-NOT: pushl %ebp
@@ -31,7 +31,7 @@ entry:
ret void
}
-define i32 @t3() "no-frame-pointer-elim-non-leaf"="true" nounwind readnone {
+define i32 @t3() "no-frame-pointer-elim-non-leaf" nounwind readnone {
entry:
; FP-ELIM-LABEL: t3:
; FP-ELIM-NEXT: movl
@@ -44,7 +44,7 @@ entry:
ret i32 10
}
-define void @t4() "no-frame-pointer-elim-non-leaf"="true" nounwind {
+define void @t4() "no-frame-pointer-elim-non-leaf" nounwind {
entry:
; FP-ELIM-LABEL: t4:
; FP-ELIM-NEXT: pushl %ebp
diff --git a/test/CodeGen/X86/fp-une-cmp.ll b/test/CodeGen/X86/fp-une-cmp.ll
new file mode 100644
index 0000000..7f772d1
--- /dev/null
+++ b/test/CodeGen/X86/fp-une-cmp.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -march=x86 -mattr=sse4.1 | FileCheck %s
+; <rdar://problem/7859988>
+
+; Make sure we don't generate more jumps than we need to. We used to generate
+; something like this:
+;
+; jne LBB0_1
+; jnp LBB0_2
+; LBB0_1:
+; jmp LBB0_3
+; LBB0_2:
+; addsd ...
+; LBB0_3:
+;
+; Now we generate this:
+;
+; jne LBB0_2
+; jp LBB0_2
+; addsd ...
+; LBB0_2:
+
+; CHECK: func
+; CHECK: jne [[LABEL:.*]]
+; CHECK-NEXT: jp [[LABEL]]
+; CHECK-NOT: jmp
+
+define float @func(float %x, float %y) nounwind readnone optsize ssp {
+entry:
+ %0 = fpext float %x to double
+ %1 = fpext float %y to double
+ %2 = fmul double %0, %1
+ %3 = fcmp une double %2, 0.000000e+00
+ br i1 %3, label %bb2, label %bb1
+
+bb1:
+ %4 = fadd double %2, -1.000000e+00
+ br label %bb2
+
+bb2:
+ %.0.in = phi double [ %4, %bb1 ], [ %2, %entry ]
+ %.0 = fptrunc double %.0.in to float
+ ret float %.0
+}
diff --git a/test/CodeGen/X86/frame-base.ll b/test/CodeGen/X86/frame-base.ll
new file mode 100644
index 0000000..a6bd2a5
--- /dev/null
+++ b/test/CodeGen/X86/frame-base.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mtriple=x86_64-apple-macosx -o - %s | FileCheck %s
+
+; The issue here was a conflict between forming a %rip-relative lea and a
+; FrameIndex lea. The %rip sanity-checks didn't consider that a base register
+; had been set if we'd already matched a FrameIndex, when it has in reality.
+
+@var = global i32 0
+
+define void @test_frame_rip_conflict() {
+; CHECK-LABEL: test_frame_rip_conflict:
+; CHECK: leaq _var(%rip), [[TMPADDR:%r.*]]
+; CHECK: leaq {{-?[0-9]+}}(%rsp,[[TMPADDR]]),
+ %stackvar = alloca i32
+
+ %stackint = ptrtoint i32* %stackvar to i64
+ %addr = add i64 ptrtoint(i32* @var to i64), %stackint
+
+ call void @eat_i64(i64 %addr)
+ ret void
+}
+
+declare void @eat_i64(i64)
diff --git a/test/CodeGen/X86/full-lsr.ll b/test/CodeGen/X86/full-lsr.ll
index 0729dda..cbcc62a 100644
--- a/test/CodeGen/X86/full-lsr.ll
+++ b/test/CodeGen/X86/full-lsr.ll
@@ -4,7 +4,7 @@
define void @foo(float* nocapture %A, float* nocapture %B, float* nocapture %C, i32 %N) nounwind {
; ATOM: foo
; ATOM: addl
-; ATOM: leal
+; ATOM: addl
; ATOM: leal
; CHECK: foo
diff --git a/test/CodeGen/X86/gather-addresses.ll b/test/CodeGen/X86/gather-addresses.ll
index 72a5096..5f48b1e 100644
--- a/test/CodeGen/X86/gather-addresses.ll
+++ b/test/CodeGen/X86/gather-addresses.ll
@@ -1,21 +1,35 @@
-; RUN: llc -mtriple=x86_64-linux -mcpu=nehalem < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-win32 -mcpu=nehalem < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-linux -mcpu=nehalem < %s | FileCheck %s --check-prefix=LIN
+; RUN: llc -mtriple=x86_64-win32 -mcpu=nehalem < %s | FileCheck %s --check-prefix=WIN
; rdar://7398554
; When doing vector gather-scatter index calculation with 32-bit indices,
; bounce the vector off of cache rather than shuffling each individual
; element out of the index vector.
-; CHECK: andps ([[H:%rdx|%r8]]), %xmm0
-; CHECK: movaps %xmm0, {{(-24)?}}(%rsp)
-; CHECK: movslq {{(-24)?}}(%rsp), %rax
-; CHECK: movsd ([[P:%rdi|%rcx]],%rax,8), %xmm0
-; CHECK: movslq {{-20|4}}(%rsp), %rax
-; CHECK: movhpd ([[P]],%rax,8), %xmm0
-; CHECK: movslq {{-16|8}}(%rsp), %rax
-; CHECK: movsd ([[P]],%rax,8), %xmm1
-; CHECK: movslq {{-12|12}}(%rsp), %rax
-; CHECK: movhpd ([[P]],%rax,8), %xmm1
+; CHECK: foo:
+; LIN: movaps (%rsi), %xmm0
+; LIN: andps (%rdx), %xmm0
+; LIN: movaps %xmm0, -24(%rsp)
+; LIN: movslq -24(%rsp), %[[REG1:r.+]]
+; LIN: movslq -20(%rsp), %[[REG2:r.+]]
+; LIN: movslq -16(%rsp), %[[REG3:r.+]]
+; LIN: movslq -12(%rsp), %[[REG4:r.+]]
+; LIN: movsd (%rdi,%[[REG1]],8), %xmm0
+; LIN: movhpd (%rdi,%[[REG2]],8), %xmm0
+; LIN: movsd (%rdi,%[[REG3]],8), %xmm1
+; LIN: movhpd (%rdi,%[[REG4]],8), %xmm1
+
+; WIN: movaps (%rdx), %xmm0
+; WIN: andps (%r8), %xmm0
+; WIN: movaps %xmm0, (%rsp)
+; WIN: movslq (%rsp), %[[REG1:r.+]]
+; WIN: movslq 4(%rsp), %[[REG2:r.+]]
+; WIN: movslq 8(%rsp), %[[REG3:r.+]]
+; WIN: movslq 12(%rsp), %[[REG4:r.+]]
+; WIN: movsd (%rcx,%[[REG1]],8), %xmm0
+; WIN: movhpd (%rcx,%[[REG2]],8), %xmm0
+; WIN: movsd (%rcx,%[[REG3]],8), %xmm1
+; WIN: movhpd (%rcx,%[[REG4]],8), %xmm1
define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
%a = load <4 x i32>* %i
diff --git a/test/CodeGen/X86/ghc-cc.ll b/test/CodeGen/X86/ghc-cc.ll
index 0e65cfd..4dba2c0 100644
--- a/test/CodeGen/X86/ghc-cc.ll
+++ b/test/CodeGen/X86/ghc-cc.ll
@@ -28,10 +28,10 @@ entry:
define cc 10 void @foo() nounwind {
entry:
- ; CHECK: movl base, %ebx
- ; CHECK-NEXT: movl sp, %ebp
+ ; CHECK: movl r1, %esi
; CHECK-NEXT: movl hp, %edi
- ; CHECK-NEXT: movl r1, %esi
+ ; CHECK-NEXT: movl sp, %ebp
+ ; CHECK-NEXT: movl base, %ebx
%0 = load i32* @r1
%1 = load i32* @hp
%2 = load i32* @sp
@@ -42,4 +42,3 @@ entry:
}
declare cc 10 void @bar(i32, i32, i32, i32)
-
diff --git a/test/CodeGen/X86/ghc-cc64.ll b/test/CodeGen/X86/ghc-cc64.ll
index fcf7e17..403391e 100644
--- a/test/CodeGen/X86/ghc-cc64.ll
+++ b/test/CodeGen/X86/ghc-cc64.ll
@@ -41,22 +41,22 @@ entry:
define cc 10 void @foo() nounwind {
entry:
- ; CHECK: movq base(%rip), %r13
- ; CHECK-NEXT: movq sp(%rip), %rbp
- ; CHECK-NEXT: movq hp(%rip), %r12
- ; CHECK-NEXT: movq r1(%rip), %rbx
- ; CHECK-NEXT: movq r2(%rip), %r14
- ; CHECK-NEXT: movq r3(%rip), %rsi
- ; CHECK-NEXT: movq r4(%rip), %rdi
- ; CHECK-NEXT: movq r5(%rip), %r8
- ; CHECK-NEXT: movq r6(%rip), %r9
- ; CHECK-NEXT: movq splim(%rip), %r15
- ; CHECK-NEXT: movss f1(%rip), %xmm1
- ; CHECK-NEXT: movss f2(%rip), %xmm2
- ; CHECK-NEXT: movss f3(%rip), %xmm3
- ; CHECK-NEXT: movss f4(%rip), %xmm4
+ ; CHECK: movsd d2(%rip), %xmm6
; CHECK-NEXT: movsd d1(%rip), %xmm5
- ; CHECK-NEXT: movsd d2(%rip), %xmm6
+ ; CHECK-NEXT: movss f4(%rip), %xmm4
+ ; CHECK-NEXT: movss f3(%rip), %xmm3
+ ; CHECK-NEXT: movss f2(%rip), %xmm2
+ ; CHECK-NEXT: movss f1(%rip), %xmm1
+ ; CHECK-NEXT: movq splim(%rip), %r15
+ ; CHECK-NEXT: movq r6(%rip), %r9
+ ; CHECK-NEXT: movq r5(%rip), %r8
+ ; CHECK-NEXT: movq r4(%rip), %rdi
+ ; CHECK-NEXT: movq r3(%rip), %rsi
+ ; CHECK-NEXT: movq r2(%rip), %r14
+ ; CHECK-NEXT: movq r1(%rip), %rbx
+ ; CHECK-NEXT: movq hp(%rip), %r12
+ ; CHECK-NEXT: movq sp(%rip), %rbp
+ ; CHECK-NEXT: movq base(%rip), %r13
%0 = load double* @d2
%1 = load double* @d1
%2 = load float* @f4
@@ -83,4 +83,3 @@ entry:
declare cc 10 void @bar(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64,
float, float, float, float, double, double)
-
diff --git a/test/CodeGen/X86/global-sections.ll b/test/CodeGen/X86/global-sections.ll
index 194f597..d8743ac 100644
--- a/test/CodeGen/X86/global-sections.ll
+++ b/test/CodeGen/X86/global-sections.ll
@@ -65,10 +65,10 @@
; PR4584
@"foo bar" = linkonce global i32 42
-; LINUX: .type foo_20_bar,@object
-; LINUX: .section .data.foo_20_bar,"aGw",@progbits,foo_20_bar,comdat
-; LINUX: .weak foo_20_bar
-; LINUX: foo_20_bar:
+; LINUX: .type "foo bar",@object
+; LINUX: .section ".data.foo bar","aGw",@progbits,"foo bar",comdat
+; LINUX: .weak "foo bar"
+; LINUX: "foo bar":
; DARWIN: .section __DATA,__datacoal_nt,coalesced
; DARWIN: .globl "_foo bar"
diff --git a/test/CodeGen/X86/h-register-addressing-32.ll b/test/CodeGen/X86/h-register-addressing-32.ll
index 968a9e8..68e8c60 100644
--- a/test/CodeGen/X86/h-register-addressing-32.ll
+++ b/test/CodeGen/X86/h-register-addressing-32.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep "movzbl %[abcd]h," | count 7
+; RUN: llc < %s -march=x86 -mattr=-bmi | FileCheck %s
; Use h-register extract and zero-extend.
@@ -9,6 +9,9 @@ define double @foo8(double* nocapture inreg %p, i32 inreg %x) nounwind readonly
%t3 = load double* %t2, align 8
ret double %t3
}
+; CHECK: foo8:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
define float @foo4(float* nocapture inreg %p, i32 inreg %x) nounwind readonly {
%t0 = lshr i32 %x, 8
%t1 = and i32 %t0, 255
@@ -16,6 +19,9 @@ define float @foo4(float* nocapture inreg %p, i32 inreg %x) nounwind readonly {
%t3 = load float* %t2, align 8
ret float %t3
}
+; CHECK: foo4:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
define i16 @foo2(i16* nocapture inreg %p, i32 inreg %x) nounwind readonly {
%t0 = lshr i32 %x, 8
%t1 = and i32 %t0, 255
@@ -23,6 +29,9 @@ define i16 @foo2(i16* nocapture inreg %p, i32 inreg %x) nounwind readonly {
%t3 = load i16* %t2, align 8
ret i16 %t3
}
+; CHECK: foo2:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
define i8 @foo1(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
%t0 = lshr i32 %x, 8
%t1 = and i32 %t0, 255
@@ -30,6 +39,9 @@ define i8 @foo1(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
%t3 = load i8* %t2, align 8
ret i8 %t3
}
+; CHECK: foo1:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
define i8 @bar8(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
%t0 = lshr i32 %x, 5
%t1 = and i32 %t0, 2040
@@ -37,6 +49,9 @@ define i8 @bar8(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
%t3 = load i8* %t2, align 8
ret i8 %t3
}
+; CHECK: bar8:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
define i8 @bar4(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
%t0 = lshr i32 %x, 6
%t1 = and i32 %t0, 1020
@@ -44,6 +59,9 @@ define i8 @bar4(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
%t3 = load i8* %t2, align 8
ret i8 %t3
}
+; CHECK: bar4:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
define i8 @bar2(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
%t0 = lshr i32 %x, 7
%t1 = and i32 %t0, 510
@@ -51,3 +69,6 @@ define i8 @bar2(i8* nocapture inreg %p, i32 inreg %x) nounwind readonly {
%t3 = load i8* %t2, align 8
ret i8 %t3
}
+; CHECK: bar2:
+; CHECK: movzbl %{{[abcd]}}h, %e
+; CHECK: ret
diff --git a/test/CodeGen/X86/h-register-addressing-64.ll b/test/CodeGen/X86/h-register-addressing-64.ll
index a19fca5..3f549d2 100644
--- a/test/CodeGen/X86/h-register-addressing-64.ll
+++ b/test/CodeGen/X86/h-register-addressing-64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep "movzbl %[abcd]h," | count 7
+; RUN: llc < %s -march=x86-64 -mattr=-bmi | FileCheck %s
; Use h-register extract and zero-extend.
@@ -9,6 +9,9 @@ define double @foo8(double* nocapture inreg %p, i64 inreg %x) nounwind readonly
%t3 = load double* %t2, align 8
ret double %t3
}
+; CHECK: foo8:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
define float @foo4(float* nocapture inreg %p, i64 inreg %x) nounwind readonly {
%t0 = lshr i64 %x, 8
%t1 = and i64 %t0, 255
@@ -16,6 +19,9 @@ define float @foo4(float* nocapture inreg %p, i64 inreg %x) nounwind readonly {
%t3 = load float* %t2, align 8
ret float %t3
}
+; CHECK: foo4:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
define i16 @foo2(i16* nocapture inreg %p, i64 inreg %x) nounwind readonly {
%t0 = lshr i64 %x, 8
%t1 = and i64 %t0, 255
@@ -23,6 +29,9 @@ define i16 @foo2(i16* nocapture inreg %p, i64 inreg %x) nounwind readonly {
%t3 = load i16* %t2, align 8
ret i16 %t3
}
+; CHECK: foo2:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
define i8 @foo1(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
%t0 = lshr i64 %x, 8
%t1 = and i64 %t0, 255
@@ -30,6 +39,9 @@ define i8 @foo1(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
%t3 = load i8* %t2, align 8
ret i8 %t3
}
+; CHECK: foo1:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
define i8 @bar8(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
%t0 = lshr i64 %x, 5
%t1 = and i64 %t0, 2040
@@ -37,6 +49,9 @@ define i8 @bar8(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
%t3 = load i8* %t2, align 8
ret i8 %t3
}
+; CHECK: bar8:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
define i8 @bar4(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
%t0 = lshr i64 %x, 6
%t1 = and i64 %t0, 1020
@@ -44,6 +59,9 @@ define i8 @bar4(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
%t3 = load i8* %t2, align 8
ret i8 %t3
}
+; CHECK: bar4:
+; CHECK: movzbl %{{[abcd]}}h, %e
+
define i8 @bar2(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
%t0 = lshr i64 %x, 7
%t1 = and i64 %t0, 510
@@ -51,3 +69,6 @@ define i8 @bar2(i8* nocapture inreg %p, i64 inreg %x) nounwind readonly {
%t3 = load i8* %t2, align 8
ret i8 %t3
}
+; CHECK: bar2:
+; CHECK: movzbl %{{[abcd]}}h, %e
+; CHECK: ret
diff --git a/test/CodeGen/X86/h-registers-0.ll b/test/CodeGen/X86/h-registers-0.ll
index 71b3b43..6a5ccaa 100644
--- a/test/CodeGen/X86/h-registers-0.ll
+++ b/test/CodeGen/X86/h-registers-0.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=X86-64
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s -check-prefix=WIN64
-; RUN: llc < %s -march=x86 | FileCheck %s -check-prefix=X86-32
+; RUN: llc < %s -mattr=-bmi -mtriple=x86_64-linux | FileCheck %s -check-prefix=X86-64
+; RUN: llc < %s -mattr=-bmi -mtriple=x86_64-win32 | FileCheck %s -check-prefix=WIN64
+; RUN: llc < %s -mattr=-bmi -march=x86 | FileCheck %s -check-prefix=X86-32
; Use h registers. On x86-64, codegen doesn't support general allocation
; of h registers yet, due to x86 encoding complications.
diff --git a/test/CodeGen/X86/h-registers-1.ll b/test/CodeGen/X86/h-registers-1.ll
index 903c453..7254325 100644
--- a/test/CodeGen/X86/h-registers-1.ll
+++ b/test/CodeGen/X86/h-registers-1.ll
@@ -1,12 +1,21 @@
-; RUN: llc < %s -mtriple=x86_64-linux > %t
-; RUN: grep "movzbl %[abcd]h," %t | count 8
-; RUN: grep "%[abcd]h" %t | not grep "%r[[:digit:]]*d"
+; RUN: llc -mattr=-bmi < %s -mtriple=x86_64-linux | FileCheck %s
; LLVM creates virtual registers for values live across blocks
; based on the type of the value. Make sure that the extracts
; here use the GR64_NOREX register class for their result,
; instead of plain GR64.
+; CHECK: foo:
+; CHECK: movzbl %{{[abcd]}}h, %e
+; CHECK: movzbl %{{[abcd]}}h, %e
+; CHECK: movzbl %{{[abcd]}}h, %e
+; CHECK: movzbl %{{[abcd]}}h, %e
+; CHECK: movzbl %{{[abcd]}}h, %e
+; CHECK: movzbl %{{[abcd]}}h, %e
+; CHECK: movzbl %{{[abcd]}}h, %e
+; CHECK: movzbl %{{[abcd]}}h, %e
+; CHECK: ret
+
define i64 @foo(i64 %a, i64 %b, i64 %c, i64 %d,
i64 %e, i64 %f, i64 %g, i64 %h) {
%sa = lshr i64 %a, 8
diff --git a/test/CodeGen/X86/hipe-cc.ll b/test/CodeGen/X86/hipe-cc.ll
index 76d17a0..b34417e 100644
--- a/test/CodeGen/X86/hipe-cc.ll
+++ b/test/CodeGen/X86/hipe-cc.ll
@@ -49,10 +49,10 @@ entry:
store i32 %arg1, i32* %arg1_var
store i32 %arg2, i32* %arg2_var
- ; CHECK: movl 4(%esp), %edx
- ; CHECK-NEXT: movl 8(%esp), %eax
+ ; CHECK: movl 16(%esp), %esi
; CHECK-NEXT: movl 12(%esp), %ebp
- ; CHECK-NEXT: movl 16(%esp), %esi
+ ; CHECK-NEXT: movl 8(%esp), %eax
+ ; CHECK-NEXT: movl 4(%esp), %edx
%0 = load i32* %hp_var
%1 = load i32* %p_var
%2 = load i32* %arg0_var
diff --git a/test/CodeGen/X86/hipe-cc64.ll b/test/CodeGen/X86/hipe-cc64.ll
index 5dbb5a2..27e1c72 100644
--- a/test/CodeGen/X86/hipe-cc64.ll
+++ b/test/CodeGen/X86/hipe-cc64.ll
@@ -5,10 +5,10 @@
define void @zap(i64 %a, i64 %b) nounwind {
entry:
; CHECK: movq %rsi, %rax
- ; CHECK-NEXT: movq %rdi, %rsi
- ; CHECK-NEXT: movq %rax, %rdx
; CHECK-NEXT: movl $8, %ecx
; CHECK-NEXT: movl $9, %r8d
+ ; CHECK-NEXT: movq %rdi, %rsi
+ ; CHECK-NEXT: movq %rax, %rdx
; CHECK-NEXT: callq addfour
%0 = call cc 11 {i64, i64, i64} @addfour(i64 undef, i64 undef, i64 %a, i64 %b, i64 8, i64 9)
%res = extractvalue {i64, i64, i64} %0, 2
@@ -57,11 +57,11 @@ entry:
store i64 %arg2, i64* %arg2_var
store i64 %arg3, i64* %arg3_var
- ; CHECK: movq 8(%rsp), %rcx
- ; CHECK-NEXT: movq 16(%rsp), %rdx
- ; CHECK-NEXT: movq 24(%rsp), %rsi
+ ; CHECK: movq 40(%rsp), %r15
; CHECK-NEXT: movq 32(%rsp), %rbp
- ; CHECK-NEXT: movq 40(%rsp), %r15
+ ; CHECK-NEXT: movq 24(%rsp), %rsi
+ ; CHECK-NEXT: movq 16(%rsp), %rdx
+ ; CHECK-NEXT: movq 8(%rsp), %rcx
%0 = load i64* %hp_var
%1 = load i64* %p_var
%2 = load i64* %arg0_var
diff --git a/test/CodeGen/X86/hoist-common.ll b/test/CodeGen/X86/hoist-common.ll
index 6b26876..01d1b8c 100644
--- a/test/CodeGen/X86/hoist-common.ll
+++ b/test/CodeGen/X86/hoist-common.ll
@@ -1,4 +1,14 @@
; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck %s
+; This is supposed to be testing BranchFolding's common
+; code hoisting logic, but has been erroneously passing due
+; to there being a redundant xorl in the entry block
+; and no common code to hoist.
+; However, now that MachineSink sinks the redundant xor
+; hoist-common looks at it and rejects it for hoisting,
+; which causes this test to fail.
+; Since it seems this test is broken, marking XFAIL for now
+; until someone decides to remove it or fix what it tests.
+; XFAIL: *
; Common "xorb al, al" instruction in the two successor blocks should be
; moved to the entry block above the test + je.
diff --git a/test/CodeGen/X86/i128-mul.ll b/test/CodeGen/X86/i128-mul.ll
index c0b85df..8cfda85 100644
--- a/test/CodeGen/X86/i128-mul.ll
+++ b/test/CodeGen/X86/i128-mul.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
; PR1198
define i64 @foo(i64 %x, i64 %y) {
diff --git a/test/CodeGen/X86/i486-fence-loop.ll b/test/CodeGen/X86/i486-fence-loop.ll
new file mode 100644
index 0000000..d809619
--- /dev/null
+++ b/test/CodeGen/X86/i486-fence-loop.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=x86 -mcpu=i486 -o - %s | FileCheck %s
+
+; Main test here was that ISelDAG could cope with a MachineNode in the chain
+; from the first load to the "X86ISD::SUB". Previously it thought that meant no
+; cycle could be formed so it tried to use "sub (%eax), [[RHS]]".
+
+define void @gst_atomic_queue_push(i32* %addr) {
+; CHECK-LABEL: gst_atomic_queue_push:
+; CHECK: movl (%eax), [[LHS:%e[a-z]+]]
+; CHECK: lock
+; CHECK-NEXT: orl
+; CHECK: movl (%eax), [[RHS:%e[a-z]+]]
+; CHECK: cmpl [[LHS]], [[RHS]]
+
+entry:
+ br label %while.body
+
+while.body:
+ %0 = load volatile i32* %addr, align 4
+ fence seq_cst
+ %1 = load volatile i32* %addr, align 4
+ %cmp = icmp sgt i32 %1, %0
+ br i1 %cmp, label %while.body, label %if.then
+
+if.then:
+ ret void
+} \ No newline at end of file
diff --git a/test/CodeGen/X86/ident-metadata.ll b/test/CodeGen/X86/ident-metadata.ll
new file mode 100644
index 0000000..a568673
--- /dev/null
+++ b/test/CodeGen/X86/ident-metadata.ll
@@ -0,0 +1,9 @@
+; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s
+; Verify that llvm.ident metadata is emitted as .ident
+; directives in assembly files, and in the .comment section in ELF object files.
+
+; CHECK: .ident "clang version x.x"
+; CHECK-NEXT: .ident "something else"
+!llvm.ident = !{!0, !1}
+!0 = metadata !{metadata !"clang version x.x"}
+!1 = metadata !{metadata !"something else"}
diff --git a/test/CodeGen/X86/inline-asm-error.ll b/test/CodeGen/X86/inline-asm-error.ll
index 747a589..31fb190 100644
--- a/test/CodeGen/X86/inline-asm-error.ll
+++ b/test/CodeGen/X86/inline-asm-error.ll
@@ -6,7 +6,7 @@
; RUN: FileCheck %s < %t3
; The register allocator must fail on this function.
-; CHECK: error: ran out of registers during register allocation
+; CHECK: error: inline assembly requires more registers than available
define void @f(i32 %x0, i32 %x1, i32 %x2, i32 %x3, i32 %x4, i32 %x5, i32 %x6, i32 %x7, i32 %x8, i32 %x9) nounwind ssp {
entry:
diff --git a/test/CodeGen/X86/inline-asm-flag-clobber.ll b/test/CodeGen/X86/inline-asm-flag-clobber.ll
index 51ea843..45f4d2f 100644
--- a/test/CodeGen/X86/inline-asm-flag-clobber.ll
+++ b/test/CodeGen/X86/inline-asm-flag-clobber.ll
@@ -2,18 +2,31 @@
; PR3701
define i64 @t(i64* %arg) nounwind {
- br i1 true, label %1, label %5
+ br i1 true, label %1, label %5
-; <label>:1 ; preds = %0
- %2 = icmp eq i64* null, %arg ; <i1> [#uses=1]
- %3 = tail call i64* asm sideeffect "movl %fs:0,$0", "=r,~{dirflag},~{fpsr},~{flags}"() nounwind ; <%struct.thread*> [#uses=0]
+; <label>:1 ; preds = %0
+ %2 = icmp eq i64* null, %arg ; <i1> [#uses=1]
+ %3 = tail call i64* asm sideeffect "movl %fs:0,$0", "=r,~{dirflag},~{fpsr},~{flags}"() nounwind ; <%struct.thread*> [#uses=0]
; CHECK: test
; CHECK-NEXT: j
- br i1 %2, label %4, label %5
+ br i1 %2, label %4, label %5
-; <label>:4 ; preds = %1
- ret i64 1
+; <label>:4 ; preds = %1
+ ret i64 1
-; <label>:5 ; preds = %1
- ret i64 0
+; <label>:5 ; preds = %1
+ ret i64 0
}
+
+; Make sure that we translate this to the bswap intrinsic which lowers down without the
+; inline assembly.
+; CHECK-NOT: #APP
+define i32 @s(i32 %argc, i8** nocapture %argv) unnamed_addr nounwind {
+entry:
+ %0 = trunc i32 %argc to i16
+ %asmtmp = tail call i16 asm "rorw $$8, ${0:w}", "=r,0,~{fpsr},~{flags},~{cc}"(i16 %0) nounwind, !srcloc !0
+ %1 = zext i16 %asmtmp to i32
+ ret i32 %1
+}
+
+!0 = metadata !{i64 935930}
diff --git a/test/CodeGen/X86/ins_subreg_coalesce-1.ll b/test/CodeGen/X86/ins_subreg_coalesce-1.ll
index bec98a2..a74e3f2 100644
--- a/test/CodeGen/X86/ins_subreg_coalesce-1.ll
+++ b/test/CodeGen/X86/ins_subreg_coalesce-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=-bmi | FileCheck %s
define fastcc i32 @t() nounwind {
entry:
diff --git a/test/CodeGen/X86/isel-optnone.ll b/test/CodeGen/X86/isel-optnone.ll
new file mode 100644
index 0000000..d2f0628
--- /dev/null
+++ b/test/CodeGen/X86/isel-optnone.ll
@@ -0,0 +1,42 @@
+; RUN: llc -O2 -march=x86 < %s | FileCheck %s
+
+define i32* @fooOptnone(i32* %p, i32* %q, i32** %z) #0 {
+entry:
+ %r = load i32* %p
+ %s = load i32* %q
+ %y = load i32** %z
+
+ %t0 = add i32 %r, %s
+ %t1 = add i32 %t0, 1
+ %t2 = getelementptr i32* %y, i32 1
+ %t3 = getelementptr i32* %t2, i32 %t1
+
+ ret i32* %t3
+
+; 'optnone' should use fast-isel which will not produce 'lea'.
+; CHECK-LABEL: fooOptnone:
+; CHECK-NOT: lea
+; CHECK: ret
+}
+
+define i32* @fooNormal(i32* %p, i32* %q, i32** %z) #1 {
+entry:
+ %r = load i32* %p
+ %s = load i32* %q
+ %y = load i32** %z
+
+ %t0 = add i32 %r, %s
+ %t1 = add i32 %t0, 1
+ %t2 = getelementptr i32* %y, i32 1
+ %t3 = getelementptr i32* %t2, i32 %t1
+
+ ret i32* %t3
+
+; Normal ISel will produce 'lea'.
+; CHECK-LABEL: fooNormal:
+; CHECK: lea
+; CHECK: ret
+}
+
+attributes #0 = { nounwind optnone noinline }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/X86/large-gep-chain.ll b/test/CodeGen/X86/large-gep-chain.ll
new file mode 100644
index 0000000..5cf4661
--- /dev/null
+++ b/test/CodeGen/X86/large-gep-chain.ll
@@ -0,0 +1,25607 @@
+; RUN: llc < %s -O0 -march x86 -o /dev/null
+; <rdar://problem/12445434>
+
+%0 = type { i32, float* }
+
+@0 = external unnamed_addr constant [27 x i8], align 1
+@1 = external unnamed_addr constant [26 x i8], align 1
+@2 = external unnamed_addr constant [18 x i8], align 1
+@3 = external unnamed_addr constant [15 x i8], align 1
+@4 = external unnamed_addr constant [20 x i8], align 1
+@5 = external unnamed_addr constant [21 x i8], align 1
+@6 = external unnamed_addr constant [12 x i8], align 1
+@7 = external unnamed_addr constant [27 x i8], align 1
+@8 = external unnamed_addr constant [63 x i8], align 1
+
+define void @main() uwtable ssp {
+bb:
+ br i1 undef, label %bb1, label %bb2
+
+bb1: ; preds = %bb
+ br label %bb25362
+
+bb2: ; preds = %bb
+ %tmp = getelementptr inbounds float* null, i64 1
+ %tmp3 = getelementptr inbounds float* %tmp, i64 1
+ %tmp4 = getelementptr inbounds float* %tmp3, i64 1
+ %tmp5 = getelementptr inbounds float* %tmp4, i64 1
+ %tmp6 = getelementptr inbounds float* %tmp5, i64 1
+ %tmp7 = getelementptr inbounds float* %tmp6, i64 1
+ %tmp8 = getelementptr inbounds float* %tmp7, i64 1
+ %tmp9 = getelementptr inbounds float* %tmp8, i64 1
+ %tmp10 = getelementptr inbounds float* %tmp9, i64 1
+ %tmp11 = getelementptr inbounds float* %tmp10, i64 1
+ %tmp12 = getelementptr inbounds float* %tmp11, i64 1
+ %tmp13 = getelementptr inbounds float* %tmp12, i64 1
+ %tmp14 = getelementptr inbounds float* %tmp13, i64 1
+ %tmp15 = getelementptr inbounds float* %tmp14, i64 1
+ %tmp16 = getelementptr inbounds float* %tmp15, i64 1
+ %tmp17 = getelementptr inbounds float* %tmp16, i64 1
+ %tmp18 = getelementptr inbounds float* %tmp17, i64 1
+ %tmp19 = getelementptr inbounds float* %tmp18, i64 1
+ %tmp20 = getelementptr inbounds float* %tmp19, i64 1
+ %tmp21 = getelementptr inbounds float* %tmp20, i64 1
+ %tmp22 = getelementptr inbounds float* %tmp21, i64 1
+ %tmp23 = getelementptr inbounds float* %tmp22, i64 1
+ %tmp24 = getelementptr inbounds float* %tmp23, i64 1
+ %tmp25 = getelementptr inbounds float* %tmp24, i64 1
+ %tmp26 = getelementptr inbounds float* %tmp25, i64 1
+ %tmp27 = getelementptr inbounds float* %tmp26, i64 1
+ %tmp28 = getelementptr inbounds float* %tmp27, i64 1
+ %tmp29 = getelementptr inbounds float* %tmp28, i64 1
+ %tmp30 = getelementptr inbounds float* %tmp29, i64 1
+ %tmp31 = getelementptr inbounds float* %tmp30, i64 1
+ %tmp32 = getelementptr inbounds float* %tmp31, i64 1
+ %tmp33 = getelementptr inbounds float* %tmp32, i64 1
+ %tmp34 = getelementptr inbounds float* %tmp33, i64 1
+ %tmp35 = getelementptr inbounds float* %tmp34, i64 1
+ %tmp36 = getelementptr inbounds float* %tmp35, i64 1
+ %tmp37 = getelementptr inbounds float* %tmp36, i64 1
+ %tmp38 = getelementptr inbounds float* %tmp37, i64 1
+ %tmp39 = getelementptr inbounds float* %tmp38, i64 1
+ %tmp40 = getelementptr inbounds float* %tmp39, i64 1
+ %tmp41 = getelementptr inbounds float* %tmp40, i64 1
+ %tmp42 = getelementptr inbounds float* %tmp41, i64 1
+ %tmp43 = getelementptr inbounds float* %tmp42, i64 1
+ %tmp44 = getelementptr inbounds float* %tmp43, i64 1
+ %tmp45 = getelementptr inbounds float* %tmp44, i64 1
+ %tmp46 = getelementptr inbounds float* %tmp45, i64 1
+ %tmp47 = getelementptr inbounds float* %tmp46, i64 1
+ %tmp48 = getelementptr inbounds float* %tmp47, i64 1
+ %tmp49 = getelementptr inbounds float* %tmp48, i64 1
+ %tmp50 = getelementptr inbounds float* %tmp49, i64 1
+ %tmp51 = getelementptr inbounds float* %tmp50, i64 1
+ %tmp52 = getelementptr inbounds float* %tmp51, i64 1
+ %tmp53 = getelementptr inbounds float* %tmp52, i64 1
+ %tmp54 = getelementptr inbounds float* %tmp53, i64 1
+ %tmp55 = getelementptr inbounds float* %tmp54, i64 1
+ %tmp56 = getelementptr inbounds float* %tmp55, i64 1
+ %tmp57 = getelementptr inbounds float* %tmp56, i64 1
+ %tmp58 = getelementptr inbounds float* %tmp57, i64 1
+ %tmp59 = getelementptr inbounds float* %tmp58, i64 1
+ %tmp60 = getelementptr inbounds float* %tmp59, i64 1
+ %tmp61 = getelementptr inbounds float* %tmp60, i64 1
+ %tmp62 = getelementptr inbounds float* %tmp61, i64 1
+ %tmp63 = getelementptr inbounds float* %tmp62, i64 1
+ %tmp64 = getelementptr inbounds float* %tmp63, i64 1
+ %tmp65 = getelementptr inbounds float* %tmp64, i64 1
+ %tmp66 = getelementptr inbounds float* %tmp65, i64 1
+ %tmp67 = getelementptr inbounds float* %tmp66, i64 1
+ %tmp68 = getelementptr inbounds float* %tmp67, i64 1
+ %tmp69 = getelementptr inbounds float* %tmp68, i64 1
+ %tmp70 = getelementptr inbounds float* %tmp69, i64 1
+ %tmp71 = getelementptr inbounds float* %tmp70, i64 1
+ %tmp72 = getelementptr inbounds float* %tmp71, i64 1
+ %tmp73 = getelementptr inbounds float* %tmp72, i64 1
+ %tmp74 = getelementptr inbounds float* %tmp73, i64 1
+ %tmp75 = getelementptr inbounds float* %tmp74, i64 1
+ %tmp76 = getelementptr inbounds float* %tmp75, i64 1
+ %tmp77 = getelementptr inbounds float* %tmp76, i64 1
+ %tmp78 = getelementptr inbounds float* %tmp77, i64 1
+ %tmp79 = getelementptr inbounds float* %tmp78, i64 1
+ %tmp80 = getelementptr inbounds float* %tmp79, i64 1
+ %tmp81 = getelementptr inbounds float* %tmp80, i64 1
+ %tmp82 = getelementptr inbounds float* %tmp81, i64 1
+ %tmp83 = getelementptr inbounds float* %tmp82, i64 1
+ %tmp84 = getelementptr inbounds float* %tmp83, i64 1
+ %tmp85 = getelementptr inbounds float* %tmp84, i64 1
+ %tmp86 = getelementptr inbounds float* %tmp85, i64 1
+ %tmp87 = getelementptr inbounds float* %tmp86, i64 1
+ %tmp88 = getelementptr inbounds float* %tmp87, i64 1
+ %tmp89 = getelementptr inbounds float* %tmp88, i64 1
+ %tmp90 = getelementptr inbounds float* %tmp89, i64 1
+ %tmp91 = getelementptr inbounds float* %tmp90, i64 1
+ %tmp92 = getelementptr inbounds float* %tmp91, i64 1
+ %tmp93 = getelementptr inbounds float* %tmp92, i64 1
+ %tmp94 = getelementptr inbounds float* %tmp93, i64 1
+ %tmp95 = getelementptr inbounds float* %tmp94, i64 1
+ %tmp96 = getelementptr inbounds float* %tmp95, i64 1
+ %tmp97 = getelementptr inbounds float* %tmp96, i64 1
+ %tmp98 = getelementptr inbounds float* %tmp97, i64 1
+ %tmp99 = getelementptr inbounds float* %tmp98, i64 1
+ %tmp100 = getelementptr inbounds float* %tmp99, i64 1
+ %tmp101 = getelementptr inbounds float* %tmp100, i64 1
+ %tmp102 = getelementptr inbounds float* %tmp101, i64 1
+ %tmp103 = getelementptr inbounds float* %tmp102, i64 1
+ %tmp104 = getelementptr inbounds float* %tmp103, i64 1
+ %tmp105 = getelementptr inbounds float* %tmp104, i64 1
+ %tmp106 = getelementptr inbounds float* %tmp105, i64 1
+ %tmp107 = getelementptr inbounds float* %tmp106, i64 1
+ %tmp108 = getelementptr inbounds float* %tmp107, i64 1
+ %tmp109 = getelementptr inbounds float* %tmp108, i64 1
+ %tmp110 = getelementptr inbounds float* %tmp109, i64 1
+ %tmp111 = getelementptr inbounds float* %tmp110, i64 1
+ %tmp112 = getelementptr inbounds float* %tmp111, i64 1
+ %tmp113 = getelementptr inbounds float* %tmp112, i64 1
+ %tmp114 = getelementptr inbounds float* %tmp113, i64 1
+ %tmp115 = getelementptr inbounds float* %tmp114, i64 1
+ %tmp116 = getelementptr inbounds float* %tmp115, i64 1
+ %tmp117 = getelementptr inbounds float* %tmp116, i64 1
+ %tmp118 = getelementptr inbounds float* %tmp117, i64 1
+ %tmp119 = getelementptr inbounds float* %tmp118, i64 1
+ %tmp120 = getelementptr inbounds float* %tmp119, i64 1
+ %tmp121 = getelementptr inbounds float* %tmp120, i64 1
+ %tmp122 = getelementptr inbounds float* %tmp121, i64 1
+ %tmp123 = getelementptr inbounds float* %tmp122, i64 1
+ %tmp124 = getelementptr inbounds float* %tmp123, i64 1
+ %tmp125 = getelementptr inbounds float* %tmp124, i64 1
+ %tmp126 = getelementptr inbounds float* %tmp125, i64 1
+ %tmp127 = getelementptr inbounds float* %tmp126, i64 1
+ %tmp128 = getelementptr inbounds float* %tmp127, i64 1
+ %tmp129 = getelementptr inbounds float* %tmp128, i64 1
+ %tmp130 = getelementptr inbounds float* %tmp129, i64 1
+ %tmp131 = getelementptr inbounds float* %tmp130, i64 1
+ %tmp132 = getelementptr inbounds float* %tmp131, i64 1
+ %tmp133 = getelementptr inbounds float* %tmp132, i64 1
+ %tmp134 = getelementptr inbounds float* %tmp133, i64 1
+ %tmp135 = getelementptr inbounds float* %tmp134, i64 1
+ %tmp136 = getelementptr inbounds float* %tmp135, i64 1
+ %tmp137 = getelementptr inbounds float* %tmp136, i64 1
+ %tmp138 = getelementptr inbounds float* %tmp137, i64 1
+ %tmp139 = getelementptr inbounds float* %tmp138, i64 1
+ %tmp140 = getelementptr inbounds float* %tmp139, i64 1
+ %tmp141 = getelementptr inbounds float* %tmp140, i64 1
+ %tmp142 = getelementptr inbounds float* %tmp141, i64 1
+ %tmp143 = getelementptr inbounds float* %tmp142, i64 1
+ %tmp144 = getelementptr inbounds float* %tmp143, i64 1
+ %tmp145 = getelementptr inbounds float* %tmp144, i64 1
+ %tmp146 = getelementptr inbounds float* %tmp145, i64 1
+ %tmp147 = getelementptr inbounds float* %tmp146, i64 1
+ %tmp148 = getelementptr inbounds float* %tmp147, i64 1
+ %tmp149 = getelementptr inbounds float* %tmp148, i64 1
+ %tmp150 = getelementptr inbounds float* %tmp149, i64 1
+ %tmp151 = getelementptr inbounds float* %tmp150, i64 1
+ %tmp152 = getelementptr inbounds float* %tmp151, i64 1
+ %tmp153 = getelementptr inbounds float* %tmp152, i64 1
+ %tmp154 = getelementptr inbounds float* %tmp153, i64 1
+ %tmp155 = getelementptr inbounds float* %tmp154, i64 1
+ %tmp156 = getelementptr inbounds float* %tmp155, i64 1
+ %tmp157 = getelementptr inbounds float* %tmp156, i64 1
+ %tmp158 = getelementptr inbounds float* %tmp157, i64 1
+ %tmp159 = getelementptr inbounds float* %tmp158, i64 1
+ %tmp160 = getelementptr inbounds float* %tmp159, i64 1
+ %tmp161 = getelementptr inbounds float* %tmp160, i64 1
+ %tmp162 = getelementptr inbounds float* %tmp161, i64 1
+ %tmp163 = getelementptr inbounds float* %tmp162, i64 1
+ %tmp164 = getelementptr inbounds float* %tmp163, i64 1
+ %tmp165 = getelementptr inbounds float* %tmp164, i64 1
+ %tmp166 = getelementptr inbounds float* %tmp165, i64 1
+ %tmp167 = getelementptr inbounds float* %tmp166, i64 1
+ %tmp168 = getelementptr inbounds float* %tmp167, i64 1
+ %tmp169 = getelementptr inbounds float* %tmp168, i64 1
+ %tmp170 = getelementptr inbounds float* %tmp169, i64 1
+ %tmp171 = getelementptr inbounds float* %tmp170, i64 1
+ %tmp172 = getelementptr inbounds float* %tmp171, i64 1
+ %tmp173 = getelementptr inbounds float* %tmp172, i64 1
+ %tmp174 = getelementptr inbounds float* %tmp173, i64 1
+ %tmp175 = getelementptr inbounds float* %tmp174, i64 1
+ %tmp176 = getelementptr inbounds float* %tmp175, i64 1
+ %tmp177 = getelementptr inbounds float* %tmp176, i64 1
+ %tmp178 = getelementptr inbounds float* %tmp177, i64 1
+ %tmp179 = getelementptr inbounds float* %tmp178, i64 1
+ %tmp180 = getelementptr inbounds float* %tmp179, i64 1
+ %tmp181 = getelementptr inbounds float* %tmp180, i64 1
+ %tmp182 = getelementptr inbounds float* %tmp181, i64 1
+ %tmp183 = getelementptr inbounds float* %tmp182, i64 1
+ %tmp184 = getelementptr inbounds float* %tmp183, i64 1
+ %tmp185 = getelementptr inbounds float* %tmp184, i64 1
+ %tmp186 = getelementptr inbounds float* %tmp185, i64 1
+ %tmp187 = getelementptr inbounds float* %tmp186, i64 1
+ %tmp188 = getelementptr inbounds float* %tmp187, i64 1
+ %tmp189 = getelementptr inbounds float* %tmp188, i64 1
+ %tmp190 = getelementptr inbounds float* %tmp189, i64 1
+ %tmp191 = getelementptr inbounds float* %tmp190, i64 1
+ %tmp192 = getelementptr inbounds float* %tmp191, i64 1
+ %tmp193 = getelementptr inbounds float* %tmp192, i64 1
+ %tmp194 = getelementptr inbounds float* %tmp193, i64 1
+ %tmp195 = getelementptr inbounds float* %tmp194, i64 1
+ %tmp196 = getelementptr inbounds float* %tmp195, i64 1
+ %tmp197 = getelementptr inbounds float* %tmp196, i64 1
+ %tmp198 = getelementptr inbounds float* %tmp197, i64 1
+ %tmp199 = getelementptr inbounds float* %tmp198, i64 1
+ %tmp200 = getelementptr inbounds float* %tmp199, i64 1
+ %tmp201 = getelementptr inbounds float* %tmp200, i64 1
+ %tmp202 = getelementptr inbounds float* %tmp201, i64 1
+ %tmp203 = getelementptr inbounds float* %tmp202, i64 1
+ %tmp204 = getelementptr inbounds float* %tmp203, i64 1
+ %tmp205 = getelementptr inbounds float* %tmp204, i64 1
+ %tmp206 = getelementptr inbounds float* %tmp205, i64 1
+ %tmp207 = getelementptr inbounds float* %tmp206, i64 1
+ %tmp208 = getelementptr inbounds float* %tmp207, i64 1
+ %tmp209 = getelementptr inbounds float* %tmp208, i64 1
+ %tmp210 = getelementptr inbounds float* %tmp209, i64 1
+ %tmp211 = getelementptr inbounds float* %tmp210, i64 1
+ %tmp212 = getelementptr inbounds float* %tmp211, i64 1
+ %tmp213 = getelementptr inbounds float* %tmp212, i64 1
+ %tmp214 = getelementptr inbounds float* %tmp213, i64 1
+ %tmp215 = getelementptr inbounds float* %tmp214, i64 1
+ %tmp216 = getelementptr inbounds float* %tmp215, i64 1
+ %tmp217 = getelementptr inbounds float* %tmp216, i64 1
+ %tmp218 = getelementptr inbounds float* %tmp217, i64 1
+ %tmp219 = getelementptr inbounds float* %tmp218, i64 1
+ %tmp220 = getelementptr inbounds float* %tmp219, i64 1
+ %tmp221 = getelementptr inbounds float* %tmp220, i64 1
+ %tmp222 = getelementptr inbounds float* %tmp221, i64 1
+ %tmp223 = getelementptr inbounds float* %tmp222, i64 1
+ %tmp224 = getelementptr inbounds float* %tmp223, i64 1
+ %tmp225 = getelementptr inbounds float* %tmp224, i64 1
+ %tmp226 = getelementptr inbounds float* %tmp225, i64 1
+ %tmp227 = getelementptr inbounds float* %tmp226, i64 1
+ %tmp228 = getelementptr inbounds float* %tmp227, i64 1
+ %tmp229 = getelementptr inbounds float* %tmp228, i64 1
+ %tmp230 = getelementptr inbounds float* %tmp229, i64 1
+ %tmp231 = getelementptr inbounds float* %tmp230, i64 1
+ %tmp232 = getelementptr inbounds float* %tmp231, i64 1
+ %tmp233 = getelementptr inbounds float* %tmp232, i64 1
+ %tmp234 = getelementptr inbounds float* %tmp233, i64 1
+ %tmp235 = getelementptr inbounds float* %tmp234, i64 1
+ %tmp236 = getelementptr inbounds float* %tmp235, i64 1
+ %tmp237 = getelementptr inbounds float* %tmp236, i64 1
+ %tmp238 = getelementptr inbounds float* %tmp237, i64 1
+ %tmp239 = getelementptr inbounds float* %tmp238, i64 1
+ %tmp240 = getelementptr inbounds float* %tmp239, i64 1
+ %tmp241 = getelementptr inbounds float* %tmp240, i64 1
+ %tmp242 = getelementptr inbounds float* %tmp241, i64 1
+ %tmp243 = getelementptr inbounds float* %tmp242, i64 1
+ %tmp244 = getelementptr inbounds float* %tmp243, i64 1
+ %tmp245 = getelementptr inbounds float* %tmp244, i64 1
+ %tmp246 = getelementptr inbounds float* %tmp245, i64 1
+ %tmp247 = getelementptr inbounds float* %tmp246, i64 1
+ %tmp248 = getelementptr inbounds float* %tmp247, i64 1
+ %tmp249 = getelementptr inbounds float* %tmp248, i64 1
+ %tmp250 = getelementptr inbounds float* %tmp249, i64 1
+ %tmp251 = getelementptr inbounds float* %tmp250, i64 1
+ %tmp252 = getelementptr inbounds float* %tmp251, i64 1
+ %tmp253 = getelementptr inbounds float* %tmp252, i64 1
+ %tmp254 = getelementptr inbounds float* %tmp253, i64 1
+ %tmp255 = getelementptr inbounds float* %tmp254, i64 1
+ %tmp256 = getelementptr inbounds float* %tmp255, i64 1
+ %tmp257 = getelementptr inbounds float* %tmp256, i64 1
+ %tmp258 = getelementptr inbounds float* %tmp257, i64 1
+ %tmp259 = getelementptr inbounds float* %tmp258, i64 1
+ %tmp260 = getelementptr inbounds float* %tmp259, i64 1
+ %tmp261 = getelementptr inbounds float* %tmp260, i64 1
+ %tmp262 = getelementptr inbounds float* %tmp261, i64 1
+ %tmp263 = getelementptr inbounds float* %tmp262, i64 1
+ %tmp264 = getelementptr inbounds float* %tmp263, i64 1
+ %tmp265 = getelementptr inbounds float* %tmp264, i64 1
+ %tmp266 = getelementptr inbounds float* %tmp265, i64 1
+ %tmp267 = getelementptr inbounds float* %tmp266, i64 1
+ %tmp268 = getelementptr inbounds float* %tmp267, i64 1
+ %tmp269 = getelementptr inbounds float* %tmp268, i64 1
+ %tmp270 = getelementptr inbounds float* %tmp269, i64 1
+ %tmp271 = getelementptr inbounds float* %tmp270, i64 1
+ %tmp272 = getelementptr inbounds float* %tmp271, i64 1
+ %tmp273 = getelementptr inbounds float* %tmp272, i64 1
+ %tmp274 = getelementptr inbounds float* %tmp273, i64 1
+ %tmp275 = getelementptr inbounds float* %tmp274, i64 1
+ %tmp276 = getelementptr inbounds float* %tmp275, i64 1
+ %tmp277 = getelementptr inbounds float* %tmp276, i64 1
+ %tmp278 = getelementptr inbounds float* %tmp277, i64 1
+ %tmp279 = getelementptr inbounds float* %tmp278, i64 1
+ %tmp280 = getelementptr inbounds float* %tmp279, i64 1
+ %tmp281 = getelementptr inbounds float* %tmp280, i64 1
+ %tmp282 = getelementptr inbounds float* %tmp281, i64 1
+ %tmp283 = getelementptr inbounds float* %tmp282, i64 1
+ %tmp284 = getelementptr inbounds float* %tmp283, i64 1
+ %tmp285 = getelementptr inbounds float* %tmp284, i64 1
+ %tmp286 = getelementptr inbounds float* %tmp285, i64 1
+ %tmp287 = getelementptr inbounds float* %tmp286, i64 1
+ %tmp288 = getelementptr inbounds float* %tmp287, i64 1
+ %tmp289 = getelementptr inbounds float* %tmp288, i64 1
+ %tmp290 = getelementptr inbounds float* %tmp289, i64 1
+ %tmp291 = getelementptr inbounds float* %tmp290, i64 1
+ %tmp292 = getelementptr inbounds float* %tmp291, i64 1
+ %tmp293 = getelementptr inbounds float* %tmp292, i64 1
+ %tmp294 = getelementptr inbounds float* %tmp293, i64 1
+ %tmp295 = getelementptr inbounds float* %tmp294, i64 1
+ %tmp296 = getelementptr inbounds float* %tmp295, i64 1
+ %tmp297 = getelementptr inbounds float* %tmp296, i64 1
+ %tmp298 = getelementptr inbounds float* %tmp297, i64 1
+ %tmp299 = getelementptr inbounds float* %tmp298, i64 1
+ %tmp300 = getelementptr inbounds float* %tmp299, i64 1
+ %tmp301 = getelementptr inbounds float* %tmp300, i64 1
+ %tmp302 = getelementptr inbounds float* %tmp301, i64 1
+ %tmp303 = getelementptr inbounds float* %tmp302, i64 1
+ %tmp304 = getelementptr inbounds float* %tmp303, i64 1
+ %tmp305 = getelementptr inbounds float* %tmp304, i64 1
+ %tmp306 = getelementptr inbounds float* %tmp305, i64 1
+ %tmp307 = getelementptr inbounds float* %tmp306, i64 1
+ %tmp308 = getelementptr inbounds float* %tmp307, i64 1
+ %tmp309 = getelementptr inbounds float* %tmp308, i64 1
+ %tmp310 = getelementptr inbounds float* %tmp309, i64 1
+ %tmp311 = getelementptr inbounds float* %tmp310, i64 1
+ %tmp312 = getelementptr inbounds float* %tmp311, i64 1
+ %tmp313 = getelementptr inbounds float* %tmp312, i64 1
+ %tmp314 = getelementptr inbounds float* %tmp313, i64 1
+ %tmp315 = getelementptr inbounds float* %tmp314, i64 1
+ %tmp316 = getelementptr inbounds float* %tmp315, i64 1
+ %tmp317 = getelementptr inbounds float* %tmp316, i64 1
+ %tmp318 = getelementptr inbounds float* %tmp317, i64 1
+ %tmp319 = getelementptr inbounds float* %tmp318, i64 1
+ %tmp320 = getelementptr inbounds float* %tmp319, i64 1
+ %tmp321 = getelementptr inbounds float* %tmp320, i64 1
+ %tmp322 = getelementptr inbounds float* %tmp321, i64 1
+ %tmp323 = getelementptr inbounds float* %tmp322, i64 1
+ %tmp324 = getelementptr inbounds float* %tmp323, i64 1
+ %tmp325 = getelementptr inbounds float* %tmp324, i64 1
+ %tmp326 = getelementptr inbounds float* %tmp325, i64 1
+ %tmp327 = getelementptr inbounds float* %tmp326, i64 1
+ %tmp328 = getelementptr inbounds float* %tmp327, i64 1
+ %tmp329 = getelementptr inbounds float* %tmp328, i64 1
+ %tmp330 = getelementptr inbounds float* %tmp329, i64 1
+ %tmp331 = getelementptr inbounds float* %tmp330, i64 1
+ %tmp332 = getelementptr inbounds float* %tmp331, i64 1
+ %tmp333 = getelementptr inbounds float* %tmp332, i64 1
+ %tmp334 = getelementptr inbounds float* %tmp333, i64 1
+ %tmp335 = getelementptr inbounds float* %tmp334, i64 1
+ %tmp336 = getelementptr inbounds float* %tmp335, i64 1
+ %tmp337 = getelementptr inbounds float* %tmp336, i64 1
+ %tmp338 = getelementptr inbounds float* %tmp337, i64 1
+ %tmp339 = getelementptr inbounds float* %tmp338, i64 1
+ %tmp340 = getelementptr inbounds float* %tmp339, i64 1
+ %tmp341 = getelementptr inbounds float* %tmp340, i64 1
+ %tmp342 = getelementptr inbounds float* %tmp341, i64 1
+ %tmp343 = getelementptr inbounds float* %tmp342, i64 1
+ %tmp344 = getelementptr inbounds float* %tmp343, i64 1
+ %tmp345 = getelementptr inbounds float* %tmp344, i64 1
+ %tmp346 = getelementptr inbounds float* %tmp345, i64 1
+ %tmp347 = getelementptr inbounds float* %tmp346, i64 1
+ %tmp348 = getelementptr inbounds float* %tmp347, i64 1
+ %tmp349 = getelementptr inbounds float* %tmp348, i64 1
+ %tmp350 = getelementptr inbounds float* %tmp349, i64 1
+ %tmp351 = getelementptr inbounds float* %tmp350, i64 1
+ %tmp352 = getelementptr inbounds float* %tmp351, i64 1
+ %tmp353 = getelementptr inbounds float* %tmp352, i64 1
+ %tmp354 = getelementptr inbounds float* %tmp353, i64 1
+ %tmp355 = getelementptr inbounds float* %tmp354, i64 1
+ %tmp356 = getelementptr inbounds float* %tmp355, i64 1
+ %tmp357 = getelementptr inbounds float* %tmp356, i64 1
+ %tmp358 = getelementptr inbounds float* %tmp357, i64 1
+ %tmp359 = getelementptr inbounds float* %tmp358, i64 1
+ %tmp360 = getelementptr inbounds float* %tmp359, i64 1
+ %tmp361 = getelementptr inbounds float* %tmp360, i64 1
+ %tmp362 = getelementptr inbounds float* %tmp361, i64 1
+ %tmp363 = getelementptr inbounds float* %tmp362, i64 1
+ %tmp364 = getelementptr inbounds float* %tmp363, i64 1
+ %tmp365 = getelementptr inbounds float* %tmp364, i64 1
+ %tmp366 = getelementptr inbounds float* %tmp365, i64 1
+ %tmp367 = getelementptr inbounds float* %tmp366, i64 1
+ %tmp368 = getelementptr inbounds float* %tmp367, i64 1
+ %tmp369 = getelementptr inbounds float* %tmp368, i64 1
+ %tmp370 = getelementptr inbounds float* %tmp369, i64 1
+ %tmp371 = getelementptr inbounds float* %tmp370, i64 1
+ %tmp372 = getelementptr inbounds float* %tmp371, i64 1
+ %tmp373 = getelementptr inbounds float* %tmp372, i64 1
+ %tmp374 = getelementptr inbounds float* %tmp373, i64 1
+ %tmp375 = getelementptr inbounds float* %tmp374, i64 1
+ %tmp376 = getelementptr inbounds float* %tmp375, i64 1
+ %tmp377 = getelementptr inbounds float* %tmp376, i64 1
+ %tmp378 = getelementptr inbounds float* %tmp377, i64 1
+ %tmp379 = getelementptr inbounds float* %tmp378, i64 1
+ %tmp380 = getelementptr inbounds float* %tmp379, i64 1
+ %tmp381 = getelementptr inbounds float* %tmp380, i64 1
+ %tmp382 = getelementptr inbounds float* %tmp381, i64 1
+ %tmp383 = getelementptr inbounds float* %tmp382, i64 1
+ %tmp384 = getelementptr inbounds float* %tmp383, i64 1
+ %tmp385 = getelementptr inbounds float* %tmp384, i64 1
+ %tmp386 = getelementptr inbounds float* %tmp385, i64 1
+ %tmp387 = getelementptr inbounds float* %tmp386, i64 1
+ %tmp388 = getelementptr inbounds float* %tmp387, i64 1
+ %tmp389 = getelementptr inbounds float* %tmp388, i64 1
+ %tmp390 = getelementptr inbounds float* %tmp389, i64 1
+ %tmp391 = getelementptr inbounds float* %tmp390, i64 1
+ %tmp392 = getelementptr inbounds float* %tmp391, i64 1
+ %tmp393 = getelementptr inbounds float* %tmp392, i64 1
+ %tmp394 = getelementptr inbounds float* %tmp393, i64 1
+ %tmp395 = getelementptr inbounds float* %tmp394, i64 1
+ %tmp396 = getelementptr inbounds float* %tmp395, i64 1
+ %tmp397 = getelementptr inbounds float* %tmp396, i64 1
+ %tmp398 = getelementptr inbounds float* %tmp397, i64 1
+ %tmp399 = getelementptr inbounds float* %tmp398, i64 1
+ %tmp400 = getelementptr inbounds float* %tmp399, i64 1
+ %tmp401 = getelementptr inbounds float* %tmp400, i64 1
+ %tmp402 = getelementptr inbounds float* %tmp401, i64 1
+ %tmp403 = getelementptr inbounds float* %tmp402, i64 1
+ %tmp404 = getelementptr inbounds float* %tmp403, i64 1
+ %tmp405 = getelementptr inbounds float* %tmp404, i64 1
+ %tmp406 = getelementptr inbounds float* %tmp405, i64 1
+ %tmp407 = getelementptr inbounds float* %tmp406, i64 1
+ %tmp408 = getelementptr inbounds float* %tmp407, i64 1
+ %tmp409 = getelementptr inbounds float* %tmp408, i64 1
+ %tmp410 = getelementptr inbounds float* %tmp409, i64 1
+ %tmp411 = getelementptr inbounds float* %tmp410, i64 1
+ %tmp412 = getelementptr inbounds float* %tmp411, i64 1
+ %tmp413 = getelementptr inbounds float* %tmp412, i64 1
+ %tmp414 = getelementptr inbounds float* %tmp413, i64 1
+ %tmp415 = getelementptr inbounds float* %tmp414, i64 1
+ %tmp416 = getelementptr inbounds float* %tmp415, i64 1
+ %tmp417 = getelementptr inbounds float* %tmp416, i64 1
+ %tmp418 = getelementptr inbounds float* %tmp417, i64 1
+ %tmp419 = getelementptr inbounds float* %tmp418, i64 1
+ %tmp420 = getelementptr inbounds float* %tmp419, i64 1
+ %tmp421 = getelementptr inbounds float* %tmp420, i64 1
+ %tmp422 = getelementptr inbounds float* %tmp421, i64 1
+ %tmp423 = getelementptr inbounds float* %tmp422, i64 1
+ %tmp424 = getelementptr inbounds float* %tmp423, i64 1
+ %tmp425 = getelementptr inbounds float* %tmp424, i64 1
+ %tmp426 = getelementptr inbounds float* %tmp425, i64 1
+ %tmp427 = getelementptr inbounds float* %tmp426, i64 1
+ %tmp428 = getelementptr inbounds float* %tmp427, i64 1
+ %tmp429 = getelementptr inbounds float* %tmp428, i64 1
+ %tmp430 = getelementptr inbounds float* %tmp429, i64 1
+ %tmp431 = getelementptr inbounds float* %tmp430, i64 1
+ %tmp432 = getelementptr inbounds float* %tmp431, i64 1
+ %tmp433 = getelementptr inbounds float* %tmp432, i64 1
+ %tmp434 = getelementptr inbounds float* %tmp433, i64 1
+ %tmp435 = getelementptr inbounds float* %tmp434, i64 1
+ %tmp436 = getelementptr inbounds float* %tmp435, i64 1
+ %tmp437 = getelementptr inbounds float* %tmp436, i64 1
+ %tmp438 = getelementptr inbounds float* %tmp437, i64 1
+ %tmp439 = getelementptr inbounds float* %tmp438, i64 1
+ %tmp440 = getelementptr inbounds float* %tmp439, i64 1
+ %tmp441 = getelementptr inbounds float* %tmp440, i64 1
+ %tmp442 = getelementptr inbounds float* %tmp441, i64 1
+ %tmp443 = getelementptr inbounds float* %tmp442, i64 1
+ %tmp444 = getelementptr inbounds float* %tmp443, i64 1
+ %tmp445 = getelementptr inbounds float* %tmp444, i64 1
+ %tmp446 = getelementptr inbounds float* %tmp445, i64 1
+ %tmp447 = getelementptr inbounds float* %tmp446, i64 1
+ %tmp448 = getelementptr inbounds float* %tmp447, i64 1
+ %tmp449 = getelementptr inbounds float* %tmp448, i64 1
+ %tmp450 = getelementptr inbounds float* %tmp449, i64 1
+ %tmp451 = getelementptr inbounds float* %tmp450, i64 1
+ %tmp452 = getelementptr inbounds float* %tmp451, i64 1
+ %tmp453 = getelementptr inbounds float* %tmp452, i64 1
+ %tmp454 = getelementptr inbounds float* %tmp453, i64 1
+ %tmp455 = getelementptr inbounds float* %tmp454, i64 1
+ %tmp456 = getelementptr inbounds float* %tmp455, i64 1
+ %tmp457 = getelementptr inbounds float* %tmp456, i64 1
+ %tmp458 = getelementptr inbounds float* %tmp457, i64 1
+ %tmp459 = getelementptr inbounds float* %tmp458, i64 1
+ %tmp460 = getelementptr inbounds float* %tmp459, i64 1
+ %tmp461 = getelementptr inbounds float* %tmp460, i64 1
+ %tmp462 = getelementptr inbounds float* %tmp461, i64 1
+ %tmp463 = getelementptr inbounds float* %tmp462, i64 1
+ %tmp464 = getelementptr inbounds float* %tmp463, i64 1
+ %tmp465 = getelementptr inbounds float* %tmp464, i64 1
+ %tmp466 = getelementptr inbounds float* %tmp465, i64 1
+ %tmp467 = getelementptr inbounds float* %tmp466, i64 1
+ %tmp468 = getelementptr inbounds float* %tmp467, i64 1
+ %tmp469 = getelementptr inbounds float* %tmp468, i64 1
+ %tmp470 = getelementptr inbounds float* %tmp469, i64 1
+ %tmp471 = getelementptr inbounds float* %tmp470, i64 1
+ %tmp472 = getelementptr inbounds float* %tmp471, i64 1
+ %tmp473 = getelementptr inbounds float* %tmp472, i64 1
+ %tmp474 = getelementptr inbounds float* %tmp473, i64 1
+ %tmp475 = getelementptr inbounds float* %tmp474, i64 1
+ %tmp476 = getelementptr inbounds float* %tmp475, i64 1
+ %tmp477 = getelementptr inbounds float* %tmp476, i64 1
+ %tmp478 = getelementptr inbounds float* %tmp477, i64 1
+ %tmp479 = getelementptr inbounds float* %tmp478, i64 1
+ %tmp480 = getelementptr inbounds float* %tmp479, i64 1
+ %tmp481 = getelementptr inbounds float* %tmp480, i64 1
+ %tmp482 = getelementptr inbounds float* %tmp481, i64 1
+ %tmp483 = getelementptr inbounds float* %tmp482, i64 1
+ %tmp484 = getelementptr inbounds float* %tmp483, i64 1
+ %tmp485 = getelementptr inbounds float* %tmp484, i64 1
+ %tmp486 = getelementptr inbounds float* %tmp485, i64 1
+ %tmp487 = getelementptr inbounds float* %tmp486, i64 1
+ %tmp488 = getelementptr inbounds float* %tmp487, i64 1
+ %tmp489 = getelementptr inbounds float* %tmp488, i64 1
+ %tmp490 = getelementptr inbounds float* %tmp489, i64 1
+ %tmp491 = getelementptr inbounds float* %tmp490, i64 1
+ %tmp492 = getelementptr inbounds float* %tmp491, i64 1
+ %tmp493 = getelementptr inbounds float* %tmp492, i64 1
+ %tmp494 = getelementptr inbounds float* %tmp493, i64 1
+ %tmp495 = getelementptr inbounds float* %tmp494, i64 1
+ %tmp496 = getelementptr inbounds float* %tmp495, i64 1
+ %tmp497 = getelementptr inbounds float* %tmp496, i64 1
+ %tmp498 = getelementptr inbounds float* %tmp497, i64 1
+ %tmp499 = getelementptr inbounds float* %tmp498, i64 1
+ %tmp500 = getelementptr inbounds float* %tmp499, i64 1
+ %tmp501 = getelementptr inbounds float* %tmp500, i64 1
+ %tmp502 = getelementptr inbounds float* %tmp501, i64 1
+ %tmp503 = getelementptr inbounds float* %tmp502, i64 1
+ %tmp504 = getelementptr inbounds float* %tmp503, i64 1
+ %tmp505 = getelementptr inbounds float* %tmp504, i64 1
+ %tmp506 = getelementptr inbounds float* %tmp505, i64 1
+ %tmp507 = getelementptr inbounds float* %tmp506, i64 1
+ %tmp508 = getelementptr inbounds float* %tmp507, i64 1
+ %tmp509 = getelementptr inbounds float* %tmp508, i64 1
+ %tmp510 = getelementptr inbounds float* %tmp509, i64 1
+ %tmp511 = getelementptr inbounds float* %tmp510, i64 1
+ %tmp512 = getelementptr inbounds float* %tmp511, i64 1
+ %tmp513 = getelementptr inbounds float* %tmp512, i64 1
+ %tmp514 = getelementptr inbounds float* %tmp513, i64 1
+ %tmp515 = getelementptr inbounds float* %tmp514, i64 1
+ %tmp516 = getelementptr inbounds float* %tmp515, i64 1
+ %tmp517 = getelementptr inbounds float* %tmp516, i64 1
+ %tmp518 = getelementptr inbounds float* %tmp517, i64 1
+ %tmp519 = getelementptr inbounds float* %tmp518, i64 1
+ %tmp520 = getelementptr inbounds float* %tmp519, i64 1
+ %tmp521 = getelementptr inbounds float* %tmp520, i64 1
+ %tmp522 = getelementptr inbounds float* %tmp521, i64 1
+ %tmp523 = getelementptr inbounds float* %tmp522, i64 1
+ %tmp524 = getelementptr inbounds float* %tmp523, i64 1
+ %tmp525 = getelementptr inbounds float* %tmp524, i64 1
+ %tmp526 = getelementptr inbounds float* %tmp525, i64 1
+ %tmp527 = getelementptr inbounds float* %tmp526, i64 1
+ %tmp528 = getelementptr inbounds float* %tmp527, i64 1
+ %tmp529 = getelementptr inbounds float* %tmp528, i64 1
+ %tmp530 = getelementptr inbounds float* %tmp529, i64 1
+ %tmp531 = getelementptr inbounds float* %tmp530, i64 1
+ %tmp532 = getelementptr inbounds float* %tmp531, i64 1
+ %tmp533 = getelementptr inbounds float* %tmp532, i64 1
+ %tmp534 = getelementptr inbounds float* %tmp533, i64 1
+ %tmp535 = getelementptr inbounds float* %tmp534, i64 1
+ %tmp536 = getelementptr inbounds float* %tmp535, i64 1
+ %tmp537 = getelementptr inbounds float* %tmp536, i64 1
+ %tmp538 = getelementptr inbounds float* %tmp537, i64 1
+ %tmp539 = getelementptr inbounds float* %tmp538, i64 1
+ %tmp540 = getelementptr inbounds float* %tmp539, i64 1
+ %tmp541 = getelementptr inbounds float* %tmp540, i64 1
+ %tmp542 = getelementptr inbounds float* %tmp541, i64 1
+ %tmp543 = getelementptr inbounds float* %tmp542, i64 1
+ %tmp544 = getelementptr inbounds float* %tmp543, i64 1
+ %tmp545 = getelementptr inbounds float* %tmp544, i64 1
+ %tmp546 = getelementptr inbounds float* %tmp545, i64 1
+ %tmp547 = getelementptr inbounds float* %tmp546, i64 1
+ %tmp548 = getelementptr inbounds float* %tmp547, i64 1
+ %tmp549 = getelementptr inbounds float* %tmp548, i64 1
+ %tmp550 = getelementptr inbounds float* %tmp549, i64 1
+ %tmp551 = getelementptr inbounds float* %tmp550, i64 1
+ %tmp552 = getelementptr inbounds float* %tmp551, i64 1
+ %tmp553 = getelementptr inbounds float* %tmp552, i64 1
+ %tmp554 = getelementptr inbounds float* %tmp553, i64 1
+ %tmp555 = getelementptr inbounds float* %tmp554, i64 1
+ %tmp556 = getelementptr inbounds float* %tmp555, i64 1
+ %tmp557 = getelementptr inbounds float* %tmp556, i64 1
+ %tmp558 = getelementptr inbounds float* %tmp557, i64 1
+ %tmp559 = getelementptr inbounds float* %tmp558, i64 1
+ %tmp560 = getelementptr inbounds float* %tmp559, i64 1
+ %tmp561 = getelementptr inbounds float* %tmp560, i64 1
+ %tmp562 = getelementptr inbounds float* %tmp561, i64 1
+ %tmp563 = getelementptr inbounds float* %tmp562, i64 1
+ %tmp564 = getelementptr inbounds float* %tmp563, i64 1
+ %tmp565 = getelementptr inbounds float* %tmp564, i64 1
+ %tmp566 = getelementptr inbounds float* %tmp565, i64 1
+ %tmp567 = getelementptr inbounds float* %tmp566, i64 1
+ %tmp568 = getelementptr inbounds float* %tmp567, i64 1
+ %tmp569 = getelementptr inbounds float* %tmp568, i64 1
+ %tmp570 = getelementptr inbounds float* %tmp569, i64 1
+ %tmp571 = getelementptr inbounds float* %tmp570, i64 1
+ %tmp572 = getelementptr inbounds float* %tmp571, i64 1
+ %tmp573 = getelementptr inbounds float* %tmp572, i64 1
+ %tmp574 = getelementptr inbounds float* %tmp573, i64 1
+ %tmp575 = getelementptr inbounds float* %tmp574, i64 1
+ %tmp576 = getelementptr inbounds float* %tmp575, i64 1
+ %tmp577 = getelementptr inbounds float* %tmp576, i64 1
+ %tmp578 = getelementptr inbounds float* %tmp577, i64 1
+ %tmp579 = getelementptr inbounds float* %tmp578, i64 1
+ %tmp580 = getelementptr inbounds float* %tmp579, i64 1
+ %tmp581 = getelementptr inbounds float* %tmp580, i64 1
+ %tmp582 = getelementptr inbounds float* %tmp581, i64 1
+ %tmp583 = getelementptr inbounds float* %tmp582, i64 1
+ %tmp584 = getelementptr inbounds float* %tmp583, i64 1
+ %tmp585 = getelementptr inbounds float* %tmp584, i64 1
+ %tmp586 = getelementptr inbounds float* %tmp585, i64 1
+ %tmp587 = getelementptr inbounds float* %tmp586, i64 1
+ %tmp588 = getelementptr inbounds float* %tmp587, i64 1
+ %tmp589 = getelementptr inbounds float* %tmp588, i64 1
+ %tmp590 = getelementptr inbounds float* %tmp589, i64 1
+ %tmp591 = getelementptr inbounds float* %tmp590, i64 1
+ %tmp592 = getelementptr inbounds float* %tmp591, i64 1
+ %tmp593 = getelementptr inbounds float* %tmp592, i64 1
+ %tmp594 = getelementptr inbounds float* %tmp593, i64 1
+ %tmp595 = getelementptr inbounds float* %tmp594, i64 1
+ %tmp596 = getelementptr inbounds float* %tmp595, i64 1
+ %tmp597 = getelementptr inbounds float* %tmp596, i64 1
+ %tmp598 = getelementptr inbounds float* %tmp597, i64 1
+ %tmp599 = getelementptr inbounds float* %tmp598, i64 1
+ %tmp600 = getelementptr inbounds float* %tmp599, i64 1
+ %tmp601 = getelementptr inbounds float* %tmp600, i64 1
+ %tmp602 = getelementptr inbounds float* %tmp601, i64 1
+ %tmp603 = getelementptr inbounds float* %tmp602, i64 1
+ %tmp604 = getelementptr inbounds float* %tmp603, i64 1
+ %tmp605 = getelementptr inbounds float* %tmp604, i64 1
+ %tmp606 = getelementptr inbounds float* %tmp605, i64 1
+ %tmp607 = getelementptr inbounds float* %tmp606, i64 1
+ %tmp608 = getelementptr inbounds float* %tmp607, i64 1
+ %tmp609 = getelementptr inbounds float* %tmp608, i64 1
+ %tmp610 = getelementptr inbounds float* %tmp609, i64 1
+ %tmp611 = getelementptr inbounds float* %tmp610, i64 1
+ %tmp612 = getelementptr inbounds float* %tmp611, i64 1
+ %tmp613 = getelementptr inbounds float* %tmp612, i64 1
+ %tmp614 = getelementptr inbounds float* %tmp613, i64 1
+ %tmp615 = getelementptr inbounds float* %tmp614, i64 1
+ %tmp616 = getelementptr inbounds float* %tmp615, i64 1
+ %tmp617 = getelementptr inbounds float* %tmp616, i64 1
+ %tmp618 = getelementptr inbounds float* %tmp617, i64 1
+ %tmp619 = getelementptr inbounds float* %tmp618, i64 1
+ %tmp620 = getelementptr inbounds float* %tmp619, i64 1
+ %tmp621 = getelementptr inbounds float* %tmp620, i64 1
+ %tmp622 = getelementptr inbounds float* %tmp621, i64 1
+ %tmp623 = getelementptr inbounds float* %tmp622, i64 1
+ %tmp624 = getelementptr inbounds float* %tmp623, i64 1
+ %tmp625 = getelementptr inbounds float* %tmp624, i64 1
+ %tmp626 = getelementptr inbounds float* %tmp625, i64 1
+ %tmp627 = getelementptr inbounds float* %tmp626, i64 1
+ %tmp628 = getelementptr inbounds float* %tmp627, i64 1
+ %tmp629 = getelementptr inbounds float* %tmp628, i64 1
+ %tmp630 = getelementptr inbounds float* %tmp629, i64 1
+ %tmp631 = getelementptr inbounds float* %tmp630, i64 1
+ %tmp632 = getelementptr inbounds float* %tmp631, i64 1
+ %tmp633 = getelementptr inbounds float* %tmp632, i64 1
+ %tmp634 = getelementptr inbounds float* %tmp633, i64 1
+ %tmp635 = getelementptr inbounds float* %tmp634, i64 1
+ %tmp636 = getelementptr inbounds float* %tmp635, i64 1
+ %tmp637 = getelementptr inbounds float* %tmp636, i64 1
+ %tmp638 = getelementptr inbounds float* %tmp637, i64 1
+ %tmp639 = getelementptr inbounds float* %tmp638, i64 1
+ %tmp640 = getelementptr inbounds float* %tmp639, i64 1
+ %tmp641 = getelementptr inbounds float* %tmp640, i64 1
+ %tmp642 = getelementptr inbounds float* %tmp641, i64 1
+ %tmp643 = getelementptr inbounds float* %tmp642, i64 1
+ %tmp644 = getelementptr inbounds float* %tmp643, i64 1
+ %tmp645 = getelementptr inbounds float* %tmp644, i64 1
+ %tmp646 = getelementptr inbounds float* %tmp645, i64 1
+ %tmp647 = getelementptr inbounds float* %tmp646, i64 1
+ %tmp648 = getelementptr inbounds float* %tmp647, i64 1
+ %tmp649 = getelementptr inbounds float* %tmp648, i64 1
+ %tmp650 = getelementptr inbounds float* %tmp649, i64 1
+ %tmp651 = getelementptr inbounds float* %tmp650, i64 1
+ %tmp652 = getelementptr inbounds float* %tmp651, i64 1
+ %tmp653 = getelementptr inbounds float* %tmp652, i64 1
+ %tmp654 = getelementptr inbounds float* %tmp653, i64 1
+ %tmp655 = getelementptr inbounds float* %tmp654, i64 1
+ %tmp656 = getelementptr inbounds float* %tmp655, i64 1
+ %tmp657 = getelementptr inbounds float* %tmp656, i64 1
+ %tmp658 = getelementptr inbounds float* %tmp657, i64 1
+ %tmp659 = getelementptr inbounds float* %tmp658, i64 1
+ %tmp660 = getelementptr inbounds float* %tmp659, i64 1
+ %tmp661 = getelementptr inbounds float* %tmp660, i64 1
+ %tmp662 = getelementptr inbounds float* %tmp661, i64 1
+ %tmp663 = getelementptr inbounds float* %tmp662, i64 1
+ %tmp664 = getelementptr inbounds float* %tmp663, i64 1
+ %tmp665 = getelementptr inbounds float* %tmp664, i64 1
+ %tmp666 = getelementptr inbounds float* %tmp665, i64 1
+ %tmp667 = getelementptr inbounds float* %tmp666, i64 1
+ %tmp668 = getelementptr inbounds float* %tmp667, i64 1
+ %tmp669 = getelementptr inbounds float* %tmp668, i64 1
+ %tmp670 = getelementptr inbounds float* %tmp669, i64 1
+ %tmp671 = getelementptr inbounds float* %tmp670, i64 1
+ %tmp672 = getelementptr inbounds float* %tmp671, i64 1
+ %tmp673 = getelementptr inbounds float* %tmp672, i64 1
+ %tmp674 = getelementptr inbounds float* %tmp673, i64 1
+ %tmp675 = getelementptr inbounds float* %tmp674, i64 1
+ %tmp676 = getelementptr inbounds float* %tmp675, i64 1
+ %tmp677 = getelementptr inbounds float* %tmp676, i64 1
+ %tmp678 = getelementptr inbounds float* %tmp677, i64 1
+ %tmp679 = getelementptr inbounds float* %tmp678, i64 1
+ %tmp680 = getelementptr inbounds float* %tmp679, i64 1
+ %tmp681 = getelementptr inbounds float* %tmp680, i64 1
+ %tmp682 = getelementptr inbounds float* %tmp681, i64 1
+ %tmp683 = getelementptr inbounds float* %tmp682, i64 1
+ %tmp684 = getelementptr inbounds float* %tmp683, i64 1
+ %tmp685 = getelementptr inbounds float* %tmp684, i64 1
+ %tmp686 = getelementptr inbounds float* %tmp685, i64 1
+ %tmp687 = getelementptr inbounds float* %tmp686, i64 1
+ %tmp688 = getelementptr inbounds float* %tmp687, i64 1
+ %tmp689 = getelementptr inbounds float* %tmp688, i64 1
+ %tmp690 = getelementptr inbounds float* %tmp689, i64 1
+ %tmp691 = getelementptr inbounds float* %tmp690, i64 1
+ %tmp692 = getelementptr inbounds float* %tmp691, i64 1
+ %tmp693 = getelementptr inbounds float* %tmp692, i64 1
+ %tmp694 = getelementptr inbounds float* %tmp693, i64 1
+ %tmp695 = getelementptr inbounds float* %tmp694, i64 1
+ %tmp696 = getelementptr inbounds float* %tmp695, i64 1
+ %tmp697 = getelementptr inbounds float* %tmp696, i64 1
+ %tmp698 = getelementptr inbounds float* %tmp697, i64 1
+ %tmp699 = getelementptr inbounds float* %tmp698, i64 1
+ %tmp700 = getelementptr inbounds float* %tmp699, i64 1
+ %tmp701 = getelementptr inbounds float* %tmp700, i64 1
+ %tmp702 = getelementptr inbounds float* %tmp701, i64 1
+ %tmp703 = getelementptr inbounds float* %tmp702, i64 1
+ %tmp704 = getelementptr inbounds float* %tmp703, i64 1
+ %tmp705 = getelementptr inbounds float* %tmp704, i64 1
+ %tmp706 = getelementptr inbounds float* %tmp705, i64 1
+ %tmp707 = getelementptr inbounds float* %tmp706, i64 1
+ %tmp708 = getelementptr inbounds float* %tmp707, i64 1
+ %tmp709 = getelementptr inbounds float* %tmp708, i64 1
+ %tmp710 = getelementptr inbounds float* %tmp709, i64 1
+ %tmp711 = getelementptr inbounds float* %tmp710, i64 1
+ %tmp712 = getelementptr inbounds float* %tmp711, i64 1
+ %tmp713 = getelementptr inbounds float* %tmp712, i64 1
+ %tmp714 = getelementptr inbounds float* %tmp713, i64 1
+ %tmp715 = getelementptr inbounds float* %tmp714, i64 1
+ %tmp716 = getelementptr inbounds float* %tmp715, i64 1
+ %tmp717 = getelementptr inbounds float* %tmp716, i64 1
+ %tmp718 = getelementptr inbounds float* %tmp717, i64 1
+ %tmp719 = getelementptr inbounds float* %tmp718, i64 1
+ %tmp720 = getelementptr inbounds float* %tmp719, i64 1
+ %tmp721 = getelementptr inbounds float* %tmp720, i64 1
+ %tmp722 = getelementptr inbounds float* %tmp721, i64 1
+ %tmp723 = getelementptr inbounds float* %tmp722, i64 1
+ %tmp724 = getelementptr inbounds float* %tmp723, i64 1
+ %tmp725 = getelementptr inbounds float* %tmp724, i64 1
+ %tmp726 = getelementptr inbounds float* %tmp725, i64 1
+ %tmp727 = getelementptr inbounds float* %tmp726, i64 1
+ %tmp728 = getelementptr inbounds float* %tmp727, i64 1
+ %tmp729 = getelementptr inbounds float* %tmp728, i64 1
+ %tmp730 = getelementptr inbounds float* %tmp729, i64 1
+ %tmp731 = getelementptr inbounds float* %tmp730, i64 1
+ %tmp732 = getelementptr inbounds float* %tmp731, i64 1
+ %tmp733 = getelementptr inbounds float* %tmp732, i64 1
+ %tmp734 = getelementptr inbounds float* %tmp733, i64 1
+ %tmp735 = getelementptr inbounds float* %tmp734, i64 1
+ %tmp736 = getelementptr inbounds float* %tmp735, i64 1
+ %tmp737 = getelementptr inbounds float* %tmp736, i64 1
+ %tmp738 = getelementptr inbounds float* %tmp737, i64 1
+ %tmp739 = getelementptr inbounds float* %tmp738, i64 1
+ %tmp740 = getelementptr inbounds float* %tmp739, i64 1
+ %tmp741 = getelementptr inbounds float* %tmp740, i64 1
+ %tmp742 = getelementptr inbounds float* %tmp741, i64 1
+ %tmp743 = getelementptr inbounds float* %tmp742, i64 1
+ %tmp744 = getelementptr inbounds float* %tmp743, i64 1
+ %tmp745 = getelementptr inbounds float* %tmp744, i64 1
+ %tmp746 = getelementptr inbounds float* %tmp745, i64 1
+ %tmp747 = getelementptr inbounds float* %tmp746, i64 1
+ %tmp748 = getelementptr inbounds float* %tmp747, i64 1
+ %tmp749 = getelementptr inbounds float* %tmp748, i64 1
+ %tmp750 = getelementptr inbounds float* %tmp749, i64 1
+ %tmp751 = getelementptr inbounds float* %tmp750, i64 1
+ %tmp752 = getelementptr inbounds float* %tmp751, i64 1
+ %tmp753 = getelementptr inbounds float* %tmp752, i64 1
+ %tmp754 = getelementptr inbounds float* %tmp753, i64 1
+ %tmp755 = getelementptr inbounds float* %tmp754, i64 1
+ %tmp756 = getelementptr inbounds float* %tmp755, i64 1
+ %tmp757 = getelementptr inbounds float* %tmp756, i64 1
+ %tmp758 = getelementptr inbounds float* %tmp757, i64 1
+ %tmp759 = getelementptr inbounds float* %tmp758, i64 1
+ %tmp760 = getelementptr inbounds float* %tmp759, i64 1
+ %tmp761 = getelementptr inbounds float* %tmp760, i64 1
+ %tmp762 = getelementptr inbounds float* %tmp761, i64 1
+ %tmp763 = getelementptr inbounds float* %tmp762, i64 1
+ %tmp764 = getelementptr inbounds float* %tmp763, i64 1
+ %tmp765 = getelementptr inbounds float* %tmp764, i64 1
+ %tmp766 = getelementptr inbounds float* %tmp765, i64 1
+ %tmp767 = getelementptr inbounds float* %tmp766, i64 1
+ %tmp768 = getelementptr inbounds float* %tmp767, i64 1
+ %tmp769 = getelementptr inbounds float* %tmp768, i64 1
+ %tmp770 = getelementptr inbounds float* %tmp769, i64 1
+ %tmp771 = getelementptr inbounds float* %tmp770, i64 1
+ %tmp772 = getelementptr inbounds float* %tmp771, i64 1
+ %tmp773 = getelementptr inbounds float* %tmp772, i64 1
+ %tmp774 = getelementptr inbounds float* %tmp773, i64 1
+ %tmp775 = getelementptr inbounds float* %tmp774, i64 1
+ %tmp776 = getelementptr inbounds float* %tmp775, i64 1
+ %tmp777 = getelementptr inbounds float* %tmp776, i64 1
+ %tmp778 = getelementptr inbounds float* %tmp777, i64 1
+ %tmp779 = getelementptr inbounds float* %tmp778, i64 1
+ %tmp780 = getelementptr inbounds float* %tmp779, i64 1
+ %tmp781 = getelementptr inbounds float* %tmp780, i64 1
+ %tmp782 = getelementptr inbounds float* %tmp781, i64 1
+ %tmp783 = getelementptr inbounds float* %tmp782, i64 1
+ %tmp784 = getelementptr inbounds float* %tmp783, i64 1
+ %tmp785 = getelementptr inbounds float* %tmp784, i64 1
+ %tmp786 = getelementptr inbounds float* %tmp785, i64 1
+ %tmp787 = getelementptr inbounds float* %tmp786, i64 1
+ %tmp788 = getelementptr inbounds float* %tmp787, i64 1
+ %tmp789 = getelementptr inbounds float* %tmp788, i64 1
+ %tmp790 = getelementptr inbounds float* %tmp789, i64 1
+ %tmp791 = getelementptr inbounds float* %tmp790, i64 1
+ %tmp792 = getelementptr inbounds float* %tmp791, i64 1
+ %tmp793 = getelementptr inbounds float* %tmp792, i64 1
+ %tmp794 = getelementptr inbounds float* %tmp793, i64 1
+ %tmp795 = getelementptr inbounds float* %tmp794, i64 1
+ %tmp796 = getelementptr inbounds float* %tmp795, i64 1
+ %tmp797 = getelementptr inbounds float* %tmp796, i64 1
+ %tmp798 = getelementptr inbounds float* %tmp797, i64 1
+ %tmp799 = getelementptr inbounds float* %tmp798, i64 1
+ %tmp800 = getelementptr inbounds float* %tmp799, i64 1
+ %tmp801 = getelementptr inbounds float* %tmp800, i64 1
+ %tmp802 = getelementptr inbounds float* %tmp801, i64 1
+ %tmp803 = getelementptr inbounds float* %tmp802, i64 1
+ %tmp804 = getelementptr inbounds float* %tmp803, i64 1
+ %tmp805 = getelementptr inbounds float* %tmp804, i64 1
+ %tmp806 = getelementptr inbounds float* %tmp805, i64 1
+ %tmp807 = getelementptr inbounds float* %tmp806, i64 1
+ %tmp808 = getelementptr inbounds float* %tmp807, i64 1
+ %tmp809 = getelementptr inbounds float* %tmp808, i64 1
+ %tmp810 = getelementptr inbounds float* %tmp809, i64 1
+ %tmp811 = getelementptr inbounds float* %tmp810, i64 1
+ %tmp812 = getelementptr inbounds float* %tmp811, i64 1
+ %tmp813 = getelementptr inbounds float* %tmp812, i64 1
+ %tmp814 = getelementptr inbounds float* %tmp813, i64 1
+ %tmp815 = getelementptr inbounds float* %tmp814, i64 1
+ %tmp816 = getelementptr inbounds float* %tmp815, i64 1
+ %tmp817 = getelementptr inbounds float* %tmp816, i64 1
+ %tmp818 = getelementptr inbounds float* %tmp817, i64 1
+ %tmp819 = getelementptr inbounds float* %tmp818, i64 1
+ %tmp820 = getelementptr inbounds float* %tmp819, i64 1
+ %tmp821 = getelementptr inbounds float* %tmp820, i64 1
+ %tmp822 = getelementptr inbounds float* %tmp821, i64 1
+ %tmp823 = getelementptr inbounds float* %tmp822, i64 1
+ %tmp824 = getelementptr inbounds float* %tmp823, i64 1
+ %tmp825 = getelementptr inbounds float* %tmp824, i64 1
+ %tmp826 = getelementptr inbounds float* %tmp825, i64 1
+ %tmp827 = getelementptr inbounds float* %tmp826, i64 1
+ %tmp828 = getelementptr inbounds float* %tmp827, i64 1
+ %tmp829 = getelementptr inbounds float* %tmp828, i64 1
+ %tmp830 = getelementptr inbounds float* %tmp829, i64 1
+ %tmp831 = getelementptr inbounds float* %tmp830, i64 1
+ %tmp832 = getelementptr inbounds float* %tmp831, i64 1
+ %tmp833 = getelementptr inbounds float* %tmp832, i64 1
+ %tmp834 = getelementptr inbounds float* %tmp833, i64 1
+ %tmp835 = getelementptr inbounds float* %tmp834, i64 1
+ %tmp836 = getelementptr inbounds float* %tmp835, i64 1
+ %tmp837 = getelementptr inbounds float* %tmp836, i64 1
+ %tmp838 = getelementptr inbounds float* %tmp837, i64 1
+ %tmp839 = getelementptr inbounds float* %tmp838, i64 1
+ %tmp840 = getelementptr inbounds float* %tmp839, i64 1
+ %tmp841 = getelementptr inbounds float* %tmp840, i64 1
+ %tmp842 = getelementptr inbounds float* %tmp841, i64 1
+ %tmp843 = getelementptr inbounds float* %tmp842, i64 1
+ %tmp844 = getelementptr inbounds float* %tmp843, i64 1
+ %tmp845 = getelementptr inbounds float* %tmp844, i64 1
+ %tmp846 = getelementptr inbounds float* %tmp845, i64 1
+ %tmp847 = getelementptr inbounds float* %tmp846, i64 1
+ %tmp848 = getelementptr inbounds float* %tmp847, i64 1
+ %tmp849 = getelementptr inbounds float* %tmp848, i64 1
+ %tmp850 = getelementptr inbounds float* %tmp849, i64 1
+ %tmp851 = getelementptr inbounds float* %tmp850, i64 1
+ %tmp852 = getelementptr inbounds float* %tmp851, i64 1
+ %tmp853 = getelementptr inbounds float* %tmp852, i64 1
+ %tmp854 = getelementptr inbounds float* %tmp853, i64 1
+ %tmp855 = getelementptr inbounds float* %tmp854, i64 1
+ %tmp856 = getelementptr inbounds float* %tmp855, i64 1
+ %tmp857 = getelementptr inbounds float* %tmp856, i64 1
+ %tmp858 = getelementptr inbounds float* %tmp857, i64 1
+ %tmp859 = getelementptr inbounds float* %tmp858, i64 1
+ %tmp860 = getelementptr inbounds float* %tmp859, i64 1
+ %tmp861 = getelementptr inbounds float* %tmp860, i64 1
+ %tmp862 = getelementptr inbounds float* %tmp861, i64 1
+ %tmp863 = getelementptr inbounds float* %tmp862, i64 1
+ %tmp864 = getelementptr inbounds float* %tmp863, i64 1
+ %tmp865 = getelementptr inbounds float* %tmp864, i64 1
+ %tmp866 = getelementptr inbounds float* %tmp865, i64 1
+ %tmp867 = getelementptr inbounds float* %tmp866, i64 1
+ %tmp868 = getelementptr inbounds float* %tmp867, i64 1
+ %tmp869 = getelementptr inbounds float* %tmp868, i64 1
+ %tmp870 = getelementptr inbounds float* %tmp869, i64 1
+ %tmp871 = getelementptr inbounds float* %tmp870, i64 1
+ %tmp872 = getelementptr inbounds float* %tmp871, i64 1
+ %tmp873 = getelementptr inbounds float* %tmp872, i64 1
+ %tmp874 = getelementptr inbounds float* %tmp873, i64 1
+ %tmp875 = getelementptr inbounds float* %tmp874, i64 1
+ %tmp876 = getelementptr inbounds float* %tmp875, i64 1
+ %tmp877 = getelementptr inbounds float* %tmp876, i64 1
+ %tmp878 = getelementptr inbounds float* %tmp877, i64 1
+ %tmp879 = getelementptr inbounds float* %tmp878, i64 1
+ %tmp880 = getelementptr inbounds float* %tmp879, i64 1
+ %tmp881 = getelementptr inbounds float* %tmp880, i64 1
+ %tmp882 = getelementptr inbounds float* %tmp881, i64 1
+ %tmp883 = getelementptr inbounds float* %tmp882, i64 1
+ %tmp884 = getelementptr inbounds float* %tmp883, i64 1
+ %tmp885 = getelementptr inbounds float* %tmp884, i64 1
+ %tmp886 = getelementptr inbounds float* %tmp885, i64 1
+ %tmp887 = getelementptr inbounds float* %tmp886, i64 1
+ %tmp888 = getelementptr inbounds float* %tmp887, i64 1
+ %tmp889 = getelementptr inbounds float* %tmp888, i64 1
+ %tmp890 = getelementptr inbounds float* %tmp889, i64 1
+ %tmp891 = getelementptr inbounds float* %tmp890, i64 1
+ %tmp892 = getelementptr inbounds float* %tmp891, i64 1
+ %tmp893 = getelementptr inbounds float* %tmp892, i64 1
+ %tmp894 = getelementptr inbounds float* %tmp893, i64 1
+ %tmp895 = getelementptr inbounds float* %tmp894, i64 1
+ %tmp896 = getelementptr inbounds float* %tmp895, i64 1
+ %tmp897 = getelementptr inbounds float* %tmp896, i64 1
+ %tmp898 = getelementptr inbounds float* %tmp897, i64 1
+ %tmp899 = getelementptr inbounds float* %tmp898, i64 1
+ %tmp900 = getelementptr inbounds float* %tmp899, i64 1
+ %tmp901 = getelementptr inbounds float* %tmp900, i64 1
+ %tmp902 = getelementptr inbounds float* %tmp901, i64 1
+ %tmp903 = getelementptr inbounds float* %tmp902, i64 1
+ %tmp904 = getelementptr inbounds float* %tmp903, i64 1
+ %tmp905 = getelementptr inbounds float* %tmp904, i64 1
+ %tmp906 = getelementptr inbounds float* %tmp905, i64 1
+ %tmp907 = getelementptr inbounds float* %tmp906, i64 1
+ %tmp908 = getelementptr inbounds float* %tmp907, i64 1
+ %tmp909 = getelementptr inbounds float* %tmp908, i64 1
+ %tmp910 = getelementptr inbounds float* %tmp909, i64 1
+ %tmp911 = getelementptr inbounds float* %tmp910, i64 1
+ %tmp912 = getelementptr inbounds float* %tmp911, i64 1
+ %tmp913 = getelementptr inbounds float* %tmp912, i64 1
+ %tmp914 = getelementptr inbounds float* %tmp913, i64 1
+ %tmp915 = getelementptr inbounds float* %tmp914, i64 1
+ %tmp916 = getelementptr inbounds float* %tmp915, i64 1
+ %tmp917 = getelementptr inbounds float* %tmp916, i64 1
+ %tmp918 = getelementptr inbounds float* %tmp917, i64 1
+ %tmp919 = getelementptr inbounds float* %tmp918, i64 1
+ %tmp920 = getelementptr inbounds float* %tmp919, i64 1
+ %tmp921 = getelementptr inbounds float* %tmp920, i64 1
+ %tmp922 = getelementptr inbounds float* %tmp921, i64 1
+ %tmp923 = getelementptr inbounds float* %tmp922, i64 1
+ %tmp924 = getelementptr inbounds float* %tmp923, i64 1
+ %tmp925 = getelementptr inbounds float* %tmp924, i64 1
+ %tmp926 = getelementptr inbounds float* %tmp925, i64 1
+ %tmp927 = getelementptr inbounds float* %tmp926, i64 1
+ %tmp928 = getelementptr inbounds float* %tmp927, i64 1
+ %tmp929 = getelementptr inbounds float* %tmp928, i64 1
+ %tmp930 = getelementptr inbounds float* %tmp929, i64 1
+ %tmp931 = getelementptr inbounds float* %tmp930, i64 1
+ %tmp932 = getelementptr inbounds float* %tmp931, i64 1
+ %tmp933 = getelementptr inbounds float* %tmp932, i64 1
+ %tmp934 = getelementptr inbounds float* %tmp933, i64 1
+ %tmp935 = getelementptr inbounds float* %tmp934, i64 1
+ %tmp936 = getelementptr inbounds float* %tmp935, i64 1
+ %tmp937 = getelementptr inbounds float* %tmp936, i64 1
+ %tmp938 = getelementptr inbounds float* %tmp937, i64 1
+ %tmp939 = getelementptr inbounds float* %tmp938, i64 1
+ %tmp940 = getelementptr inbounds float* %tmp939, i64 1
+ %tmp941 = getelementptr inbounds float* %tmp940, i64 1
+ %tmp942 = getelementptr inbounds float* %tmp941, i64 1
+ %tmp943 = getelementptr inbounds float* %tmp942, i64 1
+ %tmp944 = getelementptr inbounds float* %tmp943, i64 1
+ %tmp945 = getelementptr inbounds float* %tmp944, i64 1
+ %tmp946 = getelementptr inbounds float* %tmp945, i64 1
+ %tmp947 = getelementptr inbounds float* %tmp946, i64 1
+ %tmp948 = getelementptr inbounds float* %tmp947, i64 1
+ %tmp949 = getelementptr inbounds float* %tmp948, i64 1
+ %tmp950 = getelementptr inbounds float* %tmp949, i64 1
+ %tmp951 = getelementptr inbounds float* %tmp950, i64 1
+ %tmp952 = getelementptr inbounds float* %tmp951, i64 1
+ %tmp953 = getelementptr inbounds float* %tmp952, i64 1
+ %tmp954 = getelementptr inbounds float* %tmp953, i64 1
+ %tmp955 = getelementptr inbounds float* %tmp954, i64 1
+ %tmp956 = getelementptr inbounds float* %tmp955, i64 1
+ %tmp957 = getelementptr inbounds float* %tmp956, i64 1
+ %tmp958 = getelementptr inbounds float* %tmp957, i64 1
+ %tmp959 = getelementptr inbounds float* %tmp958, i64 1
+ %tmp960 = getelementptr inbounds float* %tmp959, i64 1
+ %tmp961 = getelementptr inbounds float* %tmp960, i64 1
+ %tmp962 = getelementptr inbounds float* %tmp961, i64 1
+ %tmp963 = getelementptr inbounds float* %tmp962, i64 1
+ %tmp964 = getelementptr inbounds float* %tmp963, i64 1
+ %tmp965 = getelementptr inbounds float* %tmp964, i64 1
+ %tmp966 = getelementptr inbounds float* %tmp965, i64 1
+ %tmp967 = getelementptr inbounds float* %tmp966, i64 1
+ %tmp968 = getelementptr inbounds float* %tmp967, i64 1
+ %tmp969 = getelementptr inbounds float* %tmp968, i64 1
+ %tmp970 = getelementptr inbounds float* %tmp969, i64 1
+ %tmp971 = getelementptr inbounds float* %tmp970, i64 1
+ %tmp972 = getelementptr inbounds float* %tmp971, i64 1
+ %tmp973 = getelementptr inbounds float* %tmp972, i64 1
+ %tmp974 = getelementptr inbounds float* %tmp973, i64 1
+ %tmp975 = getelementptr inbounds float* %tmp974, i64 1
+ %tmp976 = getelementptr inbounds float* %tmp975, i64 1
+ %tmp977 = getelementptr inbounds float* %tmp976, i64 1
+ %tmp978 = getelementptr inbounds float* %tmp977, i64 1
+ %tmp979 = getelementptr inbounds float* %tmp978, i64 1
+ %tmp980 = getelementptr inbounds float* %tmp979, i64 1
+ %tmp981 = getelementptr inbounds float* %tmp980, i64 1
+ %tmp982 = getelementptr inbounds float* %tmp981, i64 1
+ %tmp983 = getelementptr inbounds float* %tmp982, i64 1
+ %tmp984 = getelementptr inbounds float* %tmp983, i64 1
+ %tmp985 = getelementptr inbounds float* %tmp984, i64 1
+ %tmp986 = getelementptr inbounds float* %tmp985, i64 1
+ %tmp987 = getelementptr inbounds float* %tmp986, i64 1
+ %tmp988 = getelementptr inbounds float* %tmp987, i64 1
+ %tmp989 = getelementptr inbounds float* %tmp988, i64 1
+ %tmp990 = getelementptr inbounds float* %tmp989, i64 1
+ %tmp991 = getelementptr inbounds float* %tmp990, i64 1
+ %tmp992 = getelementptr inbounds float* %tmp991, i64 1
+ %tmp993 = getelementptr inbounds float* %tmp992, i64 1
+ %tmp994 = getelementptr inbounds float* %tmp993, i64 1
+ %tmp995 = getelementptr inbounds float* %tmp994, i64 1
+ %tmp996 = getelementptr inbounds float* %tmp995, i64 1
+ %tmp997 = getelementptr inbounds float* %tmp996, i64 1
+ %tmp998 = getelementptr inbounds float* %tmp997, i64 1
+ %tmp999 = getelementptr inbounds float* %tmp998, i64 1
+ %tmp1000 = getelementptr inbounds float* %tmp999, i64 1
+ %tmp1001 = getelementptr inbounds float* %tmp1000, i64 1
+ %tmp1002 = getelementptr inbounds float* %tmp1001, i64 1
+ %tmp1003 = getelementptr inbounds float* %tmp1002, i64 1
+ %tmp1004 = getelementptr inbounds float* %tmp1003, i64 1
+ %tmp1005 = getelementptr inbounds float* %tmp1004, i64 1
+ %tmp1006 = getelementptr inbounds float* %tmp1005, i64 1
+ %tmp1007 = getelementptr inbounds float* %tmp1006, i64 1
+ %tmp1008 = getelementptr inbounds float* %tmp1007, i64 1
+ %tmp1009 = getelementptr inbounds float* %tmp1008, i64 1
+ %tmp1010 = getelementptr inbounds float* %tmp1009, i64 1
+ %tmp1011 = getelementptr inbounds float* %tmp1010, i64 1
+ %tmp1012 = getelementptr inbounds float* %tmp1011, i64 1
+ %tmp1013 = getelementptr inbounds float* %tmp1012, i64 1
+ %tmp1014 = getelementptr inbounds float* %tmp1013, i64 1
+ %tmp1015 = getelementptr inbounds float* %tmp1014, i64 1
+ %tmp1016 = getelementptr inbounds float* %tmp1015, i64 1
+ %tmp1017 = getelementptr inbounds float* %tmp1016, i64 1
+ %tmp1018 = getelementptr inbounds float* %tmp1017, i64 1
+ %tmp1019 = getelementptr inbounds float* %tmp1018, i64 1
+ %tmp1020 = getelementptr inbounds float* %tmp1019, i64 1
+ %tmp1021 = getelementptr inbounds float* %tmp1020, i64 1
+ %tmp1022 = getelementptr inbounds float* %tmp1021, i64 1
+ %tmp1023 = getelementptr inbounds float* %tmp1022, i64 1
+ %tmp1024 = getelementptr inbounds float* %tmp1023, i64 1
+ %tmp1025 = getelementptr inbounds float* %tmp1024, i64 1
+ %tmp1026 = getelementptr inbounds float* %tmp1025, i64 1
+ %tmp1027 = getelementptr inbounds float* %tmp1026, i64 1
+ %tmp1028 = getelementptr inbounds float* %tmp1027, i64 1
+ %tmp1029 = getelementptr inbounds float* %tmp1028, i64 1
+ %tmp1030 = getelementptr inbounds float* %tmp1029, i64 1
+ %tmp1031 = getelementptr inbounds float* %tmp1030, i64 1
+ %tmp1032 = getelementptr inbounds float* %tmp1031, i64 1
+ %tmp1033 = getelementptr inbounds float* %tmp1032, i64 1
+ %tmp1034 = getelementptr inbounds float* %tmp1033, i64 1
+ %tmp1035 = getelementptr inbounds float* %tmp1034, i64 1
+ %tmp1036 = getelementptr inbounds float* %tmp1035, i64 1
+ %tmp1037 = getelementptr inbounds float* %tmp1036, i64 1
+ %tmp1038 = getelementptr inbounds float* %tmp1037, i64 1
+ %tmp1039 = getelementptr inbounds float* %tmp1038, i64 1
+ %tmp1040 = getelementptr inbounds float* %tmp1039, i64 1
+ %tmp1041 = getelementptr inbounds float* %tmp1040, i64 1
+ %tmp1042 = getelementptr inbounds float* %tmp1041, i64 1
+ %tmp1043 = getelementptr inbounds float* %tmp1042, i64 1
+ %tmp1044 = getelementptr inbounds float* %tmp1043, i64 1
+ %tmp1045 = getelementptr inbounds float* %tmp1044, i64 1
+ %tmp1046 = getelementptr inbounds float* %tmp1045, i64 1
+ %tmp1047 = getelementptr inbounds float* %tmp1046, i64 1
+ %tmp1048 = getelementptr inbounds float* %tmp1047, i64 1
+ %tmp1049 = getelementptr inbounds float* %tmp1048, i64 1
+ %tmp1050 = getelementptr inbounds float* %tmp1049, i64 1
+ %tmp1051 = getelementptr inbounds float* %tmp1050, i64 1
+ %tmp1052 = getelementptr inbounds float* %tmp1051, i64 1
+ %tmp1053 = getelementptr inbounds float* %tmp1052, i64 1
+ %tmp1054 = getelementptr inbounds float* %tmp1053, i64 1
+ %tmp1055 = getelementptr inbounds float* %tmp1054, i64 1
+ %tmp1056 = getelementptr inbounds float* %tmp1055, i64 1
+ %tmp1057 = getelementptr inbounds float* %tmp1056, i64 1
+ %tmp1058 = getelementptr inbounds float* %tmp1057, i64 1
+ %tmp1059 = getelementptr inbounds float* %tmp1058, i64 1
+ %tmp1060 = getelementptr inbounds float* %tmp1059, i64 1
+ %tmp1061 = getelementptr inbounds float* %tmp1060, i64 1
+ %tmp1062 = getelementptr inbounds float* %tmp1061, i64 1
+ %tmp1063 = getelementptr inbounds float* %tmp1062, i64 1
+ %tmp1064 = getelementptr inbounds float* %tmp1063, i64 1
+ %tmp1065 = getelementptr inbounds float* %tmp1064, i64 1
+ %tmp1066 = getelementptr inbounds float* %tmp1065, i64 1
+ %tmp1067 = getelementptr inbounds float* %tmp1066, i64 1
+ %tmp1068 = getelementptr inbounds float* %tmp1067, i64 1
+ %tmp1069 = getelementptr inbounds float* %tmp1068, i64 1
+ %tmp1070 = getelementptr inbounds float* %tmp1069, i64 1
+ %tmp1071 = getelementptr inbounds float* %tmp1070, i64 1
+ %tmp1072 = getelementptr inbounds float* %tmp1071, i64 1
+ %tmp1073 = getelementptr inbounds float* %tmp1072, i64 1
+ %tmp1074 = getelementptr inbounds float* %tmp1073, i64 1
+ %tmp1075 = getelementptr inbounds float* %tmp1074, i64 1
+ %tmp1076 = getelementptr inbounds float* %tmp1075, i64 1
+ %tmp1077 = getelementptr inbounds float* %tmp1076, i64 1
+ %tmp1078 = getelementptr inbounds float* %tmp1077, i64 1
+ %tmp1079 = getelementptr inbounds float* %tmp1078, i64 1
+ %tmp1080 = getelementptr inbounds float* %tmp1079, i64 1
+ %tmp1081 = getelementptr inbounds float* %tmp1080, i64 1
+ %tmp1082 = getelementptr inbounds float* %tmp1081, i64 1
+ %tmp1083 = getelementptr inbounds float* %tmp1082, i64 1
+ %tmp1084 = getelementptr inbounds float* %tmp1083, i64 1
+ %tmp1085 = getelementptr inbounds float* %tmp1084, i64 1
+ %tmp1086 = getelementptr inbounds float* %tmp1085, i64 1
+ %tmp1087 = getelementptr inbounds float* %tmp1086, i64 1
+ %tmp1088 = getelementptr inbounds float* %tmp1087, i64 1
+ %tmp1089 = getelementptr inbounds float* %tmp1088, i64 1
+ %tmp1090 = getelementptr inbounds float* %tmp1089, i64 1
+ %tmp1091 = getelementptr inbounds float* %tmp1090, i64 1
+ %tmp1092 = getelementptr inbounds float* %tmp1091, i64 1
+ %tmp1093 = getelementptr inbounds float* %tmp1092, i64 1
+ %tmp1094 = getelementptr inbounds float* %tmp1093, i64 1
+ %tmp1095 = getelementptr inbounds float* %tmp1094, i64 1
+ %tmp1096 = getelementptr inbounds float* %tmp1095, i64 1
+ %tmp1097 = getelementptr inbounds float* %tmp1096, i64 1
+ %tmp1098 = getelementptr inbounds float* %tmp1097, i64 1
+ %tmp1099 = getelementptr inbounds float* %tmp1098, i64 1
+ %tmp1100 = getelementptr inbounds float* %tmp1099, i64 1
+ %tmp1101 = getelementptr inbounds float* %tmp1100, i64 1
+ %tmp1102 = getelementptr inbounds float* %tmp1101, i64 1
+ %tmp1103 = getelementptr inbounds float* %tmp1102, i64 1
+ %tmp1104 = getelementptr inbounds float* %tmp1103, i64 1
+ %tmp1105 = getelementptr inbounds float* %tmp1104, i64 1
+ %tmp1106 = getelementptr inbounds float* %tmp1105, i64 1
+ %tmp1107 = getelementptr inbounds float* %tmp1106, i64 1
+ %tmp1108 = getelementptr inbounds float* %tmp1107, i64 1
+ %tmp1109 = getelementptr inbounds float* %tmp1108, i64 1
+ %tmp1110 = getelementptr inbounds float* %tmp1109, i64 1
+ %tmp1111 = getelementptr inbounds float* %tmp1110, i64 1
+ %tmp1112 = getelementptr inbounds float* %tmp1111, i64 1
+ %tmp1113 = getelementptr inbounds float* %tmp1112, i64 1
+ %tmp1114 = getelementptr inbounds float* %tmp1113, i64 1
+ %tmp1115 = getelementptr inbounds float* %tmp1114, i64 1
+ %tmp1116 = getelementptr inbounds float* %tmp1115, i64 1
+ %tmp1117 = getelementptr inbounds float* %tmp1116, i64 1
+ %tmp1118 = getelementptr inbounds float* %tmp1117, i64 1
+ %tmp1119 = getelementptr inbounds float* %tmp1118, i64 1
+ %tmp1120 = getelementptr inbounds float* %tmp1119, i64 1
+ %tmp1121 = getelementptr inbounds float* %tmp1120, i64 1
+ %tmp1122 = getelementptr inbounds float* %tmp1121, i64 1
+ %tmp1123 = getelementptr inbounds float* %tmp1122, i64 1
+ %tmp1124 = getelementptr inbounds float* %tmp1123, i64 1
+ %tmp1125 = getelementptr inbounds float* %tmp1124, i64 1
+ %tmp1126 = getelementptr inbounds float* %tmp1125, i64 1
+ %tmp1127 = getelementptr inbounds float* %tmp1126, i64 1
+ %tmp1128 = getelementptr inbounds float* %tmp1127, i64 1
+ %tmp1129 = getelementptr inbounds float* %tmp1128, i64 1
+ %tmp1130 = getelementptr inbounds float* %tmp1129, i64 1
+ %tmp1131 = getelementptr inbounds float* %tmp1130, i64 1
+ %tmp1132 = getelementptr inbounds float* %tmp1131, i64 1
+ %tmp1133 = getelementptr inbounds float* %tmp1132, i64 1
+ %tmp1134 = getelementptr inbounds float* %tmp1133, i64 1
+ %tmp1135 = getelementptr inbounds float* %tmp1134, i64 1
+ %tmp1136 = getelementptr inbounds float* %tmp1135, i64 1
+ %tmp1137 = getelementptr inbounds float* %tmp1136, i64 1
+ %tmp1138 = getelementptr inbounds float* %tmp1137, i64 1
+ %tmp1139 = getelementptr inbounds float* %tmp1138, i64 1
+ %tmp1140 = getelementptr inbounds float* %tmp1139, i64 1
+ %tmp1141 = getelementptr inbounds float* %tmp1140, i64 1
+ %tmp1142 = getelementptr inbounds float* %tmp1141, i64 1
+ %tmp1143 = getelementptr inbounds float* %tmp1142, i64 1
+ %tmp1144 = getelementptr inbounds float* %tmp1143, i64 1
+ %tmp1145 = getelementptr inbounds float* %tmp1144, i64 1
+ %tmp1146 = getelementptr inbounds float* %tmp1145, i64 1
+ %tmp1147 = getelementptr inbounds float* %tmp1146, i64 1
+ %tmp1148 = getelementptr inbounds float* %tmp1147, i64 1
+ %tmp1149 = getelementptr inbounds float* %tmp1148, i64 1
+ %tmp1150 = getelementptr inbounds float* %tmp1149, i64 1
+ %tmp1151 = getelementptr inbounds float* %tmp1150, i64 1
+ %tmp1152 = getelementptr inbounds float* %tmp1151, i64 1
+ %tmp1153 = getelementptr inbounds float* %tmp1152, i64 1
+ %tmp1154 = getelementptr inbounds float* %tmp1153, i64 1
+ %tmp1155 = getelementptr inbounds float* %tmp1154, i64 1
+ %tmp1156 = getelementptr inbounds float* %tmp1155, i64 1
+ %tmp1157 = getelementptr inbounds float* %tmp1156, i64 1
+ %tmp1158 = getelementptr inbounds float* %tmp1157, i64 1
+ %tmp1159 = getelementptr inbounds float* %tmp1158, i64 1
+ %tmp1160 = getelementptr inbounds float* %tmp1159, i64 1
+ %tmp1161 = getelementptr inbounds float* %tmp1160, i64 1
+ %tmp1162 = getelementptr inbounds float* %tmp1161, i64 1
+ %tmp1163 = getelementptr inbounds float* %tmp1162, i64 1
+ %tmp1164 = getelementptr inbounds float* %tmp1163, i64 1
+ %tmp1165 = getelementptr inbounds float* %tmp1164, i64 1
+ %tmp1166 = getelementptr inbounds float* %tmp1165, i64 1
+ %tmp1167 = getelementptr inbounds float* %tmp1166, i64 1
+ %tmp1168 = getelementptr inbounds float* %tmp1167, i64 1
+ %tmp1169 = getelementptr inbounds float* %tmp1168, i64 1
+ %tmp1170 = getelementptr inbounds float* %tmp1169, i64 1
+ %tmp1171 = getelementptr inbounds float* %tmp1170, i64 1
+ %tmp1172 = getelementptr inbounds float* %tmp1171, i64 1
+ %tmp1173 = getelementptr inbounds float* %tmp1172, i64 1
+ %tmp1174 = getelementptr inbounds float* %tmp1173, i64 1
+ %tmp1175 = getelementptr inbounds float* %tmp1174, i64 1
+ %tmp1176 = getelementptr inbounds float* %tmp1175, i64 1
+ %tmp1177 = getelementptr inbounds float* %tmp1176, i64 1
+ %tmp1178 = getelementptr inbounds float* %tmp1177, i64 1
+ %tmp1179 = getelementptr inbounds float* %tmp1178, i64 1
+ %tmp1180 = getelementptr inbounds float* %tmp1179, i64 1
+ %tmp1181 = getelementptr inbounds float* %tmp1180, i64 1
+ %tmp1182 = getelementptr inbounds float* %tmp1181, i64 1
+ %tmp1183 = getelementptr inbounds float* %tmp1182, i64 1
+ %tmp1184 = getelementptr inbounds float* %tmp1183, i64 1
+ %tmp1185 = getelementptr inbounds float* %tmp1184, i64 1
+ %tmp1186 = getelementptr inbounds float* %tmp1185, i64 1
+ %tmp1187 = getelementptr inbounds float* %tmp1186, i64 1
+ %tmp1188 = getelementptr inbounds float* %tmp1187, i64 1
+ %tmp1189 = getelementptr inbounds float* %tmp1188, i64 1
+ %tmp1190 = getelementptr inbounds float* %tmp1189, i64 1
+ %tmp1191 = getelementptr inbounds float* %tmp1190, i64 1
+ %tmp1192 = getelementptr inbounds float* %tmp1191, i64 1
+ %tmp1193 = getelementptr inbounds float* %tmp1192, i64 1
+ %tmp1194 = getelementptr inbounds float* %tmp1193, i64 1
+ %tmp1195 = getelementptr inbounds float* %tmp1194, i64 1
+ %tmp1196 = getelementptr inbounds float* %tmp1195, i64 1
+ %tmp1197 = getelementptr inbounds float* %tmp1196, i64 1
+ %tmp1198 = getelementptr inbounds float* %tmp1197, i64 1
+ %tmp1199 = getelementptr inbounds float* %tmp1198, i64 1
+ %tmp1200 = getelementptr inbounds float* %tmp1199, i64 1
+ %tmp1201 = getelementptr inbounds float* %tmp1200, i64 1
+ %tmp1202 = getelementptr inbounds float* %tmp1201, i64 1
+ %tmp1203 = getelementptr inbounds float* %tmp1202, i64 1
+ %tmp1204 = getelementptr inbounds float* %tmp1203, i64 1
+ %tmp1205 = getelementptr inbounds float* %tmp1204, i64 1
+ %tmp1206 = getelementptr inbounds float* %tmp1205, i64 1
+ %tmp1207 = getelementptr inbounds float* %tmp1206, i64 1
+ %tmp1208 = getelementptr inbounds float* %tmp1207, i64 1
+ %tmp1209 = getelementptr inbounds float* %tmp1208, i64 1
+ %tmp1210 = getelementptr inbounds float* %tmp1209, i64 1
+ %tmp1211 = getelementptr inbounds float* %tmp1210, i64 1
+ %tmp1212 = getelementptr inbounds float* %tmp1211, i64 1
+ %tmp1213 = getelementptr inbounds float* %tmp1212, i64 1
+ %tmp1214 = getelementptr inbounds float* %tmp1213, i64 1
+ %tmp1215 = getelementptr inbounds float* %tmp1214, i64 1
+ %tmp1216 = getelementptr inbounds float* %tmp1215, i64 1
+ %tmp1217 = getelementptr inbounds float* %tmp1216, i64 1
+ %tmp1218 = getelementptr inbounds float* %tmp1217, i64 1
+ %tmp1219 = getelementptr inbounds float* %tmp1218, i64 1
+ %tmp1220 = getelementptr inbounds float* %tmp1219, i64 1
+ %tmp1221 = getelementptr inbounds float* %tmp1220, i64 1
+ %tmp1222 = getelementptr inbounds float* %tmp1221, i64 1
+ %tmp1223 = getelementptr inbounds float* %tmp1222, i64 1
+ %tmp1224 = getelementptr inbounds float* %tmp1223, i64 1
+ %tmp1225 = getelementptr inbounds float* %tmp1224, i64 1
+ %tmp1226 = getelementptr inbounds float* %tmp1225, i64 1
+ %tmp1227 = getelementptr inbounds float* %tmp1226, i64 1
+ %tmp1228 = getelementptr inbounds float* %tmp1227, i64 1
+ %tmp1229 = getelementptr inbounds float* %tmp1228, i64 1
+ %tmp1230 = getelementptr inbounds float* %tmp1229, i64 1
+ %tmp1231 = getelementptr inbounds float* %tmp1230, i64 1
+ %tmp1232 = getelementptr inbounds float* %tmp1231, i64 1
+ %tmp1233 = getelementptr inbounds float* %tmp1232, i64 1
+ %tmp1234 = getelementptr inbounds float* %tmp1233, i64 1
+ %tmp1235 = getelementptr inbounds float* %tmp1234, i64 1
+ %tmp1236 = getelementptr inbounds float* %tmp1235, i64 1
+ %tmp1237 = getelementptr inbounds float* %tmp1236, i64 1
+ %tmp1238 = getelementptr inbounds float* %tmp1237, i64 1
+ %tmp1239 = getelementptr inbounds float* %tmp1238, i64 1
+ %tmp1240 = getelementptr inbounds float* %tmp1239, i64 1
+ %tmp1241 = getelementptr inbounds float* %tmp1240, i64 1
+ %tmp1242 = getelementptr inbounds float* %tmp1241, i64 1
+ %tmp1243 = getelementptr inbounds float* %tmp1242, i64 1
+ %tmp1244 = getelementptr inbounds float* %tmp1243, i64 1
+ %tmp1245 = getelementptr inbounds float* %tmp1244, i64 1
+ %tmp1246 = getelementptr inbounds float* %tmp1245, i64 1
+ %tmp1247 = getelementptr inbounds float* %tmp1246, i64 1
+ %tmp1248 = getelementptr inbounds float* %tmp1247, i64 1
+ %tmp1249 = getelementptr inbounds float* %tmp1248, i64 1
+ %tmp1250 = getelementptr inbounds float* %tmp1249, i64 1
+ %tmp1251 = getelementptr inbounds float* %tmp1250, i64 1
+ %tmp1252 = getelementptr inbounds float* %tmp1251, i64 1
+ %tmp1253 = getelementptr inbounds float* %tmp1252, i64 1
+ %tmp1254 = getelementptr inbounds float* %tmp1253, i64 1
+ %tmp1255 = getelementptr inbounds float* %tmp1254, i64 1
+ %tmp1256 = getelementptr inbounds float* %tmp1255, i64 1
+ %tmp1257 = getelementptr inbounds float* %tmp1256, i64 1
+ %tmp1258 = getelementptr inbounds float* %tmp1257, i64 1
+ %tmp1259 = getelementptr inbounds float* %tmp1258, i64 1
+ %tmp1260 = getelementptr inbounds float* %tmp1259, i64 1
+ %tmp1261 = getelementptr inbounds float* %tmp1260, i64 1
+ %tmp1262 = getelementptr inbounds float* %tmp1261, i64 1
+ %tmp1263 = getelementptr inbounds float* %tmp1262, i64 1
+ %tmp1264 = getelementptr inbounds float* %tmp1263, i64 1
+ %tmp1265 = getelementptr inbounds float* %tmp1264, i64 1
+ %tmp1266 = getelementptr inbounds float* %tmp1265, i64 1
+ %tmp1267 = getelementptr inbounds float* %tmp1266, i64 1
+ %tmp1268 = getelementptr inbounds float* %tmp1267, i64 1
+ %tmp1269 = getelementptr inbounds float* %tmp1268, i64 1
+ %tmp1270 = getelementptr inbounds float* %tmp1269, i64 1
+ %tmp1271 = getelementptr inbounds float* %tmp1270, i64 1
+ %tmp1272 = getelementptr inbounds float* %tmp1271, i64 1
+ %tmp1273 = getelementptr inbounds float* %tmp1272, i64 1
+ %tmp1274 = getelementptr inbounds float* %tmp1273, i64 1
+ %tmp1275 = getelementptr inbounds float* %tmp1274, i64 1
+ %tmp1276 = getelementptr inbounds float* %tmp1275, i64 1
+ %tmp1277 = getelementptr inbounds float* %tmp1276, i64 1
+ %tmp1278 = getelementptr inbounds float* %tmp1277, i64 1
+ %tmp1279 = getelementptr inbounds float* %tmp1278, i64 1
+ %tmp1280 = getelementptr inbounds float* %tmp1279, i64 1
+ %tmp1281 = getelementptr inbounds float* %tmp1280, i64 1
+ %tmp1282 = getelementptr inbounds float* %tmp1281, i64 1
+ %tmp1283 = getelementptr inbounds float* %tmp1282, i64 1
+ %tmp1284 = getelementptr inbounds float* %tmp1283, i64 1
+ %tmp1285 = getelementptr inbounds float* %tmp1284, i64 1
+ %tmp1286 = getelementptr inbounds float* %tmp1285, i64 1
+ %tmp1287 = getelementptr inbounds float* %tmp1286, i64 1
+ %tmp1288 = getelementptr inbounds float* %tmp1287, i64 1
+ %tmp1289 = getelementptr inbounds float* %tmp1288, i64 1
+ %tmp1290 = getelementptr inbounds float* %tmp1289, i64 1
+ %tmp1291 = getelementptr inbounds float* %tmp1290, i64 1
+ %tmp1292 = getelementptr inbounds float* %tmp1291, i64 1
+ %tmp1293 = getelementptr inbounds float* %tmp1292, i64 1
+ %tmp1294 = getelementptr inbounds float* %tmp1293, i64 1
+ %tmp1295 = getelementptr inbounds float* %tmp1294, i64 1
+ %tmp1296 = getelementptr inbounds float* %tmp1295, i64 1
+ %tmp1297 = getelementptr inbounds float* %tmp1296, i64 1
+ %tmp1298 = getelementptr inbounds float* %tmp1297, i64 1
+ %tmp1299 = getelementptr inbounds float* %tmp1298, i64 1
+ %tmp1300 = getelementptr inbounds float* %tmp1299, i64 1
+ %tmp1301 = getelementptr inbounds float* %tmp1300, i64 1
+ %tmp1302 = getelementptr inbounds float* %tmp1301, i64 1
+ %tmp1303 = getelementptr inbounds float* %tmp1302, i64 1
+ %tmp1304 = getelementptr inbounds float* %tmp1303, i64 1
+ %tmp1305 = getelementptr inbounds float* %tmp1304, i64 1
+ %tmp1306 = getelementptr inbounds float* %tmp1305, i64 1
+ %tmp1307 = getelementptr inbounds float* %tmp1306, i64 1
+ %tmp1308 = getelementptr inbounds float* %tmp1307, i64 1
+ %tmp1309 = getelementptr inbounds float* %tmp1308, i64 1
+ %tmp1310 = getelementptr inbounds float* %tmp1309, i64 1
+ %tmp1311 = getelementptr inbounds float* %tmp1310, i64 1
+ %tmp1312 = getelementptr inbounds float* %tmp1311, i64 1
+ %tmp1313 = getelementptr inbounds float* %tmp1312, i64 1
+ %tmp1314 = getelementptr inbounds float* %tmp1313, i64 1
+ %tmp1315 = getelementptr inbounds float* %tmp1314, i64 1
+ %tmp1316 = getelementptr inbounds float* %tmp1315, i64 1
+ %tmp1317 = getelementptr inbounds float* %tmp1316, i64 1
+ %tmp1318 = getelementptr inbounds float* %tmp1317, i64 1
+ %tmp1319 = getelementptr inbounds float* %tmp1318, i64 1
+ %tmp1320 = getelementptr inbounds float* %tmp1319, i64 1
+ %tmp1321 = getelementptr inbounds float* %tmp1320, i64 1
+ %tmp1322 = getelementptr inbounds float* %tmp1321, i64 1
+ %tmp1323 = getelementptr inbounds float* %tmp1322, i64 1
+ %tmp1324 = getelementptr inbounds float* %tmp1323, i64 1
+ %tmp1325 = getelementptr inbounds float* %tmp1324, i64 1
+ %tmp1326 = getelementptr inbounds float* %tmp1325, i64 1
+ %tmp1327 = getelementptr inbounds float* %tmp1326, i64 1
+ %tmp1328 = getelementptr inbounds float* %tmp1327, i64 1
+ %tmp1329 = getelementptr inbounds float* %tmp1328, i64 1
+ %tmp1330 = getelementptr inbounds float* %tmp1329, i64 1
+ %tmp1331 = getelementptr inbounds float* %tmp1330, i64 1
+ %tmp1332 = getelementptr inbounds float* %tmp1331, i64 1
+ %tmp1333 = getelementptr inbounds float* %tmp1332, i64 1
+ %tmp1334 = getelementptr inbounds float* %tmp1333, i64 1
+ %tmp1335 = getelementptr inbounds float* %tmp1334, i64 1
+ %tmp1336 = getelementptr inbounds float* %tmp1335, i64 1
+ %tmp1337 = getelementptr inbounds float* %tmp1336, i64 1
+ %tmp1338 = getelementptr inbounds float* %tmp1337, i64 1
+ %tmp1339 = getelementptr inbounds float* %tmp1338, i64 1
+ %tmp1340 = getelementptr inbounds float* %tmp1339, i64 1
+ %tmp1341 = getelementptr inbounds float* %tmp1340, i64 1
+ %tmp1342 = getelementptr inbounds float* %tmp1341, i64 1
+ %tmp1343 = getelementptr inbounds float* %tmp1342, i64 1
+ %tmp1344 = getelementptr inbounds float* %tmp1343, i64 1
+ %tmp1345 = getelementptr inbounds float* %tmp1344, i64 1
+ %tmp1346 = getelementptr inbounds float* %tmp1345, i64 1
+ %tmp1347 = getelementptr inbounds float* %tmp1346, i64 1
+ %tmp1348 = getelementptr inbounds float* %tmp1347, i64 1
+ %tmp1349 = getelementptr inbounds float* %tmp1348, i64 1
+ %tmp1350 = getelementptr inbounds float* %tmp1349, i64 1
+ %tmp1351 = getelementptr inbounds float* %tmp1350, i64 1
+ %tmp1352 = getelementptr inbounds float* %tmp1351, i64 1
+ %tmp1353 = getelementptr inbounds float* %tmp1352, i64 1
+ %tmp1354 = getelementptr inbounds float* %tmp1353, i64 1
+ %tmp1355 = getelementptr inbounds float* %tmp1354, i64 1
+ %tmp1356 = getelementptr inbounds float* %tmp1355, i64 1
+ %tmp1357 = getelementptr inbounds float* %tmp1356, i64 1
+ %tmp1358 = getelementptr inbounds float* %tmp1357, i64 1
+ %tmp1359 = getelementptr inbounds float* %tmp1358, i64 1
+ %tmp1360 = getelementptr inbounds float* %tmp1359, i64 1
+ %tmp1361 = getelementptr inbounds float* %tmp1360, i64 1
+ %tmp1362 = getelementptr inbounds float* %tmp1361, i64 1
+ %tmp1363 = getelementptr inbounds float* %tmp1362, i64 1
+ %tmp1364 = getelementptr inbounds float* %tmp1363, i64 1
+ %tmp1365 = getelementptr inbounds float* %tmp1364, i64 1
+ %tmp1366 = getelementptr inbounds float* %tmp1365, i64 1
+ %tmp1367 = getelementptr inbounds float* %tmp1366, i64 1
+ %tmp1368 = getelementptr inbounds float* %tmp1367, i64 1
+ %tmp1369 = getelementptr inbounds float* %tmp1368, i64 1
+ %tmp1370 = getelementptr inbounds float* %tmp1369, i64 1
+ %tmp1371 = getelementptr inbounds float* %tmp1370, i64 1
+ %tmp1372 = getelementptr inbounds float* %tmp1371, i64 1
+ %tmp1373 = getelementptr inbounds float* %tmp1372, i64 1
+ %tmp1374 = getelementptr inbounds float* %tmp1373, i64 1
+ %tmp1375 = getelementptr inbounds float* %tmp1374, i64 1
+ %tmp1376 = getelementptr inbounds float* %tmp1375, i64 1
+ %tmp1377 = getelementptr inbounds float* %tmp1376, i64 1
+ %tmp1378 = getelementptr inbounds float* %tmp1377, i64 1
+ %tmp1379 = getelementptr inbounds float* %tmp1378, i64 1
+ %tmp1380 = getelementptr inbounds float* %tmp1379, i64 1
+ %tmp1381 = getelementptr inbounds float* %tmp1380, i64 1
+ %tmp1382 = getelementptr inbounds float* %tmp1381, i64 1
+ %tmp1383 = getelementptr inbounds float* %tmp1382, i64 1
+ %tmp1384 = getelementptr inbounds float* %tmp1383, i64 1
+ %tmp1385 = getelementptr inbounds float* %tmp1384, i64 1
+ %tmp1386 = getelementptr inbounds float* %tmp1385, i64 1
+ %tmp1387 = getelementptr inbounds float* %tmp1386, i64 1
+ %tmp1388 = getelementptr inbounds float* %tmp1387, i64 1
+ %tmp1389 = getelementptr inbounds float* %tmp1388, i64 1
+ %tmp1390 = getelementptr inbounds float* %tmp1389, i64 1
+ %tmp1391 = getelementptr inbounds float* %tmp1390, i64 1
+ %tmp1392 = getelementptr inbounds float* %tmp1391, i64 1
+ %tmp1393 = getelementptr inbounds float* %tmp1392, i64 1
+ %tmp1394 = getelementptr inbounds float* %tmp1393, i64 1
+ %tmp1395 = getelementptr inbounds float* %tmp1394, i64 1
+ %tmp1396 = getelementptr inbounds float* %tmp1395, i64 1
+ %tmp1397 = getelementptr inbounds float* %tmp1396, i64 1
+ %tmp1398 = getelementptr inbounds float* %tmp1397, i64 1
+ %tmp1399 = getelementptr inbounds float* %tmp1398, i64 1
+ %tmp1400 = getelementptr inbounds float* %tmp1399, i64 1
+ %tmp1401 = getelementptr inbounds float* %tmp1400, i64 1
+ %tmp1402 = getelementptr inbounds float* %tmp1401, i64 1
+ %tmp1403 = getelementptr inbounds float* %tmp1402, i64 1
+ %tmp1404 = getelementptr inbounds float* %tmp1403, i64 1
+ %tmp1405 = getelementptr inbounds float* %tmp1404, i64 1
+ %tmp1406 = getelementptr inbounds float* %tmp1405, i64 1
+ %tmp1407 = getelementptr inbounds float* %tmp1406, i64 1
+ %tmp1408 = getelementptr inbounds float* %tmp1407, i64 1
+ %tmp1409 = getelementptr inbounds float* %tmp1408, i64 1
+ %tmp1410 = getelementptr inbounds float* %tmp1409, i64 1
+ %tmp1411 = getelementptr inbounds float* %tmp1410, i64 1
+ %tmp1412 = getelementptr inbounds float* %tmp1411, i64 1
+ %tmp1413 = getelementptr inbounds float* %tmp1412, i64 1
+ %tmp1414 = getelementptr inbounds float* %tmp1413, i64 1
+ %tmp1415 = getelementptr inbounds float* %tmp1414, i64 1
+ %tmp1416 = getelementptr inbounds float* %tmp1415, i64 1
+ %tmp1417 = getelementptr inbounds float* %tmp1416, i64 1
+ %tmp1418 = getelementptr inbounds float* %tmp1417, i64 1
+ %tmp1419 = getelementptr inbounds float* %tmp1418, i64 1
+ %tmp1420 = getelementptr inbounds float* %tmp1419, i64 1
+ %tmp1421 = getelementptr inbounds float* %tmp1420, i64 1
+ %tmp1422 = getelementptr inbounds float* %tmp1421, i64 1
+ %tmp1423 = getelementptr inbounds float* %tmp1422, i64 1
+ %tmp1424 = getelementptr inbounds float* %tmp1423, i64 1
+ %tmp1425 = getelementptr inbounds float* %tmp1424, i64 1
+ %tmp1426 = getelementptr inbounds float* %tmp1425, i64 1
+ %tmp1427 = getelementptr inbounds float* %tmp1426, i64 1
+ %tmp1428 = getelementptr inbounds float* %tmp1427, i64 1
+ %tmp1429 = getelementptr inbounds float* %tmp1428, i64 1
+ %tmp1430 = getelementptr inbounds float* %tmp1429, i64 1
+ %tmp1431 = getelementptr inbounds float* %tmp1430, i64 1
+ %tmp1432 = getelementptr inbounds float* %tmp1431, i64 1
+ %tmp1433 = getelementptr inbounds float* %tmp1432, i64 1
+ %tmp1434 = getelementptr inbounds float* %tmp1433, i64 1
+ %tmp1435 = getelementptr inbounds float* %tmp1434, i64 1
+ %tmp1436 = getelementptr inbounds float* %tmp1435, i64 1
+ %tmp1437 = getelementptr inbounds float* %tmp1436, i64 1
+ %tmp1438 = getelementptr inbounds float* %tmp1437, i64 1
+ %tmp1439 = getelementptr inbounds float* %tmp1438, i64 1
+ %tmp1440 = getelementptr inbounds float* %tmp1439, i64 1
+ %tmp1441 = getelementptr inbounds float* %tmp1440, i64 1
+ %tmp1442 = getelementptr inbounds float* %tmp1441, i64 1
+ %tmp1443 = getelementptr inbounds float* %tmp1442, i64 1
+ %tmp1444 = getelementptr inbounds float* %tmp1443, i64 1
+ %tmp1445 = getelementptr inbounds float* %tmp1444, i64 1
+ %tmp1446 = getelementptr inbounds float* %tmp1445, i64 1
+ %tmp1447 = getelementptr inbounds float* %tmp1446, i64 1
+ %tmp1448 = getelementptr inbounds float* %tmp1447, i64 1
+ %tmp1449 = getelementptr inbounds float* %tmp1448, i64 1
+ %tmp1450 = getelementptr inbounds float* %tmp1449, i64 1
+ %tmp1451 = getelementptr inbounds float* %tmp1450, i64 1
+ %tmp1452 = getelementptr inbounds float* %tmp1451, i64 1
+ %tmp1453 = getelementptr inbounds float* %tmp1452, i64 1
+ %tmp1454 = getelementptr inbounds float* %tmp1453, i64 1
+ %tmp1455 = getelementptr inbounds float* %tmp1454, i64 1
+ %tmp1456 = getelementptr inbounds float* %tmp1455, i64 1
+ %tmp1457 = getelementptr inbounds float* %tmp1456, i64 1
+ %tmp1458 = getelementptr inbounds float* %tmp1457, i64 1
+ %tmp1459 = getelementptr inbounds float* %tmp1458, i64 1
+ %tmp1460 = getelementptr inbounds float* %tmp1459, i64 1
+ %tmp1461 = getelementptr inbounds float* %tmp1460, i64 1
+ %tmp1462 = getelementptr inbounds float* %tmp1461, i64 1
+ %tmp1463 = getelementptr inbounds float* %tmp1462, i64 1
+ %tmp1464 = getelementptr inbounds float* %tmp1463, i64 1
+ %tmp1465 = getelementptr inbounds float* %tmp1464, i64 1
+ %tmp1466 = getelementptr inbounds float* %tmp1465, i64 1
+ %tmp1467 = getelementptr inbounds float* %tmp1466, i64 1
+ %tmp1468 = getelementptr inbounds float* %tmp1467, i64 1
+ %tmp1469 = getelementptr inbounds float* %tmp1468, i64 1
+ %tmp1470 = getelementptr inbounds float* %tmp1469, i64 1
+ %tmp1471 = getelementptr inbounds float* %tmp1470, i64 1
+ %tmp1472 = getelementptr inbounds float* %tmp1471, i64 1
+ %tmp1473 = getelementptr inbounds float* %tmp1472, i64 1
+ %tmp1474 = getelementptr inbounds float* %tmp1473, i64 1
+ %tmp1475 = getelementptr inbounds float* %tmp1474, i64 1
+ %tmp1476 = getelementptr inbounds float* %tmp1475, i64 1
+ %tmp1477 = getelementptr inbounds float* %tmp1476, i64 1
+ %tmp1478 = getelementptr inbounds float* %tmp1477, i64 1
+ %tmp1479 = getelementptr inbounds float* %tmp1478, i64 1
+ %tmp1480 = getelementptr inbounds float* %tmp1479, i64 1
+ %tmp1481 = getelementptr inbounds float* %tmp1480, i64 1
+ %tmp1482 = getelementptr inbounds float* %tmp1481, i64 1
+ %tmp1483 = getelementptr inbounds float* %tmp1482, i64 1
+ %tmp1484 = getelementptr inbounds float* %tmp1483, i64 1
+ %tmp1485 = getelementptr inbounds float* %tmp1484, i64 1
+ %tmp1486 = getelementptr inbounds float* %tmp1485, i64 1
+ %tmp1487 = getelementptr inbounds float* %tmp1486, i64 1
+ %tmp1488 = getelementptr inbounds float* %tmp1487, i64 1
+ %tmp1489 = getelementptr inbounds float* %tmp1488, i64 1
+ %tmp1490 = getelementptr inbounds float* %tmp1489, i64 1
+ %tmp1491 = getelementptr inbounds float* %tmp1490, i64 1
+ %tmp1492 = getelementptr inbounds float* %tmp1491, i64 1
+ %tmp1493 = getelementptr inbounds float* %tmp1492, i64 1
+ %tmp1494 = getelementptr inbounds float* %tmp1493, i64 1
+ %tmp1495 = getelementptr inbounds float* %tmp1494, i64 1
+ %tmp1496 = getelementptr inbounds float* %tmp1495, i64 1
+ %tmp1497 = getelementptr inbounds float* %tmp1496, i64 1
+ %tmp1498 = getelementptr inbounds float* %tmp1497, i64 1
+ %tmp1499 = getelementptr inbounds float* %tmp1498, i64 1
+ %tmp1500 = getelementptr inbounds float* %tmp1499, i64 1
+ %tmp1501 = getelementptr inbounds float* %tmp1500, i64 1
+ %tmp1502 = getelementptr inbounds float* %tmp1501, i64 1
+ %tmp1503 = getelementptr inbounds float* %tmp1502, i64 1
+ %tmp1504 = getelementptr inbounds float* %tmp1503, i64 1
+ %tmp1505 = getelementptr inbounds float* %tmp1504, i64 1
+ %tmp1506 = getelementptr inbounds float* %tmp1505, i64 1
+ %tmp1507 = getelementptr inbounds float* %tmp1506, i64 1
+ %tmp1508 = getelementptr inbounds float* %tmp1507, i64 1
+ %tmp1509 = getelementptr inbounds float* %tmp1508, i64 1
+ %tmp1510 = getelementptr inbounds float* %tmp1509, i64 1
+ %tmp1511 = getelementptr inbounds float* %tmp1510, i64 1
+ %tmp1512 = getelementptr inbounds float* %tmp1511, i64 1
+ %tmp1513 = getelementptr inbounds float* %tmp1512, i64 1
+ %tmp1514 = getelementptr inbounds float* %tmp1513, i64 1
+ %tmp1515 = getelementptr inbounds float* %tmp1514, i64 1
+ %tmp1516 = getelementptr inbounds float* %tmp1515, i64 1
+ %tmp1517 = getelementptr inbounds float* %tmp1516, i64 1
+ %tmp1518 = getelementptr inbounds float* %tmp1517, i64 1
+ %tmp1519 = getelementptr inbounds float* %tmp1518, i64 1
+ %tmp1520 = getelementptr inbounds float* %tmp1519, i64 1
+ %tmp1521 = getelementptr inbounds float* %tmp1520, i64 1
+ %tmp1522 = getelementptr inbounds float* %tmp1521, i64 1
+ %tmp1523 = getelementptr inbounds float* %tmp1522, i64 1
+ %tmp1524 = getelementptr inbounds float* %tmp1523, i64 1
+ %tmp1525 = getelementptr inbounds float* %tmp1524, i64 1
+ %tmp1526 = getelementptr inbounds float* %tmp1525, i64 1
+ %tmp1527 = getelementptr inbounds float* %tmp1526, i64 1
+ %tmp1528 = getelementptr inbounds float* %tmp1527, i64 1
+ %tmp1529 = getelementptr inbounds float* %tmp1528, i64 1
+ %tmp1530 = getelementptr inbounds float* %tmp1529, i64 1
+ %tmp1531 = getelementptr inbounds float* %tmp1530, i64 1
+ %tmp1532 = getelementptr inbounds float* %tmp1531, i64 1
+ %tmp1533 = getelementptr inbounds float* %tmp1532, i64 1
+ %tmp1534 = getelementptr inbounds float* %tmp1533, i64 1
+ %tmp1535 = getelementptr inbounds float* %tmp1534, i64 1
+ %tmp1536 = getelementptr inbounds float* %tmp1535, i64 1
+ %tmp1537 = getelementptr inbounds float* %tmp1536, i64 1
+ %tmp1538 = getelementptr inbounds float* %tmp1537, i64 1
+ %tmp1539 = getelementptr inbounds float* %tmp1538, i64 1
+ %tmp1540 = getelementptr inbounds float* %tmp1539, i64 1
+ %tmp1541 = getelementptr inbounds float* %tmp1540, i64 1
+ %tmp1542 = getelementptr inbounds float* %tmp1541, i64 1
+ %tmp1543 = getelementptr inbounds float* %tmp1542, i64 1
+ %tmp1544 = getelementptr inbounds float* %tmp1543, i64 1
+ %tmp1545 = getelementptr inbounds float* %tmp1544, i64 1
+ %tmp1546 = getelementptr inbounds float* %tmp1545, i64 1
+ %tmp1547 = getelementptr inbounds float* %tmp1546, i64 1
+ %tmp1548 = getelementptr inbounds float* %tmp1547, i64 1
+ %tmp1549 = getelementptr inbounds float* %tmp1548, i64 1
+ %tmp1550 = getelementptr inbounds float* %tmp1549, i64 1
+ %tmp1551 = getelementptr inbounds float* %tmp1550, i64 1
+ %tmp1552 = getelementptr inbounds float* %tmp1551, i64 1
+ %tmp1553 = getelementptr inbounds float* %tmp1552, i64 1
+ %tmp1554 = getelementptr inbounds float* %tmp1553, i64 1
+ %tmp1555 = getelementptr inbounds float* %tmp1554, i64 1
+ %tmp1556 = getelementptr inbounds float* %tmp1555, i64 1
+ %tmp1557 = getelementptr inbounds float* %tmp1556, i64 1
+ %tmp1558 = getelementptr inbounds float* %tmp1557, i64 1
+ %tmp1559 = getelementptr inbounds float* %tmp1558, i64 1
+ %tmp1560 = getelementptr inbounds float* %tmp1559, i64 1
+ %tmp1561 = getelementptr inbounds float* %tmp1560, i64 1
+ %tmp1562 = getelementptr inbounds float* %tmp1561, i64 1
+ %tmp1563 = getelementptr inbounds float* %tmp1562, i64 1
+ %tmp1564 = getelementptr inbounds float* %tmp1563, i64 1
+ %tmp1565 = getelementptr inbounds float* %tmp1564, i64 1
+ %tmp1566 = getelementptr inbounds float* %tmp1565, i64 1
+ %tmp1567 = getelementptr inbounds float* %tmp1566, i64 1
+ %tmp1568 = getelementptr inbounds float* %tmp1567, i64 1
+ %tmp1569 = getelementptr inbounds float* %tmp1568, i64 1
+ %tmp1570 = getelementptr inbounds float* %tmp1569, i64 1
+ %tmp1571 = getelementptr inbounds float* %tmp1570, i64 1
+ %tmp1572 = getelementptr inbounds float* %tmp1571, i64 1
+ %tmp1573 = getelementptr inbounds float* %tmp1572, i64 1
+ %tmp1574 = getelementptr inbounds float* %tmp1573, i64 1
+ %tmp1575 = getelementptr inbounds float* %tmp1574, i64 1
+ %tmp1576 = getelementptr inbounds float* %tmp1575, i64 1
+ %tmp1577 = getelementptr inbounds float* %tmp1576, i64 1
+ %tmp1578 = getelementptr inbounds float* %tmp1577, i64 1
+ %tmp1579 = getelementptr inbounds float* %tmp1578, i64 1
+ %tmp1580 = getelementptr inbounds float* %tmp1579, i64 1
+ %tmp1581 = getelementptr inbounds float* %tmp1580, i64 1
+ %tmp1582 = getelementptr inbounds float* %tmp1581, i64 1
+ %tmp1583 = getelementptr inbounds float* %tmp1582, i64 1
+ %tmp1584 = getelementptr inbounds float* %tmp1583, i64 1
+ %tmp1585 = getelementptr inbounds float* %tmp1584, i64 1
+ %tmp1586 = getelementptr inbounds float* %tmp1585, i64 1
+ %tmp1587 = getelementptr inbounds float* %tmp1586, i64 1
+ %tmp1588 = getelementptr inbounds float* %tmp1587, i64 1
+ %tmp1589 = getelementptr inbounds float* %tmp1588, i64 1
+ %tmp1590 = getelementptr inbounds float* %tmp1589, i64 1
+ %tmp1591 = getelementptr inbounds float* %tmp1590, i64 1
+ %tmp1592 = getelementptr inbounds float* %tmp1591, i64 1
+ %tmp1593 = getelementptr inbounds float* %tmp1592, i64 1
+ %tmp1594 = getelementptr inbounds float* %tmp1593, i64 1
+ %tmp1595 = getelementptr inbounds float* %tmp1594, i64 1
+ %tmp1596 = getelementptr inbounds float* %tmp1595, i64 1
+ %tmp1597 = getelementptr inbounds float* %tmp1596, i64 1
+ %tmp1598 = getelementptr inbounds float* %tmp1597, i64 1
+ %tmp1599 = getelementptr inbounds float* %tmp1598, i64 1
+ %tmp1600 = getelementptr inbounds float* %tmp1599, i64 1
+ %tmp1601 = getelementptr inbounds float* %tmp1600, i64 1
+ %tmp1602 = getelementptr inbounds float* %tmp1601, i64 1
+ %tmp1603 = getelementptr inbounds float* %tmp1602, i64 1
+ %tmp1604 = getelementptr inbounds float* %tmp1603, i64 1
+ %tmp1605 = getelementptr inbounds float* %tmp1604, i64 1
+ %tmp1606 = getelementptr inbounds float* %tmp1605, i64 1
+ %tmp1607 = getelementptr inbounds float* %tmp1606, i64 1
+ %tmp1608 = getelementptr inbounds float* %tmp1607, i64 1
+ %tmp1609 = getelementptr inbounds float* %tmp1608, i64 1
+ %tmp1610 = getelementptr inbounds float* %tmp1609, i64 1
+ %tmp1611 = getelementptr inbounds float* %tmp1610, i64 1
+ %tmp1612 = getelementptr inbounds float* %tmp1611, i64 1
+ %tmp1613 = getelementptr inbounds float* %tmp1612, i64 1
+ %tmp1614 = getelementptr inbounds float* %tmp1613, i64 1
+ %tmp1615 = getelementptr inbounds float* %tmp1614, i64 1
+ %tmp1616 = getelementptr inbounds float* %tmp1615, i64 1
+ %tmp1617 = getelementptr inbounds float* %tmp1616, i64 1
+ %tmp1618 = getelementptr inbounds float* %tmp1617, i64 1
+ %tmp1619 = getelementptr inbounds float* %tmp1618, i64 1
+ %tmp1620 = getelementptr inbounds float* %tmp1619, i64 1
+ %tmp1621 = getelementptr inbounds float* %tmp1620, i64 1
+ %tmp1622 = getelementptr inbounds float* %tmp1621, i64 1
+ %tmp1623 = getelementptr inbounds float* %tmp1622, i64 1
+ %tmp1624 = getelementptr inbounds float* %tmp1623, i64 1
+ %tmp1625 = getelementptr inbounds float* %tmp1624, i64 1
+ %tmp1626 = getelementptr inbounds float* %tmp1625, i64 1
+ %tmp1627 = getelementptr inbounds float* %tmp1626, i64 1
+ %tmp1628 = getelementptr inbounds float* %tmp1627, i64 1
+ %tmp1629 = getelementptr inbounds float* %tmp1628, i64 1
+ %tmp1630 = getelementptr inbounds float* %tmp1629, i64 1
+ %tmp1631 = getelementptr inbounds float* %tmp1630, i64 1
+ %tmp1632 = getelementptr inbounds float* %tmp1631, i64 1
+ %tmp1633 = getelementptr inbounds float* %tmp1632, i64 1
+ %tmp1634 = getelementptr inbounds float* %tmp1633, i64 1
+ %tmp1635 = getelementptr inbounds float* %tmp1634, i64 1
+ %tmp1636 = getelementptr inbounds float* %tmp1635, i64 1
+ %tmp1637 = getelementptr inbounds float* %tmp1636, i64 1
+ %tmp1638 = getelementptr inbounds float* %tmp1637, i64 1
+ %tmp1639 = getelementptr inbounds float* %tmp1638, i64 1
+ %tmp1640 = getelementptr inbounds float* %tmp1639, i64 1
+ %tmp1641 = getelementptr inbounds float* %tmp1640, i64 1
+ %tmp1642 = getelementptr inbounds float* %tmp1641, i64 1
+ %tmp1643 = getelementptr inbounds float* %tmp1642, i64 1
+ %tmp1644 = getelementptr inbounds float* %tmp1643, i64 1
+ %tmp1645 = getelementptr inbounds float* %tmp1644, i64 1
+ %tmp1646 = getelementptr inbounds float* %tmp1645, i64 1
+ %tmp1647 = getelementptr inbounds float* %tmp1646, i64 1
+ %tmp1648 = getelementptr inbounds float* %tmp1647, i64 1
+ %tmp1649 = getelementptr inbounds float* %tmp1648, i64 1
+ %tmp1650 = getelementptr inbounds float* %tmp1649, i64 1
+ %tmp1651 = getelementptr inbounds float* %tmp1650, i64 1
+ %tmp1652 = getelementptr inbounds float* %tmp1651, i64 1
+ %tmp1653 = getelementptr inbounds float* %tmp1652, i64 1
+ %tmp1654 = getelementptr inbounds float* %tmp1653, i64 1
+ %tmp1655 = getelementptr inbounds float* %tmp1654, i64 1
+ %tmp1656 = getelementptr inbounds float* %tmp1655, i64 1
+ %tmp1657 = getelementptr inbounds float* %tmp1656, i64 1
+ %tmp1658 = getelementptr inbounds float* %tmp1657, i64 1
+ %tmp1659 = getelementptr inbounds float* %tmp1658, i64 1
+ %tmp1660 = getelementptr inbounds float* %tmp1659, i64 1
+ %tmp1661 = getelementptr inbounds float* %tmp1660, i64 1
+ %tmp1662 = getelementptr inbounds float* %tmp1661, i64 1
+ %tmp1663 = getelementptr inbounds float* %tmp1662, i64 1
+ %tmp1664 = getelementptr inbounds float* %tmp1663, i64 1
+ %tmp1665 = getelementptr inbounds float* %tmp1664, i64 1
+ %tmp1666 = getelementptr inbounds float* %tmp1665, i64 1
+ %tmp1667 = getelementptr inbounds float* %tmp1666, i64 1
+ %tmp1668 = getelementptr inbounds float* %tmp1667, i64 1
+ %tmp1669 = getelementptr inbounds float* %tmp1668, i64 1
+ %tmp1670 = getelementptr inbounds float* %tmp1669, i64 1
+ %tmp1671 = getelementptr inbounds float* %tmp1670, i64 1
+ %tmp1672 = getelementptr inbounds float* %tmp1671, i64 1
+ %tmp1673 = getelementptr inbounds float* %tmp1672, i64 1
+ %tmp1674 = getelementptr inbounds float* %tmp1673, i64 1
+ %tmp1675 = getelementptr inbounds float* %tmp1674, i64 1
+ %tmp1676 = getelementptr inbounds float* %tmp1675, i64 1
+ %tmp1677 = getelementptr inbounds float* %tmp1676, i64 1
+ %tmp1678 = getelementptr inbounds float* %tmp1677, i64 1
+ %tmp1679 = getelementptr inbounds float* %tmp1678, i64 1
+ %tmp1680 = getelementptr inbounds float* %tmp1679, i64 1
+ %tmp1681 = getelementptr inbounds float* %tmp1680, i64 1
+ %tmp1682 = getelementptr inbounds float* %tmp1681, i64 1
+ %tmp1683 = getelementptr inbounds float* %tmp1682, i64 1
+ %tmp1684 = getelementptr inbounds float* %tmp1683, i64 1
+ %tmp1685 = getelementptr inbounds float* %tmp1684, i64 1
+ %tmp1686 = getelementptr inbounds float* %tmp1685, i64 1
+ %tmp1687 = getelementptr inbounds float* %tmp1686, i64 1
+ %tmp1688 = getelementptr inbounds float* %tmp1687, i64 1
+ %tmp1689 = getelementptr inbounds float* %tmp1688, i64 1
+ %tmp1690 = getelementptr inbounds float* %tmp1689, i64 1
+ %tmp1691 = getelementptr inbounds float* %tmp1690, i64 1
+ %tmp1692 = getelementptr inbounds float* %tmp1691, i64 1
+ %tmp1693 = getelementptr inbounds float* %tmp1692, i64 1
+ %tmp1694 = getelementptr inbounds float* %tmp1693, i64 1
+ %tmp1695 = getelementptr inbounds float* %tmp1694, i64 1
+ %tmp1696 = getelementptr inbounds float* %tmp1695, i64 1
+ %tmp1697 = getelementptr inbounds float* %tmp1696, i64 1
+ %tmp1698 = getelementptr inbounds float* %tmp1697, i64 1
+ %tmp1699 = getelementptr inbounds float* %tmp1698, i64 1
+ %tmp1700 = getelementptr inbounds float* %tmp1699, i64 1
+ %tmp1701 = getelementptr inbounds float* %tmp1700, i64 1
+ %tmp1702 = getelementptr inbounds float* %tmp1701, i64 1
+ %tmp1703 = getelementptr inbounds float* %tmp1702, i64 1
+ %tmp1704 = getelementptr inbounds float* %tmp1703, i64 1
+ %tmp1705 = getelementptr inbounds float* %tmp1704, i64 1
+ %tmp1706 = getelementptr inbounds float* %tmp1705, i64 1
+ %tmp1707 = getelementptr inbounds float* %tmp1706, i64 1
+ %tmp1708 = getelementptr inbounds float* %tmp1707, i64 1
+ %tmp1709 = getelementptr inbounds float* %tmp1708, i64 1
+ %tmp1710 = getelementptr inbounds float* %tmp1709, i64 1
+ %tmp1711 = getelementptr inbounds float* %tmp1710, i64 1
+ %tmp1712 = getelementptr inbounds float* %tmp1711, i64 1
+ %tmp1713 = getelementptr inbounds float* %tmp1712, i64 1
+ %tmp1714 = getelementptr inbounds float* %tmp1713, i64 1
+ %tmp1715 = getelementptr inbounds float* %tmp1714, i64 1
+ %tmp1716 = getelementptr inbounds float* %tmp1715, i64 1
+ %tmp1717 = getelementptr inbounds float* %tmp1716, i64 1
+ %tmp1718 = getelementptr inbounds float* %tmp1717, i64 1
+ %tmp1719 = getelementptr inbounds float* %tmp1718, i64 1
+ %tmp1720 = getelementptr inbounds float* %tmp1719, i64 1
+ %tmp1721 = getelementptr inbounds float* %tmp1720, i64 1
+ %tmp1722 = getelementptr inbounds float* %tmp1721, i64 1
+ %tmp1723 = getelementptr inbounds float* %tmp1722, i64 1
+ %tmp1724 = getelementptr inbounds float* %tmp1723, i64 1
+ %tmp1725 = getelementptr inbounds float* %tmp1724, i64 1
+ %tmp1726 = getelementptr inbounds float* %tmp1725, i64 1
+ %tmp1727 = getelementptr inbounds float* %tmp1726, i64 1
+ %tmp1728 = getelementptr inbounds float* %tmp1727, i64 1
+ %tmp1729 = getelementptr inbounds float* %tmp1728, i64 1
+ %tmp1730 = getelementptr inbounds float* %tmp1729, i64 1
+ %tmp1731 = getelementptr inbounds float* %tmp1730, i64 1
+ %tmp1732 = getelementptr inbounds float* %tmp1731, i64 1
+ %tmp1733 = getelementptr inbounds float* %tmp1732, i64 1
+ %tmp1734 = getelementptr inbounds float* %tmp1733, i64 1
+ %tmp1735 = getelementptr inbounds float* %tmp1734, i64 1
+ %tmp1736 = getelementptr inbounds float* %tmp1735, i64 1
+ %tmp1737 = getelementptr inbounds float* %tmp1736, i64 1
+ %tmp1738 = getelementptr inbounds float* %tmp1737, i64 1
+ %tmp1739 = getelementptr inbounds float* %tmp1738, i64 1
+ %tmp1740 = getelementptr inbounds float* %tmp1739, i64 1
+ %tmp1741 = getelementptr inbounds float* %tmp1740, i64 1
+ %tmp1742 = getelementptr inbounds float* %tmp1741, i64 1
+ %tmp1743 = getelementptr inbounds float* %tmp1742, i64 1
+ %tmp1744 = getelementptr inbounds float* %tmp1743, i64 1
+ %tmp1745 = getelementptr inbounds float* %tmp1744, i64 1
+ %tmp1746 = getelementptr inbounds float* %tmp1745, i64 1
+ %tmp1747 = getelementptr inbounds float* %tmp1746, i64 1
+ %tmp1748 = getelementptr inbounds float* %tmp1747, i64 1
+ %tmp1749 = getelementptr inbounds float* %tmp1748, i64 1
+ %tmp1750 = getelementptr inbounds float* %tmp1749, i64 1
+ %tmp1751 = getelementptr inbounds float* %tmp1750, i64 1
+ %tmp1752 = getelementptr inbounds float* %tmp1751, i64 1
+ %tmp1753 = getelementptr inbounds float* %tmp1752, i64 1
+ %tmp1754 = getelementptr inbounds float* %tmp1753, i64 1
+ %tmp1755 = getelementptr inbounds float* %tmp1754, i64 1
+ %tmp1756 = getelementptr inbounds float* %tmp1755, i64 1
+ %tmp1757 = getelementptr inbounds float* %tmp1756, i64 1
+ %tmp1758 = getelementptr inbounds float* %tmp1757, i64 1
+ %tmp1759 = getelementptr inbounds float* %tmp1758, i64 1
+ %tmp1760 = getelementptr inbounds float* %tmp1759, i64 1
+ %tmp1761 = getelementptr inbounds float* %tmp1760, i64 1
+ %tmp1762 = getelementptr inbounds float* %tmp1761, i64 1
+ %tmp1763 = getelementptr inbounds float* %tmp1762, i64 1
+ %tmp1764 = getelementptr inbounds float* %tmp1763, i64 1
+ %tmp1765 = getelementptr inbounds float* %tmp1764, i64 1
+ %tmp1766 = getelementptr inbounds float* %tmp1765, i64 1
+ %tmp1767 = getelementptr inbounds float* %tmp1766, i64 1
+ %tmp1768 = getelementptr inbounds float* %tmp1767, i64 1
+ %tmp1769 = getelementptr inbounds float* %tmp1768, i64 1
+ %tmp1770 = getelementptr inbounds float* %tmp1769, i64 1
+ %tmp1771 = getelementptr inbounds float* %tmp1770, i64 1
+ %tmp1772 = getelementptr inbounds float* %tmp1771, i64 1
+ %tmp1773 = getelementptr inbounds float* %tmp1772, i64 1
+ %tmp1774 = getelementptr inbounds float* %tmp1773, i64 1
+ %tmp1775 = getelementptr inbounds float* %tmp1774, i64 1
+ %tmp1776 = getelementptr inbounds float* %tmp1775, i64 1
+ %tmp1777 = getelementptr inbounds float* %tmp1776, i64 1
+ %tmp1778 = getelementptr inbounds float* %tmp1777, i64 1
+ %tmp1779 = getelementptr inbounds float* %tmp1778, i64 1
+ %tmp1780 = getelementptr inbounds float* %tmp1779, i64 1
+ %tmp1781 = getelementptr inbounds float* %tmp1780, i64 1
+ %tmp1782 = getelementptr inbounds float* %tmp1781, i64 1
+ %tmp1783 = getelementptr inbounds float* %tmp1782, i64 1
+ %tmp1784 = getelementptr inbounds float* %tmp1783, i64 1
+ %tmp1785 = getelementptr inbounds float* %tmp1784, i64 1
+ %tmp1786 = getelementptr inbounds float* %tmp1785, i64 1
+ %tmp1787 = getelementptr inbounds float* %tmp1786, i64 1
+ %tmp1788 = getelementptr inbounds float* %tmp1787, i64 1
+ %tmp1789 = getelementptr inbounds float* %tmp1788, i64 1
+ %tmp1790 = getelementptr inbounds float* %tmp1789, i64 1
+ %tmp1791 = getelementptr inbounds float* %tmp1790, i64 1
+ %tmp1792 = getelementptr inbounds float* %tmp1791, i64 1
+ %tmp1793 = getelementptr inbounds float* %tmp1792, i64 1
+ %tmp1794 = getelementptr inbounds float* %tmp1793, i64 1
+ %tmp1795 = getelementptr inbounds float* %tmp1794, i64 1
+ %tmp1796 = getelementptr inbounds float* %tmp1795, i64 1
+ %tmp1797 = getelementptr inbounds float* %tmp1796, i64 1
+ %tmp1798 = getelementptr inbounds float* %tmp1797, i64 1
+ %tmp1799 = getelementptr inbounds float* %tmp1798, i64 1
+ %tmp1800 = getelementptr inbounds float* %tmp1799, i64 1
+ %tmp1801 = getelementptr inbounds float* %tmp1800, i64 1
+ %tmp1802 = getelementptr inbounds float* %tmp1801, i64 1
+ %tmp1803 = getelementptr inbounds float* %tmp1802, i64 1
+ %tmp1804 = getelementptr inbounds float* %tmp1803, i64 1
+ %tmp1805 = getelementptr inbounds float* %tmp1804, i64 1
+ %tmp1806 = getelementptr inbounds float* %tmp1805, i64 1
+ %tmp1807 = getelementptr inbounds float* %tmp1806, i64 1
+ %tmp1808 = getelementptr inbounds float* %tmp1807, i64 1
+ %tmp1809 = getelementptr inbounds float* %tmp1808, i64 1
+ %tmp1810 = getelementptr inbounds float* %tmp1809, i64 1
+ %tmp1811 = getelementptr inbounds float* %tmp1810, i64 1
+ %tmp1812 = getelementptr inbounds float* %tmp1811, i64 1
+ %tmp1813 = getelementptr inbounds float* %tmp1812, i64 1
+ %tmp1814 = getelementptr inbounds float* %tmp1813, i64 1
+ %tmp1815 = getelementptr inbounds float* %tmp1814, i64 1
+ %tmp1816 = getelementptr inbounds float* %tmp1815, i64 1
+ %tmp1817 = getelementptr inbounds float* %tmp1816, i64 1
+ %tmp1818 = getelementptr inbounds float* %tmp1817, i64 1
+ %tmp1819 = getelementptr inbounds float* %tmp1818, i64 1
+ %tmp1820 = getelementptr inbounds float* %tmp1819, i64 1
+ %tmp1821 = getelementptr inbounds float* %tmp1820, i64 1
+ %tmp1822 = getelementptr inbounds float* %tmp1821, i64 1
+ %tmp1823 = getelementptr inbounds float* %tmp1822, i64 1
+ %tmp1824 = getelementptr inbounds float* %tmp1823, i64 1
+ %tmp1825 = getelementptr inbounds float* %tmp1824, i64 1
+ %tmp1826 = getelementptr inbounds float* %tmp1825, i64 1
+ %tmp1827 = getelementptr inbounds float* %tmp1826, i64 1
+ %tmp1828 = getelementptr inbounds float* %tmp1827, i64 1
+ %tmp1829 = getelementptr inbounds float* %tmp1828, i64 1
+ %tmp1830 = getelementptr inbounds float* %tmp1829, i64 1
+ %tmp1831 = getelementptr inbounds float* %tmp1830, i64 1
+ %tmp1832 = getelementptr inbounds float* %tmp1831, i64 1
+ %tmp1833 = getelementptr inbounds float* %tmp1832, i64 1
+ %tmp1834 = getelementptr inbounds float* %tmp1833, i64 1
+ %tmp1835 = getelementptr inbounds float* %tmp1834, i64 1
+ %tmp1836 = getelementptr inbounds float* %tmp1835, i64 1
+ %tmp1837 = getelementptr inbounds float* %tmp1836, i64 1
+ %tmp1838 = getelementptr inbounds float* %tmp1837, i64 1
+ %tmp1839 = getelementptr inbounds float* %tmp1838, i64 1
+ %tmp1840 = getelementptr inbounds float* %tmp1839, i64 1
+ %tmp1841 = getelementptr inbounds float* %tmp1840, i64 1
+ %tmp1842 = getelementptr inbounds float* %tmp1841, i64 1
+ %tmp1843 = getelementptr inbounds float* %tmp1842, i64 1
+ %tmp1844 = getelementptr inbounds float* %tmp1843, i64 1
+ %tmp1845 = getelementptr inbounds float* %tmp1844, i64 1
+ %tmp1846 = getelementptr inbounds float* %tmp1845, i64 1
+ %tmp1847 = getelementptr inbounds float* %tmp1846, i64 1
+ %tmp1848 = getelementptr inbounds float* %tmp1847, i64 1
+ %tmp1849 = getelementptr inbounds float* %tmp1848, i64 1
+ %tmp1850 = getelementptr inbounds float* %tmp1849, i64 1
+ %tmp1851 = getelementptr inbounds float* %tmp1850, i64 1
+ %tmp1852 = getelementptr inbounds float* %tmp1851, i64 1
+ %tmp1853 = getelementptr inbounds float* %tmp1852, i64 1
+ %tmp1854 = getelementptr inbounds float* %tmp1853, i64 1
+ %tmp1855 = getelementptr inbounds float* %tmp1854, i64 1
+ %tmp1856 = getelementptr inbounds float* %tmp1855, i64 1
+ %tmp1857 = getelementptr inbounds float* %tmp1856, i64 1
+ %tmp1858 = getelementptr inbounds float* %tmp1857, i64 1
+ %tmp1859 = getelementptr inbounds float* %tmp1858, i64 1
+ %tmp1860 = getelementptr inbounds float* %tmp1859, i64 1
+ %tmp1861 = getelementptr inbounds float* %tmp1860, i64 1
+ %tmp1862 = getelementptr inbounds float* %tmp1861, i64 1
+ %tmp1863 = getelementptr inbounds float* %tmp1862, i64 1
+ %tmp1864 = getelementptr inbounds float* %tmp1863, i64 1
+ %tmp1865 = getelementptr inbounds float* %tmp1864, i64 1
+ %tmp1866 = getelementptr inbounds float* %tmp1865, i64 1
+ %tmp1867 = getelementptr inbounds float* %tmp1866, i64 1
+ %tmp1868 = getelementptr inbounds float* %tmp1867, i64 1
+ %tmp1869 = getelementptr inbounds float* %tmp1868, i64 1
+ %tmp1870 = getelementptr inbounds float* %tmp1869, i64 1
+ %tmp1871 = getelementptr inbounds float* %tmp1870, i64 1
+ %tmp1872 = getelementptr inbounds float* %tmp1871, i64 1
+ %tmp1873 = getelementptr inbounds float* %tmp1872, i64 1
+ %tmp1874 = getelementptr inbounds float* %tmp1873, i64 1
+ %tmp1875 = getelementptr inbounds float* %tmp1874, i64 1
+ %tmp1876 = getelementptr inbounds float* %tmp1875, i64 1
+ %tmp1877 = getelementptr inbounds float* %tmp1876, i64 1
+ %tmp1878 = getelementptr inbounds float* %tmp1877, i64 1
+ %tmp1879 = getelementptr inbounds float* %tmp1878, i64 1
+ %tmp1880 = getelementptr inbounds float* %tmp1879, i64 1
+ %tmp1881 = getelementptr inbounds float* %tmp1880, i64 1
+ %tmp1882 = getelementptr inbounds float* %tmp1881, i64 1
+ %tmp1883 = getelementptr inbounds float* %tmp1882, i64 1
+ %tmp1884 = getelementptr inbounds float* %tmp1883, i64 1
+ %tmp1885 = getelementptr inbounds float* %tmp1884, i64 1
+ %tmp1886 = getelementptr inbounds float* %tmp1885, i64 1
+ %tmp1887 = getelementptr inbounds float* %tmp1886, i64 1
+ %tmp1888 = getelementptr inbounds float* %tmp1887, i64 1
+ %tmp1889 = getelementptr inbounds float* %tmp1888, i64 1
+ %tmp1890 = getelementptr inbounds float* %tmp1889, i64 1
+ %tmp1891 = getelementptr inbounds float* %tmp1890, i64 1
+ %tmp1892 = getelementptr inbounds float* %tmp1891, i64 1
+ %tmp1893 = getelementptr inbounds float* %tmp1892, i64 1
+ %tmp1894 = getelementptr inbounds float* %tmp1893, i64 1
+ %tmp1895 = getelementptr inbounds float* %tmp1894, i64 1
+ %tmp1896 = getelementptr inbounds float* %tmp1895, i64 1
+ %tmp1897 = getelementptr inbounds float* %tmp1896, i64 1
+ %tmp1898 = getelementptr inbounds float* %tmp1897, i64 1
+ %tmp1899 = getelementptr inbounds float* %tmp1898, i64 1
+ %tmp1900 = getelementptr inbounds float* %tmp1899, i64 1
+ %tmp1901 = getelementptr inbounds float* %tmp1900, i64 1
+ %tmp1902 = getelementptr inbounds float* %tmp1901, i64 1
+ %tmp1903 = getelementptr inbounds float* %tmp1902, i64 1
+ %tmp1904 = getelementptr inbounds float* %tmp1903, i64 1
+ %tmp1905 = getelementptr inbounds float* %tmp1904, i64 1
+ %tmp1906 = getelementptr inbounds float* %tmp1905, i64 1
+ %tmp1907 = getelementptr inbounds float* %tmp1906, i64 1
+ %tmp1908 = getelementptr inbounds float* %tmp1907, i64 1
+ %tmp1909 = getelementptr inbounds float* %tmp1908, i64 1
+ %tmp1910 = getelementptr inbounds float* %tmp1909, i64 1
+ %tmp1911 = getelementptr inbounds float* %tmp1910, i64 1
+ %tmp1912 = getelementptr inbounds float* %tmp1911, i64 1
+ %tmp1913 = getelementptr inbounds float* %tmp1912, i64 1
+ %tmp1914 = getelementptr inbounds float* %tmp1913, i64 1
+ %tmp1915 = getelementptr inbounds float* %tmp1914, i64 1
+ %tmp1916 = getelementptr inbounds float* %tmp1915, i64 1
+ %tmp1917 = getelementptr inbounds float* %tmp1916, i64 1
+ %tmp1918 = getelementptr inbounds float* %tmp1917, i64 1
+ %tmp1919 = getelementptr inbounds float* %tmp1918, i64 1
+ %tmp1920 = getelementptr inbounds float* %tmp1919, i64 1
+ %tmp1921 = getelementptr inbounds float* %tmp1920, i64 1
+ %tmp1922 = getelementptr inbounds float* %tmp1921, i64 1
+ %tmp1923 = getelementptr inbounds float* %tmp1922, i64 1
+ %tmp1924 = getelementptr inbounds float* %tmp1923, i64 1
+ %tmp1925 = getelementptr inbounds float* %tmp1924, i64 1
+ %tmp1926 = getelementptr inbounds float* %tmp1925, i64 1
+ %tmp1927 = getelementptr inbounds float* %tmp1926, i64 1
+ %tmp1928 = getelementptr inbounds float* %tmp1927, i64 1
+ %tmp1929 = getelementptr inbounds float* %tmp1928, i64 1
+ %tmp1930 = getelementptr inbounds float* %tmp1929, i64 1
+ %tmp1931 = getelementptr inbounds float* %tmp1930, i64 1
+ %tmp1932 = getelementptr inbounds float* %tmp1931, i64 1
+ %tmp1933 = getelementptr inbounds float* %tmp1932, i64 1
+ %tmp1934 = getelementptr inbounds float* %tmp1933, i64 1
+ %tmp1935 = getelementptr inbounds float* %tmp1934, i64 1
+ %tmp1936 = getelementptr inbounds float* %tmp1935, i64 1
+ %tmp1937 = getelementptr inbounds float* %tmp1936, i64 1
+ %tmp1938 = getelementptr inbounds float* %tmp1937, i64 1
+ %tmp1939 = getelementptr inbounds float* %tmp1938, i64 1
+ %tmp1940 = getelementptr inbounds float* %tmp1939, i64 1
+ %tmp1941 = getelementptr inbounds float* %tmp1940, i64 1
+ %tmp1942 = getelementptr inbounds float* %tmp1941, i64 1
+ %tmp1943 = getelementptr inbounds float* %tmp1942, i64 1
+ %tmp1944 = getelementptr inbounds float* %tmp1943, i64 1
+ %tmp1945 = getelementptr inbounds float* %tmp1944, i64 1
+ %tmp1946 = getelementptr inbounds float* %tmp1945, i64 1
+ %tmp1947 = getelementptr inbounds float* %tmp1946, i64 1
+ %tmp1948 = getelementptr inbounds float* %tmp1947, i64 1
+ %tmp1949 = getelementptr inbounds float* %tmp1948, i64 1
+ %tmp1950 = getelementptr inbounds float* %tmp1949, i64 1
+ %tmp1951 = getelementptr inbounds float* %tmp1950, i64 1
+ %tmp1952 = getelementptr inbounds float* %tmp1951, i64 1
+ %tmp1953 = getelementptr inbounds float* %tmp1952, i64 1
+ %tmp1954 = getelementptr inbounds float* %tmp1953, i64 1
+ %tmp1955 = getelementptr inbounds float* %tmp1954, i64 1
+ %tmp1956 = getelementptr inbounds float* %tmp1955, i64 1
+ %tmp1957 = getelementptr inbounds float* %tmp1956, i64 1
+ %tmp1958 = getelementptr inbounds float* %tmp1957, i64 1
+ %tmp1959 = getelementptr inbounds float* %tmp1958, i64 1
+ %tmp1960 = getelementptr inbounds float* %tmp1959, i64 1
+ %tmp1961 = getelementptr inbounds float* %tmp1960, i64 1
+ %tmp1962 = getelementptr inbounds float* %tmp1961, i64 1
+ %tmp1963 = getelementptr inbounds float* %tmp1962, i64 1
+ %tmp1964 = getelementptr inbounds float* %tmp1963, i64 1
+ %tmp1965 = getelementptr inbounds float* %tmp1964, i64 1
+ %tmp1966 = getelementptr inbounds float* %tmp1965, i64 1
+ %tmp1967 = getelementptr inbounds float* %tmp1966, i64 1
+ %tmp1968 = getelementptr inbounds float* %tmp1967, i64 1
+ %tmp1969 = getelementptr inbounds float* %tmp1968, i64 1
+ %tmp1970 = getelementptr inbounds float* %tmp1969, i64 1
+ %tmp1971 = getelementptr inbounds float* %tmp1970, i64 1
+ %tmp1972 = getelementptr inbounds float* %tmp1971, i64 1
+ %tmp1973 = getelementptr inbounds float* %tmp1972, i64 1
+ %tmp1974 = getelementptr inbounds float* %tmp1973, i64 1
+ %tmp1975 = getelementptr inbounds float* %tmp1974, i64 1
+ %tmp1976 = getelementptr inbounds float* %tmp1975, i64 1
+ %tmp1977 = getelementptr inbounds float* %tmp1976, i64 1
+ %tmp1978 = getelementptr inbounds float* %tmp1977, i64 1
+ %tmp1979 = getelementptr inbounds float* %tmp1978, i64 1
+ %tmp1980 = getelementptr inbounds float* %tmp1979, i64 1
+ %tmp1981 = getelementptr inbounds float* %tmp1980, i64 1
+ %tmp1982 = getelementptr inbounds float* %tmp1981, i64 1
+ %tmp1983 = getelementptr inbounds float* %tmp1982, i64 1
+ %tmp1984 = getelementptr inbounds float* %tmp1983, i64 1
+ %tmp1985 = getelementptr inbounds float* %tmp1984, i64 1
+ %tmp1986 = getelementptr inbounds float* %tmp1985, i64 1
+ %tmp1987 = getelementptr inbounds float* %tmp1986, i64 1
+ %tmp1988 = getelementptr inbounds float* %tmp1987, i64 1
+ %tmp1989 = getelementptr inbounds float* %tmp1988, i64 1
+ %tmp1990 = getelementptr inbounds float* %tmp1989, i64 1
+ %tmp1991 = getelementptr inbounds float* %tmp1990, i64 1
+ %tmp1992 = getelementptr inbounds float* %tmp1991, i64 1
+ %tmp1993 = getelementptr inbounds float* %tmp1992, i64 1
+ %tmp1994 = getelementptr inbounds float* %tmp1993, i64 1
+ %tmp1995 = getelementptr inbounds float* %tmp1994, i64 1
+ %tmp1996 = getelementptr inbounds float* %tmp1995, i64 1
+ %tmp1997 = getelementptr inbounds float* %tmp1996, i64 1
+ %tmp1998 = getelementptr inbounds float* %tmp1997, i64 1
+ %tmp1999 = getelementptr inbounds float* %tmp1998, i64 1
+ %tmp2000 = getelementptr inbounds float* %tmp1999, i64 1
+ %tmp2001 = getelementptr inbounds float* %tmp2000, i64 1
+ %tmp2002 = getelementptr inbounds float* %tmp2001, i64 1
+ %tmp2003 = getelementptr inbounds float* %tmp2002, i64 1
+ %tmp2004 = getelementptr inbounds float* %tmp2003, i64 1
+ %tmp2005 = getelementptr inbounds float* %tmp2004, i64 1
+ %tmp2006 = getelementptr inbounds float* %tmp2005, i64 1
+ %tmp2007 = getelementptr inbounds float* %tmp2006, i64 1
+ %tmp2008 = getelementptr inbounds float* %tmp2007, i64 1
+ %tmp2009 = getelementptr inbounds float* %tmp2008, i64 1
+ %tmp2010 = getelementptr inbounds float* %tmp2009, i64 1
+ %tmp2011 = getelementptr inbounds float* %tmp2010, i64 1
+ %tmp2012 = getelementptr inbounds float* %tmp2011, i64 1
+ %tmp2013 = getelementptr inbounds float* %tmp2012, i64 1
+ %tmp2014 = getelementptr inbounds float* %tmp2013, i64 1
+ %tmp2015 = getelementptr inbounds float* %tmp2014, i64 1
+ %tmp2016 = getelementptr inbounds float* %tmp2015, i64 1
+ %tmp2017 = getelementptr inbounds float* %tmp2016, i64 1
+ %tmp2018 = getelementptr inbounds float* %tmp2017, i64 1
+ %tmp2019 = getelementptr inbounds float* %tmp2018, i64 1
+ %tmp2020 = getelementptr inbounds float* %tmp2019, i64 1
+ %tmp2021 = getelementptr inbounds float* %tmp2020, i64 1
+ %tmp2022 = getelementptr inbounds float* %tmp2021, i64 1
+ %tmp2023 = getelementptr inbounds float* %tmp2022, i64 1
+ %tmp2024 = getelementptr inbounds float* %tmp2023, i64 1
+ %tmp2025 = getelementptr inbounds float* %tmp2024, i64 1
+ %tmp2026 = getelementptr inbounds float* %tmp2025, i64 1
+ %tmp2027 = getelementptr inbounds float* %tmp2026, i64 1
+ %tmp2028 = getelementptr inbounds float* %tmp2027, i64 1
+ %tmp2029 = getelementptr inbounds float* %tmp2028, i64 1
+ %tmp2030 = getelementptr inbounds float* %tmp2029, i64 1
+ %tmp2031 = getelementptr inbounds float* %tmp2030, i64 1
+ %tmp2032 = getelementptr inbounds float* %tmp2031, i64 1
+ %tmp2033 = getelementptr inbounds float* %tmp2032, i64 1
+ %tmp2034 = getelementptr inbounds float* %tmp2033, i64 1
+ %tmp2035 = getelementptr inbounds float* %tmp2034, i64 1
+ %tmp2036 = getelementptr inbounds float* %tmp2035, i64 1
+ %tmp2037 = getelementptr inbounds float* %tmp2036, i64 1
+ %tmp2038 = getelementptr inbounds float* %tmp2037, i64 1
+ %tmp2039 = getelementptr inbounds float* %tmp2038, i64 1
+ %tmp2040 = getelementptr inbounds float* %tmp2039, i64 1
+ %tmp2041 = getelementptr inbounds float* %tmp2040, i64 1
+ %tmp2042 = getelementptr inbounds float* %tmp2041, i64 1
+ %tmp2043 = getelementptr inbounds float* %tmp2042, i64 1
+ %tmp2044 = getelementptr inbounds float* %tmp2043, i64 1
+ %tmp2045 = getelementptr inbounds float* %tmp2044, i64 1
+ %tmp2046 = getelementptr inbounds float* %tmp2045, i64 1
+ %tmp2047 = getelementptr inbounds float* %tmp2046, i64 1
+ %tmp2048 = getelementptr inbounds float* %tmp2047, i64 1
+ %tmp2049 = getelementptr inbounds float* %tmp2048, i64 1
+ %tmp2050 = getelementptr inbounds float* %tmp2049, i64 1
+ %tmp2051 = getelementptr inbounds float* %tmp2050, i64 1
+ %tmp2052 = getelementptr inbounds float* %tmp2051, i64 1
+ %tmp2053 = getelementptr inbounds float* %tmp2052, i64 1
+ %tmp2054 = getelementptr inbounds float* %tmp2053, i64 1
+ %tmp2055 = getelementptr inbounds float* %tmp2054, i64 1
+ %tmp2056 = getelementptr inbounds float* %tmp2055, i64 1
+ %tmp2057 = getelementptr inbounds float* %tmp2056, i64 1
+ %tmp2058 = getelementptr inbounds float* %tmp2057, i64 1
+ %tmp2059 = getelementptr inbounds float* %tmp2058, i64 1
+ %tmp2060 = getelementptr inbounds float* %tmp2059, i64 1
+ %tmp2061 = getelementptr inbounds float* %tmp2060, i64 1
+ %tmp2062 = getelementptr inbounds float* %tmp2061, i64 1
+ %tmp2063 = getelementptr inbounds float* %tmp2062, i64 1
+ %tmp2064 = getelementptr inbounds float* %tmp2063, i64 1
+ %tmp2065 = getelementptr inbounds float* %tmp2064, i64 1
+ %tmp2066 = getelementptr inbounds float* %tmp2065, i64 1
+ %tmp2067 = getelementptr inbounds float* %tmp2066, i64 1
+ %tmp2068 = getelementptr inbounds float* %tmp2067, i64 1
+ %tmp2069 = getelementptr inbounds float* %tmp2068, i64 1
+ %tmp2070 = getelementptr inbounds float* %tmp2069, i64 1
+ %tmp2071 = getelementptr inbounds float* %tmp2070, i64 1
+ %tmp2072 = getelementptr inbounds float* %tmp2071, i64 1
+ %tmp2073 = getelementptr inbounds float* %tmp2072, i64 1
+ %tmp2074 = getelementptr inbounds float* %tmp2073, i64 1
+ %tmp2075 = getelementptr inbounds float* %tmp2074, i64 1
+ %tmp2076 = getelementptr inbounds float* %tmp2075, i64 1
+ %tmp2077 = getelementptr inbounds float* %tmp2076, i64 1
+ %tmp2078 = getelementptr inbounds float* %tmp2077, i64 1
+ %tmp2079 = getelementptr inbounds float* %tmp2078, i64 1
+ %tmp2080 = getelementptr inbounds float* %tmp2079, i64 1
+ %tmp2081 = getelementptr inbounds float* %tmp2080, i64 1
+ %tmp2082 = getelementptr inbounds float* %tmp2081, i64 1
+ %tmp2083 = getelementptr inbounds float* %tmp2082, i64 1
+ %tmp2084 = getelementptr inbounds float* %tmp2083, i64 1
+ %tmp2085 = getelementptr inbounds float* %tmp2084, i64 1
+ %tmp2086 = getelementptr inbounds float* %tmp2085, i64 1
+ %tmp2087 = getelementptr inbounds float* %tmp2086, i64 1
+ %tmp2088 = getelementptr inbounds float* %tmp2087, i64 1
+ %tmp2089 = getelementptr inbounds float* %tmp2088, i64 1
+ %tmp2090 = getelementptr inbounds float* %tmp2089, i64 1
+ %tmp2091 = getelementptr inbounds float* %tmp2090, i64 1
+ %tmp2092 = getelementptr inbounds float* %tmp2091, i64 1
+ %tmp2093 = getelementptr inbounds float* %tmp2092, i64 1
+ %tmp2094 = getelementptr inbounds float* %tmp2093, i64 1
+ %tmp2095 = getelementptr inbounds float* %tmp2094, i64 1
+ %tmp2096 = getelementptr inbounds float* %tmp2095, i64 1
+ %tmp2097 = getelementptr inbounds float* %tmp2096, i64 1
+ %tmp2098 = getelementptr inbounds float* %tmp2097, i64 1
+ %tmp2099 = getelementptr inbounds float* %tmp2098, i64 1
+ %tmp2100 = getelementptr inbounds float* %tmp2099, i64 1
+ %tmp2101 = getelementptr inbounds float* %tmp2100, i64 1
+ %tmp2102 = getelementptr inbounds float* %tmp2101, i64 1
+ %tmp2103 = getelementptr inbounds float* %tmp2102, i64 1
+ %tmp2104 = getelementptr inbounds float* %tmp2103, i64 1
+ %tmp2105 = getelementptr inbounds float* %tmp2104, i64 1
+ %tmp2106 = getelementptr inbounds float* %tmp2105, i64 1
+ %tmp2107 = getelementptr inbounds float* %tmp2106, i64 1
+ %tmp2108 = getelementptr inbounds float* %tmp2107, i64 1
+ %tmp2109 = getelementptr inbounds float* %tmp2108, i64 1
+ %tmp2110 = getelementptr inbounds float* %tmp2109, i64 1
+ %tmp2111 = getelementptr inbounds float* %tmp2110, i64 1
+ %tmp2112 = getelementptr inbounds float* %tmp2111, i64 1
+ %tmp2113 = getelementptr inbounds float* %tmp2112, i64 1
+ %tmp2114 = getelementptr inbounds float* %tmp2113, i64 1
+ %tmp2115 = getelementptr inbounds float* %tmp2114, i64 1
+ %tmp2116 = getelementptr inbounds float* %tmp2115, i64 1
+ %tmp2117 = getelementptr inbounds float* %tmp2116, i64 1
+ %tmp2118 = getelementptr inbounds float* %tmp2117, i64 1
+ %tmp2119 = getelementptr inbounds float* %tmp2118, i64 1
+ %tmp2120 = getelementptr inbounds float* %tmp2119, i64 1
+ %tmp2121 = getelementptr inbounds float* %tmp2120, i64 1
+ %tmp2122 = getelementptr inbounds float* %tmp2121, i64 1
+ %tmp2123 = getelementptr inbounds float* %tmp2122, i64 1
+ %tmp2124 = getelementptr inbounds float* %tmp2123, i64 1
+ %tmp2125 = getelementptr inbounds float* %tmp2124, i64 1
+ %tmp2126 = getelementptr inbounds float* %tmp2125, i64 1
+ %tmp2127 = getelementptr inbounds float* %tmp2126, i64 1
+ %tmp2128 = getelementptr inbounds float* %tmp2127, i64 1
+ %tmp2129 = getelementptr inbounds float* %tmp2128, i64 1
+ %tmp2130 = getelementptr inbounds float* %tmp2129, i64 1
+ %tmp2131 = getelementptr inbounds float* %tmp2130, i64 1
+ %tmp2132 = getelementptr inbounds float* %tmp2131, i64 1
+ %tmp2133 = getelementptr inbounds float* %tmp2132, i64 1
+ %tmp2134 = getelementptr inbounds float* %tmp2133, i64 1
+ %tmp2135 = getelementptr inbounds float* %tmp2134, i64 1
+ %tmp2136 = getelementptr inbounds float* %tmp2135, i64 1
+ %tmp2137 = getelementptr inbounds float* %tmp2136, i64 1
+ %tmp2138 = getelementptr inbounds float* %tmp2137, i64 1
+ %tmp2139 = getelementptr inbounds float* %tmp2138, i64 1
+ %tmp2140 = getelementptr inbounds float* %tmp2139, i64 1
+ %tmp2141 = getelementptr inbounds float* %tmp2140, i64 1
+ %tmp2142 = getelementptr inbounds float* %tmp2141, i64 1
+ %tmp2143 = getelementptr inbounds float* %tmp2142, i64 1
+ %tmp2144 = getelementptr inbounds float* %tmp2143, i64 1
+ %tmp2145 = getelementptr inbounds float* %tmp2144, i64 1
+ %tmp2146 = getelementptr inbounds float* %tmp2145, i64 1
+ %tmp2147 = getelementptr inbounds float* %tmp2146, i64 1
+ %tmp2148 = getelementptr inbounds float* %tmp2147, i64 1
+ %tmp2149 = getelementptr inbounds float* %tmp2148, i64 1
+ %tmp2150 = getelementptr inbounds float* %tmp2149, i64 1
+ %tmp2151 = getelementptr inbounds float* %tmp2150, i64 1
+ %tmp2152 = getelementptr inbounds float* %tmp2151, i64 1
+ %tmp2153 = getelementptr inbounds float* %tmp2152, i64 1
+ %tmp2154 = getelementptr inbounds float* %tmp2153, i64 1
+ %tmp2155 = getelementptr inbounds float* %tmp2154, i64 1
+ %tmp2156 = getelementptr inbounds float* %tmp2155, i64 1
+ %tmp2157 = getelementptr inbounds float* %tmp2156, i64 1
+ %tmp2158 = getelementptr inbounds float* %tmp2157, i64 1
+ %tmp2159 = getelementptr inbounds float* %tmp2158, i64 1
+ %tmp2160 = getelementptr inbounds float* %tmp2159, i64 1
+ %tmp2161 = getelementptr inbounds float* %tmp2160, i64 1
+ %tmp2162 = getelementptr inbounds float* %tmp2161, i64 1
+ %tmp2163 = getelementptr inbounds float* %tmp2162, i64 1
+ %tmp2164 = getelementptr inbounds float* %tmp2163, i64 1
+ %tmp2165 = getelementptr inbounds float* %tmp2164, i64 1
+ %tmp2166 = getelementptr inbounds float* %tmp2165, i64 1
+ %tmp2167 = getelementptr inbounds float* %tmp2166, i64 1
+ %tmp2168 = getelementptr inbounds float* %tmp2167, i64 1
+ %tmp2169 = getelementptr inbounds float* %tmp2168, i64 1
+ %tmp2170 = getelementptr inbounds float* %tmp2169, i64 1
+ %tmp2171 = getelementptr inbounds float* %tmp2170, i64 1
+ %tmp2172 = getelementptr inbounds float* %tmp2171, i64 1
+ %tmp2173 = getelementptr inbounds float* %tmp2172, i64 1
+ %tmp2174 = getelementptr inbounds float* %tmp2173, i64 1
+ %tmp2175 = getelementptr inbounds float* %tmp2174, i64 1
+ %tmp2176 = getelementptr inbounds float* %tmp2175, i64 1
+ %tmp2177 = getelementptr inbounds float* %tmp2176, i64 1
+ %tmp2178 = getelementptr inbounds float* %tmp2177, i64 1
+ %tmp2179 = getelementptr inbounds float* %tmp2178, i64 1
+ %tmp2180 = getelementptr inbounds float* %tmp2179, i64 1
+ %tmp2181 = getelementptr inbounds float* %tmp2180, i64 1
+ %tmp2182 = getelementptr inbounds float* %tmp2181, i64 1
+ %tmp2183 = getelementptr inbounds float* %tmp2182, i64 1
+ %tmp2184 = getelementptr inbounds float* %tmp2183, i64 1
+ %tmp2185 = getelementptr inbounds float* %tmp2184, i64 1
+ %tmp2186 = getelementptr inbounds float* %tmp2185, i64 1
+ %tmp2187 = getelementptr inbounds float* %tmp2186, i64 1
+ %tmp2188 = getelementptr inbounds float* %tmp2187, i64 1
+ %tmp2189 = getelementptr inbounds float* %tmp2188, i64 1
+ %tmp2190 = getelementptr inbounds float* %tmp2189, i64 1
+ %tmp2191 = getelementptr inbounds float* %tmp2190, i64 1
+ %tmp2192 = getelementptr inbounds float* %tmp2191, i64 1
+ %tmp2193 = getelementptr inbounds float* %tmp2192, i64 1
+ %tmp2194 = getelementptr inbounds float* %tmp2193, i64 1
+ %tmp2195 = getelementptr inbounds float* %tmp2194, i64 1
+ %tmp2196 = getelementptr inbounds float* %tmp2195, i64 1
+ %tmp2197 = getelementptr inbounds float* %tmp2196, i64 1
+ %tmp2198 = getelementptr inbounds float* %tmp2197, i64 1
+ %tmp2199 = getelementptr inbounds float* %tmp2198, i64 1
+ %tmp2200 = getelementptr inbounds float* %tmp2199, i64 1
+ %tmp2201 = getelementptr inbounds float* %tmp2200, i64 1
+ %tmp2202 = getelementptr inbounds float* %tmp2201, i64 1
+ %tmp2203 = getelementptr inbounds float* %tmp2202, i64 1
+ %tmp2204 = getelementptr inbounds float* %tmp2203, i64 1
+ %tmp2205 = getelementptr inbounds float* %tmp2204, i64 1
+ %tmp2206 = getelementptr inbounds float* %tmp2205, i64 1
+ %tmp2207 = getelementptr inbounds float* %tmp2206, i64 1
+ %tmp2208 = getelementptr inbounds float* %tmp2207, i64 1
+ %tmp2209 = getelementptr inbounds float* %tmp2208, i64 1
+ %tmp2210 = getelementptr inbounds float* %tmp2209, i64 1
+ %tmp2211 = getelementptr inbounds float* %tmp2210, i64 1
+ %tmp2212 = getelementptr inbounds float* %tmp2211, i64 1
+ %tmp2213 = getelementptr inbounds float* %tmp2212, i64 1
+ %tmp2214 = getelementptr inbounds float* %tmp2213, i64 1
+ %tmp2215 = getelementptr inbounds float* %tmp2214, i64 1
+ %tmp2216 = getelementptr inbounds float* %tmp2215, i64 1
+ %tmp2217 = getelementptr inbounds float* %tmp2216, i64 1
+ %tmp2218 = getelementptr inbounds float* %tmp2217, i64 1
+ %tmp2219 = getelementptr inbounds float* %tmp2218, i64 1
+ %tmp2220 = getelementptr inbounds float* %tmp2219, i64 1
+ %tmp2221 = getelementptr inbounds float* %tmp2220, i64 1
+ %tmp2222 = getelementptr inbounds float* %tmp2221, i64 1
+ %tmp2223 = getelementptr inbounds float* %tmp2222, i64 1
+ %tmp2224 = getelementptr inbounds float* %tmp2223, i64 1
+ %tmp2225 = getelementptr inbounds float* %tmp2224, i64 1
+ %tmp2226 = getelementptr inbounds float* %tmp2225, i64 1
+ %tmp2227 = getelementptr inbounds float* %tmp2226, i64 1
+ %tmp2228 = getelementptr inbounds float* %tmp2227, i64 1
+ %tmp2229 = getelementptr inbounds float* %tmp2228, i64 1
+ %tmp2230 = getelementptr inbounds float* %tmp2229, i64 1
+ %tmp2231 = getelementptr inbounds float* %tmp2230, i64 1
+ %tmp2232 = getelementptr inbounds float* %tmp2231, i64 1
+ %tmp2233 = getelementptr inbounds float* %tmp2232, i64 1
+ %tmp2234 = getelementptr inbounds float* %tmp2233, i64 1
+ %tmp2235 = getelementptr inbounds float* %tmp2234, i64 1
+ %tmp2236 = getelementptr inbounds float* %tmp2235, i64 1
+ %tmp2237 = getelementptr inbounds float* %tmp2236, i64 1
+ %tmp2238 = getelementptr inbounds float* %tmp2237, i64 1
+ %tmp2239 = getelementptr inbounds float* %tmp2238, i64 1
+ %tmp2240 = getelementptr inbounds float* %tmp2239, i64 1
+ %tmp2241 = getelementptr inbounds float* %tmp2240, i64 1
+ %tmp2242 = getelementptr inbounds float* %tmp2241, i64 1
+ %tmp2243 = getelementptr inbounds float* %tmp2242, i64 1
+ %tmp2244 = getelementptr inbounds float* %tmp2243, i64 1
+ %tmp2245 = getelementptr inbounds float* %tmp2244, i64 1
+ %tmp2246 = getelementptr inbounds float* %tmp2245, i64 1
+ %tmp2247 = getelementptr inbounds float* %tmp2246, i64 1
+ %tmp2248 = getelementptr inbounds float* %tmp2247, i64 1
+ %tmp2249 = getelementptr inbounds float* %tmp2248, i64 1
+ %tmp2250 = getelementptr inbounds float* %tmp2249, i64 1
+ %tmp2251 = getelementptr inbounds float* %tmp2250, i64 1
+ %tmp2252 = getelementptr inbounds float* %tmp2251, i64 1
+ %tmp2253 = getelementptr inbounds float* %tmp2252, i64 1
+ %tmp2254 = getelementptr inbounds float* %tmp2253, i64 1
+ %tmp2255 = getelementptr inbounds float* %tmp2254, i64 1
+ %tmp2256 = getelementptr inbounds float* %tmp2255, i64 1
+ %tmp2257 = getelementptr inbounds float* %tmp2256, i64 1
+ %tmp2258 = getelementptr inbounds float* %tmp2257, i64 1
+ %tmp2259 = getelementptr inbounds float* %tmp2258, i64 1
+ %tmp2260 = getelementptr inbounds float* %tmp2259, i64 1
+ %tmp2261 = getelementptr inbounds float* %tmp2260, i64 1
+ %tmp2262 = getelementptr inbounds float* %tmp2261, i64 1
+ %tmp2263 = getelementptr inbounds float* %tmp2262, i64 1
+ %tmp2264 = getelementptr inbounds float* %tmp2263, i64 1
+ %tmp2265 = getelementptr inbounds float* %tmp2264, i64 1
+ %tmp2266 = getelementptr inbounds float* %tmp2265, i64 1
+ %tmp2267 = getelementptr inbounds float* %tmp2266, i64 1
+ %tmp2268 = getelementptr inbounds float* %tmp2267, i64 1
+ %tmp2269 = getelementptr inbounds float* %tmp2268, i64 1
+ %tmp2270 = getelementptr inbounds float* %tmp2269, i64 1
+ %tmp2271 = getelementptr inbounds float* %tmp2270, i64 1
+ %tmp2272 = getelementptr inbounds float* %tmp2271, i64 1
+ %tmp2273 = getelementptr inbounds float* %tmp2272, i64 1
+ %tmp2274 = getelementptr inbounds float* %tmp2273, i64 1
+ %tmp2275 = getelementptr inbounds float* %tmp2274, i64 1
+ %tmp2276 = getelementptr inbounds float* %tmp2275, i64 1
+ %tmp2277 = getelementptr inbounds float* %tmp2276, i64 1
+ %tmp2278 = getelementptr inbounds float* %tmp2277, i64 1
+ %tmp2279 = getelementptr inbounds float* %tmp2278, i64 1
+ %tmp2280 = getelementptr inbounds float* %tmp2279, i64 1
+ %tmp2281 = getelementptr inbounds float* %tmp2280, i64 1
+ %tmp2282 = getelementptr inbounds float* %tmp2281, i64 1
+ %tmp2283 = getelementptr inbounds float* %tmp2282, i64 1
+ %tmp2284 = getelementptr inbounds float* %tmp2283, i64 1
+ %tmp2285 = getelementptr inbounds float* %tmp2284, i64 1
+ %tmp2286 = getelementptr inbounds float* %tmp2285, i64 1
+ %tmp2287 = getelementptr inbounds float* %tmp2286, i64 1
+ %tmp2288 = getelementptr inbounds float* %tmp2287, i64 1
+ %tmp2289 = getelementptr inbounds float* %tmp2288, i64 1
+ %tmp2290 = getelementptr inbounds float* %tmp2289, i64 1
+ %tmp2291 = getelementptr inbounds float* %tmp2290, i64 1
+ %tmp2292 = getelementptr inbounds float* %tmp2291, i64 1
+ %tmp2293 = getelementptr inbounds float* %tmp2292, i64 1
+ %tmp2294 = getelementptr inbounds float* %tmp2293, i64 1
+ %tmp2295 = getelementptr inbounds float* %tmp2294, i64 1
+ %tmp2296 = getelementptr inbounds float* %tmp2295, i64 1
+ %tmp2297 = getelementptr inbounds float* %tmp2296, i64 1
+ %tmp2298 = getelementptr inbounds float* %tmp2297, i64 1
+ %tmp2299 = getelementptr inbounds float* %tmp2298, i64 1
+ %tmp2300 = getelementptr inbounds float* %tmp2299, i64 1
+ %tmp2301 = getelementptr inbounds float* %tmp2300, i64 1
+ %tmp2302 = getelementptr inbounds float* %tmp2301, i64 1
+ %tmp2303 = getelementptr inbounds float* %tmp2302, i64 1
+ %tmp2304 = getelementptr inbounds float* %tmp2303, i64 1
+ %tmp2305 = getelementptr inbounds float* %tmp2304, i64 1
+ %tmp2306 = getelementptr inbounds float* %tmp2305, i64 1
+ %tmp2307 = getelementptr inbounds float* %tmp2306, i64 1
+ %tmp2308 = getelementptr inbounds float* %tmp2307, i64 1
+ %tmp2309 = getelementptr inbounds float* %tmp2308, i64 1
+ %tmp2310 = getelementptr inbounds float* %tmp2309, i64 1
+ %tmp2311 = getelementptr inbounds float* %tmp2310, i64 1
+ %tmp2312 = getelementptr inbounds float* %tmp2311, i64 1
+ %tmp2313 = getelementptr inbounds float* %tmp2312, i64 1
+ %tmp2314 = getelementptr inbounds float* %tmp2313, i64 1
+ %tmp2315 = getelementptr inbounds float* %tmp2314, i64 1
+ %tmp2316 = getelementptr inbounds float* %tmp2315, i64 1
+ %tmp2317 = getelementptr inbounds float* %tmp2316, i64 1
+ %tmp2318 = getelementptr inbounds float* %tmp2317, i64 1
+ %tmp2319 = getelementptr inbounds float* %tmp2318, i64 1
+ %tmp2320 = getelementptr inbounds float* %tmp2319, i64 1
+ %tmp2321 = getelementptr inbounds float* %tmp2320, i64 1
+ %tmp2322 = getelementptr inbounds float* %tmp2321, i64 1
+ %tmp2323 = getelementptr inbounds float* %tmp2322, i64 1
+ %tmp2324 = getelementptr inbounds float* %tmp2323, i64 1
+ %tmp2325 = getelementptr inbounds float* %tmp2324, i64 1
+ %tmp2326 = getelementptr inbounds float* %tmp2325, i64 1
+ %tmp2327 = getelementptr inbounds float* %tmp2326, i64 1
+ %tmp2328 = getelementptr inbounds float* %tmp2327, i64 1
+ %tmp2329 = getelementptr inbounds float* %tmp2328, i64 1
+ %tmp2330 = getelementptr inbounds float* %tmp2329, i64 1
+ %tmp2331 = getelementptr inbounds float* %tmp2330, i64 1
+ %tmp2332 = getelementptr inbounds float* %tmp2331, i64 1
+ %tmp2333 = getelementptr inbounds float* %tmp2332, i64 1
+ %tmp2334 = getelementptr inbounds float* %tmp2333, i64 1
+ %tmp2335 = getelementptr inbounds float* %tmp2334, i64 1
+ %tmp2336 = getelementptr inbounds float* %tmp2335, i64 1
+ %tmp2337 = getelementptr inbounds float* %tmp2336, i64 1
+ %tmp2338 = getelementptr inbounds float* %tmp2337, i64 1
+ %tmp2339 = getelementptr inbounds float* %tmp2338, i64 1
+ %tmp2340 = getelementptr inbounds float* %tmp2339, i64 1
+ %tmp2341 = getelementptr inbounds float* %tmp2340, i64 1
+ %tmp2342 = getelementptr inbounds float* %tmp2341, i64 1
+ %tmp2343 = getelementptr inbounds float* %tmp2342, i64 1
+ %tmp2344 = getelementptr inbounds float* %tmp2343, i64 1
+ %tmp2345 = getelementptr inbounds float* %tmp2344, i64 1
+ %tmp2346 = getelementptr inbounds float* %tmp2345, i64 1
+ %tmp2347 = getelementptr inbounds float* %tmp2346, i64 1
+ %tmp2348 = getelementptr inbounds float* %tmp2347, i64 1
+ %tmp2349 = getelementptr inbounds float* %tmp2348, i64 1
+ %tmp2350 = getelementptr inbounds float* %tmp2349, i64 1
+ %tmp2351 = getelementptr inbounds float* %tmp2350, i64 1
+ %tmp2352 = getelementptr inbounds float* %tmp2351, i64 1
+ %tmp2353 = getelementptr inbounds float* %tmp2352, i64 1
+ %tmp2354 = getelementptr inbounds float* %tmp2353, i64 1
+ %tmp2355 = getelementptr inbounds float* %tmp2354, i64 1
+ %tmp2356 = getelementptr inbounds float* %tmp2355, i64 1
+ %tmp2357 = getelementptr inbounds float* %tmp2356, i64 1
+ %tmp2358 = getelementptr inbounds float* %tmp2357, i64 1
+ %tmp2359 = getelementptr inbounds float* %tmp2358, i64 1
+ %tmp2360 = getelementptr inbounds float* %tmp2359, i64 1
+ %tmp2361 = getelementptr inbounds float* %tmp2360, i64 1
+ %tmp2362 = getelementptr inbounds float* %tmp2361, i64 1
+ %tmp2363 = getelementptr inbounds float* %tmp2362, i64 1
+ %tmp2364 = getelementptr inbounds float* %tmp2363, i64 1
+ %tmp2365 = getelementptr inbounds float* %tmp2364, i64 1
+ %tmp2366 = getelementptr inbounds float* %tmp2365, i64 1
+ %tmp2367 = getelementptr inbounds float* %tmp2366, i64 1
+ %tmp2368 = getelementptr inbounds float* %tmp2367, i64 1
+ %tmp2369 = getelementptr inbounds float* %tmp2368, i64 1
+ %tmp2370 = getelementptr inbounds float* %tmp2369, i64 1
+ %tmp2371 = getelementptr inbounds float* %tmp2370, i64 1
+ %tmp2372 = getelementptr inbounds float* %tmp2371, i64 1
+ %tmp2373 = getelementptr inbounds float* %tmp2372, i64 1
+ %tmp2374 = getelementptr inbounds float* %tmp2373, i64 1
+ %tmp2375 = getelementptr inbounds float* %tmp2374, i64 1
+ %tmp2376 = getelementptr inbounds float* %tmp2375, i64 1
+ %tmp2377 = getelementptr inbounds float* %tmp2376, i64 1
+ %tmp2378 = getelementptr inbounds float* %tmp2377, i64 1
+ %tmp2379 = getelementptr inbounds float* %tmp2378, i64 1
+ %tmp2380 = getelementptr inbounds float* %tmp2379, i64 1
+ %tmp2381 = getelementptr inbounds float* %tmp2380, i64 1
+ %tmp2382 = getelementptr inbounds float* %tmp2381, i64 1
+ %tmp2383 = getelementptr inbounds float* %tmp2382, i64 1
+ %tmp2384 = getelementptr inbounds float* %tmp2383, i64 1
+ %tmp2385 = getelementptr inbounds float* %tmp2384, i64 1
+ %tmp2386 = getelementptr inbounds float* %tmp2385, i64 1
+ %tmp2387 = getelementptr inbounds float* %tmp2386, i64 1
+ %tmp2388 = getelementptr inbounds float* %tmp2387, i64 1
+ %tmp2389 = getelementptr inbounds float* %tmp2388, i64 1
+ %tmp2390 = getelementptr inbounds float* %tmp2389, i64 1
+ %tmp2391 = getelementptr inbounds float* %tmp2390, i64 1
+ %tmp2392 = getelementptr inbounds float* %tmp2391, i64 1
+ %tmp2393 = getelementptr inbounds float* %tmp2392, i64 1
+ %tmp2394 = getelementptr inbounds float* %tmp2393, i64 1
+ %tmp2395 = getelementptr inbounds float* %tmp2394, i64 1
+ %tmp2396 = getelementptr inbounds float* %tmp2395, i64 1
+ %tmp2397 = getelementptr inbounds float* %tmp2396, i64 1
+ %tmp2398 = getelementptr inbounds float* %tmp2397, i64 1
+ %tmp2399 = getelementptr inbounds float* %tmp2398, i64 1
+ %tmp2400 = getelementptr inbounds float* %tmp2399, i64 1
+ %tmp2401 = getelementptr inbounds float* %tmp2400, i64 1
+ %tmp2402 = getelementptr inbounds float* %tmp2401, i64 1
+ %tmp2403 = getelementptr inbounds float* %tmp2402, i64 1
+ %tmp2404 = getelementptr inbounds float* %tmp2403, i64 1
+ %tmp2405 = getelementptr inbounds float* %tmp2404, i64 1
+ %tmp2406 = getelementptr inbounds float* %tmp2405, i64 1
+ %tmp2407 = getelementptr inbounds float* %tmp2406, i64 1
+ %tmp2408 = getelementptr inbounds float* %tmp2407, i64 1
+ %tmp2409 = getelementptr inbounds float* %tmp2408, i64 1
+ %tmp2410 = getelementptr inbounds float* %tmp2409, i64 1
+ %tmp2411 = getelementptr inbounds float* %tmp2410, i64 1
+ %tmp2412 = getelementptr inbounds float* %tmp2411, i64 1
+ %tmp2413 = getelementptr inbounds float* %tmp2412, i64 1
+ %tmp2414 = getelementptr inbounds float* %tmp2413, i64 1
+ %tmp2415 = getelementptr inbounds float* %tmp2414, i64 1
+ %tmp2416 = getelementptr inbounds float* %tmp2415, i64 1
+ %tmp2417 = getelementptr inbounds float* %tmp2416, i64 1
+ %tmp2418 = getelementptr inbounds float* %tmp2417, i64 1
+ %tmp2419 = getelementptr inbounds float* %tmp2418, i64 1
+ %tmp2420 = getelementptr inbounds float* %tmp2419, i64 1
+ %tmp2421 = getelementptr inbounds float* %tmp2420, i64 1
+ %tmp2422 = getelementptr inbounds float* %tmp2421, i64 1
+ %tmp2423 = getelementptr inbounds float* %tmp2422, i64 1
+ %tmp2424 = getelementptr inbounds float* %tmp2423, i64 1
+ %tmp2425 = getelementptr inbounds float* %tmp2424, i64 1
+ %tmp2426 = getelementptr inbounds float* %tmp2425, i64 1
+ %tmp2427 = getelementptr inbounds float* %tmp2426, i64 1
+ %tmp2428 = getelementptr inbounds float* %tmp2427, i64 1
+ %tmp2429 = getelementptr inbounds float* %tmp2428, i64 1
+ %tmp2430 = getelementptr inbounds float* %tmp2429, i64 1
+ %tmp2431 = getelementptr inbounds float* %tmp2430, i64 1
+ %tmp2432 = getelementptr inbounds float* %tmp2431, i64 1
+ %tmp2433 = getelementptr inbounds float* %tmp2432, i64 1
+ %tmp2434 = getelementptr inbounds float* %tmp2433, i64 1
+ %tmp2435 = getelementptr inbounds float* %tmp2434, i64 1
+ %tmp2436 = getelementptr inbounds float* %tmp2435, i64 1
+ %tmp2437 = getelementptr inbounds float* %tmp2436, i64 1
+ %tmp2438 = getelementptr inbounds float* %tmp2437, i64 1
+ %tmp2439 = getelementptr inbounds float* %tmp2438, i64 1
+ %tmp2440 = getelementptr inbounds float* %tmp2439, i64 1
+ %tmp2441 = getelementptr inbounds float* %tmp2440, i64 1
+ %tmp2442 = getelementptr inbounds float* %tmp2441, i64 1
+ %tmp2443 = getelementptr inbounds float* %tmp2442, i64 1
+ %tmp2444 = getelementptr inbounds float* %tmp2443, i64 1
+ %tmp2445 = getelementptr inbounds float* %tmp2444, i64 1
+ %tmp2446 = getelementptr inbounds float* %tmp2445, i64 1
+ %tmp2447 = getelementptr inbounds float* %tmp2446, i64 1
+ %tmp2448 = getelementptr inbounds float* %tmp2447, i64 1
+ %tmp2449 = getelementptr inbounds float* %tmp2448, i64 1
+ %tmp2450 = getelementptr inbounds float* %tmp2449, i64 1
+ %tmp2451 = getelementptr inbounds float* %tmp2450, i64 1
+ %tmp2452 = getelementptr inbounds float* %tmp2451, i64 1
+ %tmp2453 = getelementptr inbounds float* %tmp2452, i64 1
+ %tmp2454 = getelementptr inbounds float* %tmp2453, i64 1
+ %tmp2455 = getelementptr inbounds float* %tmp2454, i64 1
+ %tmp2456 = getelementptr inbounds float* %tmp2455, i64 1
+ %tmp2457 = getelementptr inbounds float* %tmp2456, i64 1
+ %tmp2458 = getelementptr inbounds float* %tmp2457, i64 1
+ %tmp2459 = getelementptr inbounds float* %tmp2458, i64 1
+ %tmp2460 = getelementptr inbounds float* %tmp2459, i64 1
+ %tmp2461 = getelementptr inbounds float* %tmp2460, i64 1
+ %tmp2462 = getelementptr inbounds float* %tmp2461, i64 1
+ %tmp2463 = getelementptr inbounds float* %tmp2462, i64 1
+ %tmp2464 = getelementptr inbounds float* %tmp2463, i64 1
+ %tmp2465 = getelementptr inbounds float* %tmp2464, i64 1
+ %tmp2466 = getelementptr inbounds float* %tmp2465, i64 1
+ %tmp2467 = getelementptr inbounds float* %tmp2466, i64 1
+ %tmp2468 = getelementptr inbounds float* %tmp2467, i64 1
+ %tmp2469 = getelementptr inbounds float* %tmp2468, i64 1
+ %tmp2470 = getelementptr inbounds float* %tmp2469, i64 1
+ %tmp2471 = getelementptr inbounds float* %tmp2470, i64 1
+ %tmp2472 = getelementptr inbounds float* %tmp2471, i64 1
+ %tmp2473 = getelementptr inbounds float* %tmp2472, i64 1
+ %tmp2474 = getelementptr inbounds float* %tmp2473, i64 1
+ %tmp2475 = getelementptr inbounds float* %tmp2474, i64 1
+ %tmp2476 = getelementptr inbounds float* %tmp2475, i64 1
+ %tmp2477 = getelementptr inbounds float* %tmp2476, i64 1
+ %tmp2478 = getelementptr inbounds float* %tmp2477, i64 1
+ %tmp2479 = getelementptr inbounds float* %tmp2478, i64 1
+ %tmp2480 = getelementptr inbounds float* %tmp2479, i64 1
+ %tmp2481 = getelementptr inbounds float* %tmp2480, i64 1
+ %tmp2482 = getelementptr inbounds float* %tmp2481, i64 1
+ %tmp2483 = getelementptr inbounds float* %tmp2482, i64 1
+ %tmp2484 = getelementptr inbounds float* %tmp2483, i64 1
+ %tmp2485 = getelementptr inbounds float* %tmp2484, i64 1
+ %tmp2486 = getelementptr inbounds float* %tmp2485, i64 1
+ %tmp2487 = getelementptr inbounds float* %tmp2486, i64 1
+ %tmp2488 = getelementptr inbounds float* %tmp2487, i64 1
+ %tmp2489 = getelementptr inbounds float* %tmp2488, i64 1
+ %tmp2490 = getelementptr inbounds float* %tmp2489, i64 1
+ %tmp2491 = getelementptr inbounds float* %tmp2490, i64 1
+ %tmp2492 = getelementptr inbounds float* %tmp2491, i64 1
+ %tmp2493 = getelementptr inbounds float* %tmp2492, i64 1
+ %tmp2494 = getelementptr inbounds float* %tmp2493, i64 1
+ %tmp2495 = getelementptr inbounds float* %tmp2494, i64 1
+ %tmp2496 = getelementptr inbounds float* %tmp2495, i64 1
+ %tmp2497 = getelementptr inbounds float* %tmp2496, i64 1
+ %tmp2498 = getelementptr inbounds float* %tmp2497, i64 1
+ %tmp2499 = getelementptr inbounds float* %tmp2498, i64 1
+ %tmp2500 = getelementptr inbounds float* %tmp2499, i64 1
+ %tmp2501 = getelementptr inbounds float* %tmp2500, i64 1
+ %tmp2502 = getelementptr inbounds float* %tmp2501, i64 1
+ %tmp2503 = getelementptr inbounds float* %tmp2502, i64 1
+ %tmp2504 = getelementptr inbounds float* %tmp2503, i64 1
+ %tmp2505 = getelementptr inbounds float* %tmp2504, i64 1
+ %tmp2506 = getelementptr inbounds float* %tmp2505, i64 1
+ %tmp2507 = getelementptr inbounds float* %tmp2506, i64 1
+ %tmp2508 = getelementptr inbounds float* %tmp2507, i64 1
+ %tmp2509 = getelementptr inbounds float* %tmp2508, i64 1
+ %tmp2510 = getelementptr inbounds float* %tmp2509, i64 1
+ %tmp2511 = getelementptr inbounds float* %tmp2510, i64 1
+ %tmp2512 = getelementptr inbounds float* %tmp2511, i64 1
+ %tmp2513 = getelementptr inbounds float* %tmp2512, i64 1
+ %tmp2514 = getelementptr inbounds float* %tmp2513, i64 1
+ %tmp2515 = getelementptr inbounds float* %tmp2514, i64 1
+ %tmp2516 = getelementptr inbounds float* %tmp2515, i64 1
+ %tmp2517 = getelementptr inbounds float* %tmp2516, i64 1
+ %tmp2518 = getelementptr inbounds float* %tmp2517, i64 1
+ %tmp2519 = getelementptr inbounds float* %tmp2518, i64 1
+ %tmp2520 = getelementptr inbounds float* %tmp2519, i64 1
+ %tmp2521 = getelementptr inbounds float* %tmp2520, i64 1
+ %tmp2522 = getelementptr inbounds float* %tmp2521, i64 1
+ %tmp2523 = getelementptr inbounds float* %tmp2522, i64 1
+ %tmp2524 = getelementptr inbounds float* %tmp2523, i64 1
+ %tmp2525 = getelementptr inbounds float* %tmp2524, i64 1
+ %tmp2526 = getelementptr inbounds float* %tmp2525, i64 1
+ %tmp2527 = getelementptr inbounds float* %tmp2526, i64 1
+ %tmp2528 = getelementptr inbounds float* %tmp2527, i64 1
+ %tmp2529 = getelementptr inbounds float* %tmp2528, i64 1
+ %tmp2530 = getelementptr inbounds float* %tmp2529, i64 1
+ %tmp2531 = getelementptr inbounds float* %tmp2530, i64 1
+ %tmp2532 = getelementptr inbounds float* %tmp2531, i64 1
+ %tmp2533 = getelementptr inbounds float* %tmp2532, i64 1
+ %tmp2534 = getelementptr inbounds float* %tmp2533, i64 1
+ %tmp2535 = getelementptr inbounds float* %tmp2534, i64 1
+ %tmp2536 = getelementptr inbounds float* %tmp2535, i64 1
+ %tmp2537 = getelementptr inbounds float* %tmp2536, i64 1
+ %tmp2538 = getelementptr inbounds float* %tmp2537, i64 1
+ %tmp2539 = getelementptr inbounds float* %tmp2538, i64 1
+ %tmp2540 = getelementptr inbounds float* %tmp2539, i64 1
+ %tmp2541 = getelementptr inbounds float* %tmp2540, i64 1
+ %tmp2542 = getelementptr inbounds float* %tmp2541, i64 1
+ %tmp2543 = getelementptr inbounds float* %tmp2542, i64 1
+ %tmp2544 = getelementptr inbounds float* %tmp2543, i64 1
+ %tmp2545 = getelementptr inbounds float* %tmp2544, i64 1
+ %tmp2546 = getelementptr inbounds float* %tmp2545, i64 1
+ %tmp2547 = getelementptr inbounds float* %tmp2546, i64 1
+ %tmp2548 = getelementptr inbounds float* %tmp2547, i64 1
+ %tmp2549 = getelementptr inbounds float* %tmp2548, i64 1
+ %tmp2550 = getelementptr inbounds float* %tmp2549, i64 1
+ %tmp2551 = getelementptr inbounds float* %tmp2550, i64 1
+ %tmp2552 = getelementptr inbounds float* %tmp2551, i64 1
+ %tmp2553 = getelementptr inbounds float* %tmp2552, i64 1
+ %tmp2554 = getelementptr inbounds float* %tmp2553, i64 1
+ %tmp2555 = getelementptr inbounds float* %tmp2554, i64 1
+ %tmp2556 = getelementptr inbounds float* %tmp2555, i64 1
+ %tmp2557 = getelementptr inbounds float* %tmp2556, i64 1
+ %tmp2558 = getelementptr inbounds float* %tmp2557, i64 1
+ %tmp2559 = getelementptr inbounds float* %tmp2558, i64 1
+ %tmp2560 = getelementptr inbounds float* %tmp2559, i64 1
+ %tmp2561 = getelementptr inbounds float* %tmp2560, i64 1
+ %tmp2562 = getelementptr inbounds float* %tmp2561, i64 1
+ %tmp2563 = getelementptr inbounds float* %tmp2562, i64 1
+ %tmp2564 = getelementptr inbounds float* %tmp2563, i64 1
+ %tmp2565 = getelementptr inbounds float* %tmp2564, i64 1
+ %tmp2566 = getelementptr inbounds float* %tmp2565, i64 1
+ %tmp2567 = getelementptr inbounds float* %tmp2566, i64 1
+ %tmp2568 = getelementptr inbounds float* %tmp2567, i64 1
+ %tmp2569 = getelementptr inbounds float* %tmp2568, i64 1
+ %tmp2570 = getelementptr inbounds float* %tmp2569, i64 1
+ %tmp2571 = getelementptr inbounds float* %tmp2570, i64 1
+ %tmp2572 = getelementptr inbounds float* %tmp2571, i64 1
+ %tmp2573 = getelementptr inbounds float* %tmp2572, i64 1
+ %tmp2574 = getelementptr inbounds float* %tmp2573, i64 1
+ %tmp2575 = getelementptr inbounds float* %tmp2574, i64 1
+ %tmp2576 = getelementptr inbounds float* %tmp2575, i64 1
+ %tmp2577 = getelementptr inbounds float* %tmp2576, i64 1
+ %tmp2578 = getelementptr inbounds float* %tmp2577, i64 1
+ %tmp2579 = getelementptr inbounds float* %tmp2578, i64 1
+ %tmp2580 = getelementptr inbounds float* %tmp2579, i64 1
+ %tmp2581 = getelementptr inbounds float* %tmp2580, i64 1
+ %tmp2582 = getelementptr inbounds float* %tmp2581, i64 1
+ %tmp2583 = getelementptr inbounds float* %tmp2582, i64 1
+ %tmp2584 = getelementptr inbounds float* %tmp2583, i64 1
+ %tmp2585 = getelementptr inbounds float* %tmp2584, i64 1
+ %tmp2586 = getelementptr inbounds float* %tmp2585, i64 1
+ %tmp2587 = getelementptr inbounds float* %tmp2586, i64 1
+ %tmp2588 = getelementptr inbounds float* %tmp2587, i64 1
+ %tmp2589 = getelementptr inbounds float* %tmp2588, i64 1
+ %tmp2590 = getelementptr inbounds float* %tmp2589, i64 1
+ %tmp2591 = getelementptr inbounds float* %tmp2590, i64 1
+ %tmp2592 = getelementptr inbounds float* %tmp2591, i64 1
+ %tmp2593 = getelementptr inbounds float* %tmp2592, i64 1
+ %tmp2594 = getelementptr inbounds float* %tmp2593, i64 1
+ %tmp2595 = getelementptr inbounds float* %tmp2594, i64 1
+ %tmp2596 = getelementptr inbounds float* %tmp2595, i64 1
+ %tmp2597 = getelementptr inbounds float* %tmp2596, i64 1
+ %tmp2598 = getelementptr inbounds float* %tmp2597, i64 1
+ %tmp2599 = getelementptr inbounds float* %tmp2598, i64 1
+ %tmp2600 = getelementptr inbounds float* %tmp2599, i64 1
+ %tmp2601 = getelementptr inbounds float* %tmp2600, i64 1
+ %tmp2602 = getelementptr inbounds float* %tmp2601, i64 1
+ %tmp2603 = getelementptr inbounds float* %tmp2602, i64 1
+ %tmp2604 = getelementptr inbounds float* %tmp2603, i64 1
+ %tmp2605 = getelementptr inbounds float* %tmp2604, i64 1
+ %tmp2606 = getelementptr inbounds float* %tmp2605, i64 1
+ %tmp2607 = getelementptr inbounds float* %tmp2606, i64 1
+ %tmp2608 = getelementptr inbounds float* %tmp2607, i64 1
+ %tmp2609 = getelementptr inbounds float* %tmp2608, i64 1
+ %tmp2610 = getelementptr inbounds float* %tmp2609, i64 1
+ %tmp2611 = getelementptr inbounds float* %tmp2610, i64 1
+ %tmp2612 = getelementptr inbounds float* %tmp2611, i64 1
+ %tmp2613 = getelementptr inbounds float* %tmp2612, i64 1
+ %tmp2614 = getelementptr inbounds float* %tmp2613, i64 1
+ %tmp2615 = getelementptr inbounds float* %tmp2614, i64 1
+ %tmp2616 = getelementptr inbounds float* %tmp2615, i64 1
+ %tmp2617 = getelementptr inbounds float* %tmp2616, i64 1
+ %tmp2618 = getelementptr inbounds float* %tmp2617, i64 1
+ %tmp2619 = getelementptr inbounds float* %tmp2618, i64 1
+ %tmp2620 = getelementptr inbounds float* %tmp2619, i64 1
+ %tmp2621 = getelementptr inbounds float* %tmp2620, i64 1
+ %tmp2622 = getelementptr inbounds float* %tmp2621, i64 1
+ %tmp2623 = getelementptr inbounds float* %tmp2622, i64 1
+ %tmp2624 = getelementptr inbounds float* %tmp2623, i64 1
+ %tmp2625 = getelementptr inbounds float* %tmp2624, i64 1
+ %tmp2626 = getelementptr inbounds float* %tmp2625, i64 1
+ %tmp2627 = getelementptr inbounds float* %tmp2626, i64 1
+ %tmp2628 = getelementptr inbounds float* %tmp2627, i64 1
+ %tmp2629 = getelementptr inbounds float* %tmp2628, i64 1
+ %tmp2630 = getelementptr inbounds float* %tmp2629, i64 1
+ %tmp2631 = getelementptr inbounds float* %tmp2630, i64 1
+ %tmp2632 = getelementptr inbounds float* %tmp2631, i64 1
+ %tmp2633 = getelementptr inbounds float* %tmp2632, i64 1
+ %tmp2634 = getelementptr inbounds float* %tmp2633, i64 1
+ %tmp2635 = getelementptr inbounds float* %tmp2634, i64 1
+ %tmp2636 = getelementptr inbounds float* %tmp2635, i64 1
+ %tmp2637 = getelementptr inbounds float* %tmp2636, i64 1
+ %tmp2638 = getelementptr inbounds float* %tmp2637, i64 1
+ %tmp2639 = getelementptr inbounds float* %tmp2638, i64 1
+ %tmp2640 = getelementptr inbounds float* %tmp2639, i64 1
+ %tmp2641 = getelementptr inbounds float* %tmp2640, i64 1
+ %tmp2642 = getelementptr inbounds float* %tmp2641, i64 1
+ %tmp2643 = getelementptr inbounds float* %tmp2642, i64 1
+ %tmp2644 = getelementptr inbounds float* %tmp2643, i64 1
+ %tmp2645 = getelementptr inbounds float* %tmp2644, i64 1
+ %tmp2646 = getelementptr inbounds float* %tmp2645, i64 1
+ %tmp2647 = getelementptr inbounds float* %tmp2646, i64 1
+ %tmp2648 = getelementptr inbounds float* %tmp2647, i64 1
+ %tmp2649 = getelementptr inbounds float* %tmp2648, i64 1
+ %tmp2650 = getelementptr inbounds float* %tmp2649, i64 1
+ %tmp2651 = getelementptr inbounds float* %tmp2650, i64 1
+ %tmp2652 = getelementptr inbounds float* %tmp2651, i64 1
+ %tmp2653 = getelementptr inbounds float* %tmp2652, i64 1
+ %tmp2654 = getelementptr inbounds float* %tmp2653, i64 1
+ %tmp2655 = getelementptr inbounds float* %tmp2654, i64 1
+ %tmp2656 = getelementptr inbounds float* %tmp2655, i64 1
+ %tmp2657 = getelementptr inbounds float* %tmp2656, i64 1
+ %tmp2658 = getelementptr inbounds float* %tmp2657, i64 1
+ %tmp2659 = getelementptr inbounds float* %tmp2658, i64 1
+ %tmp2660 = getelementptr inbounds float* %tmp2659, i64 1
+ %tmp2661 = getelementptr inbounds float* %tmp2660, i64 1
+ %tmp2662 = getelementptr inbounds float* %tmp2661, i64 1
+ %tmp2663 = getelementptr inbounds float* %tmp2662, i64 1
+ %tmp2664 = getelementptr inbounds float* %tmp2663, i64 1
+ %tmp2665 = getelementptr inbounds float* %tmp2664, i64 1
+ %tmp2666 = getelementptr inbounds float* %tmp2665, i64 1
+ %tmp2667 = getelementptr inbounds float* %tmp2666, i64 1
+ %tmp2668 = getelementptr inbounds float* %tmp2667, i64 1
+ %tmp2669 = getelementptr inbounds float* %tmp2668, i64 1
+ %tmp2670 = getelementptr inbounds float* %tmp2669, i64 1
+ %tmp2671 = getelementptr inbounds float* %tmp2670, i64 1
+ %tmp2672 = getelementptr inbounds float* %tmp2671, i64 1
+ %tmp2673 = getelementptr inbounds float* %tmp2672, i64 1
+ %tmp2674 = getelementptr inbounds float* %tmp2673, i64 1
+ %tmp2675 = getelementptr inbounds float* %tmp2674, i64 1
+ %tmp2676 = getelementptr inbounds float* %tmp2675, i64 1
+ %tmp2677 = getelementptr inbounds float* %tmp2676, i64 1
+ %tmp2678 = getelementptr inbounds float* %tmp2677, i64 1
+ %tmp2679 = getelementptr inbounds float* %tmp2678, i64 1
+ %tmp2680 = getelementptr inbounds float* %tmp2679, i64 1
+ %tmp2681 = getelementptr inbounds float* %tmp2680, i64 1
+ %tmp2682 = getelementptr inbounds float* %tmp2681, i64 1
+ %tmp2683 = getelementptr inbounds float* %tmp2682, i64 1
+ %tmp2684 = getelementptr inbounds float* %tmp2683, i64 1
+ %tmp2685 = getelementptr inbounds float* %tmp2684, i64 1
+ %tmp2686 = getelementptr inbounds float* %tmp2685, i64 1
+ %tmp2687 = getelementptr inbounds float* %tmp2686, i64 1
+ %tmp2688 = getelementptr inbounds float* %tmp2687, i64 1
+ %tmp2689 = getelementptr inbounds float* %tmp2688, i64 1
+ %tmp2690 = getelementptr inbounds float* %tmp2689, i64 1
+ %tmp2691 = getelementptr inbounds float* %tmp2690, i64 1
+ %tmp2692 = getelementptr inbounds float* %tmp2691, i64 1
+ %tmp2693 = getelementptr inbounds float* %tmp2692, i64 1
+ %tmp2694 = getelementptr inbounds float* %tmp2693, i64 1
+ %tmp2695 = getelementptr inbounds float* %tmp2694, i64 1
+ %tmp2696 = getelementptr inbounds float* %tmp2695, i64 1
+ %tmp2697 = getelementptr inbounds float* %tmp2696, i64 1
+ %tmp2698 = getelementptr inbounds float* %tmp2697, i64 1
+ %tmp2699 = getelementptr inbounds float* %tmp2698, i64 1
+ %tmp2700 = getelementptr inbounds float* %tmp2699, i64 1
+ %tmp2701 = getelementptr inbounds float* %tmp2700, i64 1
+ %tmp2702 = getelementptr inbounds float* %tmp2701, i64 1
+ %tmp2703 = getelementptr inbounds float* %tmp2702, i64 1
+ %tmp2704 = getelementptr inbounds float* %tmp2703, i64 1
+ %tmp2705 = getelementptr inbounds float* %tmp2704, i64 1
+ %tmp2706 = getelementptr inbounds float* %tmp2705, i64 1
+ %tmp2707 = getelementptr inbounds float* %tmp2706, i64 1
+ %tmp2708 = getelementptr inbounds float* %tmp2707, i64 1
+ %tmp2709 = getelementptr inbounds float* %tmp2708, i64 1
+ %tmp2710 = getelementptr inbounds float* %tmp2709, i64 1
+ %tmp2711 = getelementptr inbounds float* %tmp2710, i64 1
+ %tmp2712 = getelementptr inbounds float* %tmp2711, i64 1
+ %tmp2713 = getelementptr inbounds float* %tmp2712, i64 1
+ %tmp2714 = getelementptr inbounds float* %tmp2713, i64 1
+ %tmp2715 = getelementptr inbounds float* %tmp2714, i64 1
+ %tmp2716 = getelementptr inbounds float* %tmp2715, i64 1
+ %tmp2717 = getelementptr inbounds float* %tmp2716, i64 1
+ %tmp2718 = getelementptr inbounds float* %tmp2717, i64 1
+ %tmp2719 = getelementptr inbounds float* %tmp2718, i64 1
+ %tmp2720 = getelementptr inbounds float* %tmp2719, i64 1
+ %tmp2721 = getelementptr inbounds float* %tmp2720, i64 1
+ %tmp2722 = getelementptr inbounds float* %tmp2721, i64 1
+ %tmp2723 = getelementptr inbounds float* %tmp2722, i64 1
+ %tmp2724 = getelementptr inbounds float* %tmp2723, i64 1
+ %tmp2725 = getelementptr inbounds float* %tmp2724, i64 1
+ %tmp2726 = getelementptr inbounds float* %tmp2725, i64 1
+ %tmp2727 = getelementptr inbounds float* %tmp2726, i64 1
+ %tmp2728 = getelementptr inbounds float* %tmp2727, i64 1
+ %tmp2729 = getelementptr inbounds float* %tmp2728, i64 1
+ %tmp2730 = getelementptr inbounds float* %tmp2729, i64 1
+ %tmp2731 = getelementptr inbounds float* %tmp2730, i64 1
+ %tmp2732 = getelementptr inbounds float* %tmp2731, i64 1
+ %tmp2733 = getelementptr inbounds float* %tmp2732, i64 1
+ %tmp2734 = getelementptr inbounds float* %tmp2733, i64 1
+ %tmp2735 = getelementptr inbounds float* %tmp2734, i64 1
+ %tmp2736 = getelementptr inbounds float* %tmp2735, i64 1
+ %tmp2737 = getelementptr inbounds float* %tmp2736, i64 1
+ %tmp2738 = getelementptr inbounds float* %tmp2737, i64 1
+ %tmp2739 = getelementptr inbounds float* %tmp2738, i64 1
+ %tmp2740 = getelementptr inbounds float* %tmp2739, i64 1
+ %tmp2741 = getelementptr inbounds float* %tmp2740, i64 1
+ %tmp2742 = getelementptr inbounds float* %tmp2741, i64 1
+ %tmp2743 = getelementptr inbounds float* %tmp2742, i64 1
+ %tmp2744 = getelementptr inbounds float* %tmp2743, i64 1
+ %tmp2745 = getelementptr inbounds float* %tmp2744, i64 1
+ %tmp2746 = getelementptr inbounds float* %tmp2745, i64 1
+ %tmp2747 = getelementptr inbounds float* %tmp2746, i64 1
+ %tmp2748 = getelementptr inbounds float* %tmp2747, i64 1
+ %tmp2749 = getelementptr inbounds float* %tmp2748, i64 1
+ %tmp2750 = getelementptr inbounds float* %tmp2749, i64 1
+ %tmp2751 = getelementptr inbounds float* %tmp2750, i64 1
+ %tmp2752 = getelementptr inbounds float* %tmp2751, i64 1
+ %tmp2753 = getelementptr inbounds float* %tmp2752, i64 1
+ %tmp2754 = getelementptr inbounds float* %tmp2753, i64 1
+ %tmp2755 = getelementptr inbounds float* %tmp2754, i64 1
+ %tmp2756 = getelementptr inbounds float* %tmp2755, i64 1
+ %tmp2757 = getelementptr inbounds float* %tmp2756, i64 1
+ %tmp2758 = getelementptr inbounds float* %tmp2757, i64 1
+ %tmp2759 = getelementptr inbounds float* %tmp2758, i64 1
+ %tmp2760 = getelementptr inbounds float* %tmp2759, i64 1
+ %tmp2761 = getelementptr inbounds float* %tmp2760, i64 1
+ %tmp2762 = getelementptr inbounds float* %tmp2761, i64 1
+ %tmp2763 = getelementptr inbounds float* %tmp2762, i64 1
+ %tmp2764 = getelementptr inbounds float* %tmp2763, i64 1
+ %tmp2765 = getelementptr inbounds float* %tmp2764, i64 1
+ %tmp2766 = getelementptr inbounds float* %tmp2765, i64 1
+ %tmp2767 = getelementptr inbounds float* %tmp2766, i64 1
+ %tmp2768 = getelementptr inbounds float* %tmp2767, i64 1
+ %tmp2769 = getelementptr inbounds float* %tmp2768, i64 1
+ %tmp2770 = getelementptr inbounds float* %tmp2769, i64 1
+ %tmp2771 = getelementptr inbounds float* %tmp2770, i64 1
+ %tmp2772 = getelementptr inbounds float* %tmp2771, i64 1
+ %tmp2773 = getelementptr inbounds float* %tmp2772, i64 1
+ %tmp2774 = getelementptr inbounds float* %tmp2773, i64 1
+ %tmp2775 = getelementptr inbounds float* %tmp2774, i64 1
+ %tmp2776 = getelementptr inbounds float* %tmp2775, i64 1
+ %tmp2777 = getelementptr inbounds float* %tmp2776, i64 1
+ %tmp2778 = getelementptr inbounds float* %tmp2777, i64 1
+ %tmp2779 = getelementptr inbounds float* %tmp2778, i64 1
+ %tmp2780 = getelementptr inbounds float* %tmp2779, i64 1
+ %tmp2781 = getelementptr inbounds float* %tmp2780, i64 1
+ %tmp2782 = getelementptr inbounds float* %tmp2781, i64 1
+ %tmp2783 = getelementptr inbounds float* %tmp2782, i64 1
+ %tmp2784 = getelementptr inbounds float* %tmp2783, i64 1
+ %tmp2785 = getelementptr inbounds float* %tmp2784, i64 1
+ %tmp2786 = getelementptr inbounds float* %tmp2785, i64 1
+ %tmp2787 = getelementptr inbounds float* %tmp2786, i64 1
+ %tmp2788 = getelementptr inbounds float* %tmp2787, i64 1
+ %tmp2789 = getelementptr inbounds float* %tmp2788, i64 1
+ %tmp2790 = getelementptr inbounds float* %tmp2789, i64 1
+ %tmp2791 = getelementptr inbounds float* %tmp2790, i64 1
+ %tmp2792 = getelementptr inbounds float* %tmp2791, i64 1
+ %tmp2793 = getelementptr inbounds float* %tmp2792, i64 1
+ %tmp2794 = getelementptr inbounds float* %tmp2793, i64 1
+ %tmp2795 = getelementptr inbounds float* %tmp2794, i64 1
+ %tmp2796 = getelementptr inbounds float* %tmp2795, i64 1
+ %tmp2797 = getelementptr inbounds float* %tmp2796, i64 1
+ %tmp2798 = getelementptr inbounds float* %tmp2797, i64 1
+ %tmp2799 = getelementptr inbounds float* %tmp2798, i64 1
+ %tmp2800 = getelementptr inbounds float* %tmp2799, i64 1
+ %tmp2801 = getelementptr inbounds float* %tmp2800, i64 1
+ %tmp2802 = getelementptr inbounds float* %tmp2801, i64 1
+ %tmp2803 = getelementptr inbounds float* %tmp2802, i64 1
+ %tmp2804 = getelementptr inbounds float* %tmp2803, i64 1
+ %tmp2805 = getelementptr inbounds float* %tmp2804, i64 1
+ %tmp2806 = getelementptr inbounds float* %tmp2805, i64 1
+ %tmp2807 = getelementptr inbounds float* %tmp2806, i64 1
+ %tmp2808 = getelementptr inbounds float* %tmp2807, i64 1
+ %tmp2809 = getelementptr inbounds float* %tmp2808, i64 1
+ %tmp2810 = getelementptr inbounds float* %tmp2809, i64 1
+ %tmp2811 = getelementptr inbounds float* %tmp2810, i64 1
+ %tmp2812 = getelementptr inbounds float* %tmp2811, i64 1
+ %tmp2813 = getelementptr inbounds float* %tmp2812, i64 1
+ %tmp2814 = getelementptr inbounds float* %tmp2813, i64 1
+ %tmp2815 = getelementptr inbounds float* %tmp2814, i64 1
+ %tmp2816 = getelementptr inbounds float* %tmp2815, i64 1
+ %tmp2817 = getelementptr inbounds float* %tmp2816, i64 1
+ %tmp2818 = getelementptr inbounds float* %tmp2817, i64 1
+ %tmp2819 = getelementptr inbounds float* %tmp2818, i64 1
+ %tmp2820 = getelementptr inbounds float* %tmp2819, i64 1
+ %tmp2821 = getelementptr inbounds float* %tmp2820, i64 1
+ %tmp2822 = getelementptr inbounds float* %tmp2821, i64 1
+ %tmp2823 = getelementptr inbounds float* %tmp2822, i64 1
+ %tmp2824 = getelementptr inbounds float* %tmp2823, i64 1
+ %tmp2825 = getelementptr inbounds float* %tmp2824, i64 1
+ %tmp2826 = getelementptr inbounds float* %tmp2825, i64 1
+ %tmp2827 = getelementptr inbounds float* %tmp2826, i64 1
+ %tmp2828 = getelementptr inbounds float* %tmp2827, i64 1
+ %tmp2829 = getelementptr inbounds float* %tmp2828, i64 1
+ %tmp2830 = getelementptr inbounds float* %tmp2829, i64 1
+ %tmp2831 = getelementptr inbounds float* %tmp2830, i64 1
+ %tmp2832 = getelementptr inbounds float* %tmp2831, i64 1
+ %tmp2833 = getelementptr inbounds float* %tmp2832, i64 1
+ %tmp2834 = getelementptr inbounds float* %tmp2833, i64 1
+ %tmp2835 = getelementptr inbounds float* %tmp2834, i64 1
+ %tmp2836 = getelementptr inbounds float* %tmp2835, i64 1
+ %tmp2837 = getelementptr inbounds float* %tmp2836, i64 1
+ %tmp2838 = getelementptr inbounds float* %tmp2837, i64 1
+ %tmp2839 = getelementptr inbounds float* %tmp2838, i64 1
+ %tmp2840 = getelementptr inbounds float* %tmp2839, i64 1
+ %tmp2841 = getelementptr inbounds float* %tmp2840, i64 1
+ %tmp2842 = getelementptr inbounds float* %tmp2841, i64 1
+ %tmp2843 = getelementptr inbounds float* %tmp2842, i64 1
+ %tmp2844 = getelementptr inbounds float* %tmp2843, i64 1
+ %tmp2845 = getelementptr inbounds float* %tmp2844, i64 1
+ %tmp2846 = getelementptr inbounds float* %tmp2845, i64 1
+ %tmp2847 = getelementptr inbounds float* %tmp2846, i64 1
+ %tmp2848 = getelementptr inbounds float* %tmp2847, i64 1
+ %tmp2849 = getelementptr inbounds float* %tmp2848, i64 1
+ %tmp2850 = getelementptr inbounds float* %tmp2849, i64 1
+ %tmp2851 = getelementptr inbounds float* %tmp2850, i64 1
+ %tmp2852 = getelementptr inbounds float* %tmp2851, i64 1
+ %tmp2853 = getelementptr inbounds float* %tmp2852, i64 1
+ %tmp2854 = getelementptr inbounds float* %tmp2853, i64 1
+ %tmp2855 = getelementptr inbounds float* %tmp2854, i64 1
+ %tmp2856 = getelementptr inbounds float* %tmp2855, i64 1
+ %tmp2857 = getelementptr inbounds float* %tmp2856, i64 1
+ %tmp2858 = getelementptr inbounds float* %tmp2857, i64 1
+ %tmp2859 = getelementptr inbounds float* %tmp2858, i64 1
+ %tmp2860 = getelementptr inbounds float* %tmp2859, i64 1
+ %tmp2861 = getelementptr inbounds float* %tmp2860, i64 1
+ %tmp2862 = getelementptr inbounds float* %tmp2861, i64 1
+ %tmp2863 = getelementptr inbounds float* %tmp2862, i64 1
+ %tmp2864 = getelementptr inbounds float* %tmp2863, i64 1
+ %tmp2865 = getelementptr inbounds float* %tmp2864, i64 1
+ %tmp2866 = getelementptr inbounds float* %tmp2865, i64 1
+ %tmp2867 = getelementptr inbounds float* %tmp2866, i64 1
+ %tmp2868 = getelementptr inbounds float* %tmp2867, i64 1
+ %tmp2869 = getelementptr inbounds float* %tmp2868, i64 1
+ %tmp2870 = getelementptr inbounds float* %tmp2869, i64 1
+ %tmp2871 = getelementptr inbounds float* %tmp2870, i64 1
+ %tmp2872 = getelementptr inbounds float* %tmp2871, i64 1
+ %tmp2873 = getelementptr inbounds float* %tmp2872, i64 1
+ %tmp2874 = getelementptr inbounds float* %tmp2873, i64 1
+ %tmp2875 = getelementptr inbounds float* %tmp2874, i64 1
+ %tmp2876 = getelementptr inbounds float* %tmp2875, i64 1
+ %tmp2877 = getelementptr inbounds float* %tmp2876, i64 1
+ %tmp2878 = getelementptr inbounds float* %tmp2877, i64 1
+ %tmp2879 = getelementptr inbounds float* %tmp2878, i64 1
+ %tmp2880 = getelementptr inbounds float* %tmp2879, i64 1
+ %tmp2881 = getelementptr inbounds float* %tmp2880, i64 1
+ %tmp2882 = getelementptr inbounds float* %tmp2881, i64 1
+ %tmp2883 = getelementptr inbounds float* %tmp2882, i64 1
+ %tmp2884 = getelementptr inbounds float* %tmp2883, i64 1
+ %tmp2885 = getelementptr inbounds float* %tmp2884, i64 1
+ %tmp2886 = getelementptr inbounds float* %tmp2885, i64 1
+ %tmp2887 = getelementptr inbounds float* %tmp2886, i64 1
+ %tmp2888 = getelementptr inbounds float* %tmp2887, i64 1
+ %tmp2889 = getelementptr inbounds float* %tmp2888, i64 1
+ %tmp2890 = getelementptr inbounds float* %tmp2889, i64 1
+ %tmp2891 = getelementptr inbounds float* %tmp2890, i64 1
+ %tmp2892 = getelementptr inbounds float* %tmp2891, i64 1
+ %tmp2893 = getelementptr inbounds float* %tmp2892, i64 1
+ %tmp2894 = getelementptr inbounds float* %tmp2893, i64 1
+ %tmp2895 = getelementptr inbounds float* %tmp2894, i64 1
+ %tmp2896 = getelementptr inbounds float* %tmp2895, i64 1
+ %tmp2897 = getelementptr inbounds float* %tmp2896, i64 1
+ %tmp2898 = getelementptr inbounds float* %tmp2897, i64 1
+ %tmp2899 = getelementptr inbounds float* %tmp2898, i64 1
+ %tmp2900 = getelementptr inbounds float* %tmp2899, i64 1
+ %tmp2901 = getelementptr inbounds float* %tmp2900, i64 1
+ %tmp2902 = getelementptr inbounds float* %tmp2901, i64 1
+ %tmp2903 = getelementptr inbounds float* %tmp2902, i64 1
+ %tmp2904 = getelementptr inbounds float* %tmp2903, i64 1
+ %tmp2905 = getelementptr inbounds float* %tmp2904, i64 1
+ %tmp2906 = getelementptr inbounds float* %tmp2905, i64 1
+ %tmp2907 = getelementptr inbounds float* %tmp2906, i64 1
+ %tmp2908 = getelementptr inbounds float* %tmp2907, i64 1
+ %tmp2909 = getelementptr inbounds float* %tmp2908, i64 1
+ %tmp2910 = getelementptr inbounds float* %tmp2909, i64 1
+ %tmp2911 = getelementptr inbounds float* %tmp2910, i64 1
+ %tmp2912 = getelementptr inbounds float* %tmp2911, i64 1
+ %tmp2913 = getelementptr inbounds float* %tmp2912, i64 1
+ %tmp2914 = getelementptr inbounds float* %tmp2913, i64 1
+ %tmp2915 = getelementptr inbounds float* %tmp2914, i64 1
+ %tmp2916 = getelementptr inbounds float* %tmp2915, i64 1
+ %tmp2917 = getelementptr inbounds float* %tmp2916, i64 1
+ %tmp2918 = getelementptr inbounds float* %tmp2917, i64 1
+ %tmp2919 = getelementptr inbounds float* %tmp2918, i64 1
+ %tmp2920 = getelementptr inbounds float* %tmp2919, i64 1
+ %tmp2921 = getelementptr inbounds float* %tmp2920, i64 1
+ %tmp2922 = getelementptr inbounds float* %tmp2921, i64 1
+ %tmp2923 = getelementptr inbounds float* %tmp2922, i64 1
+ %tmp2924 = getelementptr inbounds float* %tmp2923, i64 1
+ %tmp2925 = getelementptr inbounds float* %tmp2924, i64 1
+ %tmp2926 = getelementptr inbounds float* %tmp2925, i64 1
+ %tmp2927 = getelementptr inbounds float* %tmp2926, i64 1
+ %tmp2928 = getelementptr inbounds float* %tmp2927, i64 1
+ %tmp2929 = getelementptr inbounds float* %tmp2928, i64 1
+ %tmp2930 = getelementptr inbounds float* %tmp2929, i64 1
+ %tmp2931 = getelementptr inbounds float* %tmp2930, i64 1
+ %tmp2932 = getelementptr inbounds float* %tmp2931, i64 1
+ %tmp2933 = getelementptr inbounds float* %tmp2932, i64 1
+ %tmp2934 = getelementptr inbounds float* %tmp2933, i64 1
+ %tmp2935 = getelementptr inbounds float* %tmp2934, i64 1
+ %tmp2936 = getelementptr inbounds float* %tmp2935, i64 1
+ %tmp2937 = getelementptr inbounds float* %tmp2936, i64 1
+ %tmp2938 = getelementptr inbounds float* %tmp2937, i64 1
+ %tmp2939 = getelementptr inbounds float* %tmp2938, i64 1
+ %tmp2940 = getelementptr inbounds float* %tmp2939, i64 1
+ %tmp2941 = getelementptr inbounds float* %tmp2940, i64 1
+ %tmp2942 = getelementptr inbounds float* %tmp2941, i64 1
+ %tmp2943 = getelementptr inbounds float* %tmp2942, i64 1
+ %tmp2944 = getelementptr inbounds float* %tmp2943, i64 1
+ %tmp2945 = getelementptr inbounds float* %tmp2944, i64 1
+ %tmp2946 = getelementptr inbounds float* %tmp2945, i64 1
+ %tmp2947 = getelementptr inbounds float* %tmp2946, i64 1
+ %tmp2948 = getelementptr inbounds float* %tmp2947, i64 1
+ %tmp2949 = getelementptr inbounds float* %tmp2948, i64 1
+ %tmp2950 = getelementptr inbounds float* %tmp2949, i64 1
+ %tmp2951 = getelementptr inbounds float* %tmp2950, i64 1
+ %tmp2952 = getelementptr inbounds float* %tmp2951, i64 1
+ %tmp2953 = getelementptr inbounds float* %tmp2952, i64 1
+ %tmp2954 = getelementptr inbounds float* %tmp2953, i64 1
+ %tmp2955 = getelementptr inbounds float* %tmp2954, i64 1
+ %tmp2956 = getelementptr inbounds float* %tmp2955, i64 1
+ %tmp2957 = getelementptr inbounds float* %tmp2956, i64 1
+ %tmp2958 = getelementptr inbounds float* %tmp2957, i64 1
+ %tmp2959 = getelementptr inbounds float* %tmp2958, i64 1
+ %tmp2960 = getelementptr inbounds float* %tmp2959, i64 1
+ %tmp2961 = getelementptr inbounds float* %tmp2960, i64 1
+ %tmp2962 = getelementptr inbounds float* %tmp2961, i64 1
+ %tmp2963 = getelementptr inbounds float* %tmp2962, i64 1
+ %tmp2964 = getelementptr inbounds float* %tmp2963, i64 1
+ %tmp2965 = getelementptr inbounds float* %tmp2964, i64 1
+ %tmp2966 = getelementptr inbounds float* %tmp2965, i64 1
+ %tmp2967 = getelementptr inbounds float* %tmp2966, i64 1
+ %tmp2968 = getelementptr inbounds float* %tmp2967, i64 1
+ %tmp2969 = getelementptr inbounds float* %tmp2968, i64 1
+ %tmp2970 = getelementptr inbounds float* %tmp2969, i64 1
+ %tmp2971 = getelementptr inbounds float* %tmp2970, i64 1
+ %tmp2972 = getelementptr inbounds float* %tmp2971, i64 1
+ %tmp2973 = getelementptr inbounds float* %tmp2972, i64 1
+ %tmp2974 = getelementptr inbounds float* %tmp2973, i64 1
+ %tmp2975 = getelementptr inbounds float* %tmp2974, i64 1
+ %tmp2976 = getelementptr inbounds float* %tmp2975, i64 1
+ %tmp2977 = getelementptr inbounds float* %tmp2976, i64 1
+ %tmp2978 = getelementptr inbounds float* %tmp2977, i64 1
+ %tmp2979 = getelementptr inbounds float* %tmp2978, i64 1
+ %tmp2980 = getelementptr inbounds float* %tmp2979, i64 1
+ %tmp2981 = getelementptr inbounds float* %tmp2980, i64 1
+ %tmp2982 = getelementptr inbounds float* %tmp2981, i64 1
+ %tmp2983 = getelementptr inbounds float* %tmp2982, i64 1
+ %tmp2984 = getelementptr inbounds float* %tmp2983, i64 1
+ %tmp2985 = getelementptr inbounds float* %tmp2984, i64 1
+ %tmp2986 = getelementptr inbounds float* %tmp2985, i64 1
+ %tmp2987 = getelementptr inbounds float* %tmp2986, i64 1
+ %tmp2988 = getelementptr inbounds float* %tmp2987, i64 1
+ %tmp2989 = getelementptr inbounds float* %tmp2988, i64 1
+ %tmp2990 = getelementptr inbounds float* %tmp2989, i64 1
+ %tmp2991 = getelementptr inbounds float* %tmp2990, i64 1
+ %tmp2992 = getelementptr inbounds float* %tmp2991, i64 1
+ %tmp2993 = getelementptr inbounds float* %tmp2992, i64 1
+ %tmp2994 = getelementptr inbounds float* %tmp2993, i64 1
+ %tmp2995 = getelementptr inbounds float* %tmp2994, i64 1
+ %tmp2996 = getelementptr inbounds float* %tmp2995, i64 1
+ %tmp2997 = getelementptr inbounds float* %tmp2996, i64 1
+ %tmp2998 = getelementptr inbounds float* %tmp2997, i64 1
+ %tmp2999 = getelementptr inbounds float* %tmp2998, i64 1
+ %tmp3000 = getelementptr inbounds float* %tmp2999, i64 1
+ %tmp3001 = getelementptr inbounds float* %tmp3000, i64 1
+ %tmp3002 = getelementptr inbounds float* %tmp3001, i64 1
+ %tmp3003 = getelementptr inbounds float* %tmp3002, i64 1
+ %tmp3004 = getelementptr inbounds float* %tmp3003, i64 1
+ %tmp3005 = getelementptr inbounds float* %tmp3004, i64 1
+ %tmp3006 = getelementptr inbounds float* %tmp3005, i64 1
+ %tmp3007 = getelementptr inbounds float* %tmp3006, i64 1
+ %tmp3008 = getelementptr inbounds float* %tmp3007, i64 1
+ %tmp3009 = getelementptr inbounds float* %tmp3008, i64 1
+ %tmp3010 = getelementptr inbounds float* %tmp3009, i64 1
+ %tmp3011 = getelementptr inbounds float* %tmp3010, i64 1
+ %tmp3012 = getelementptr inbounds float* %tmp3011, i64 1
+ %tmp3013 = getelementptr inbounds float* %tmp3012, i64 1
+ %tmp3014 = getelementptr inbounds float* %tmp3013, i64 1
+ %tmp3015 = getelementptr inbounds float* %tmp3014, i64 1
+ %tmp3016 = getelementptr inbounds float* %tmp3015, i64 1
+ %tmp3017 = getelementptr inbounds float* %tmp3016, i64 1
+ %tmp3018 = getelementptr inbounds float* %tmp3017, i64 1
+ %tmp3019 = getelementptr inbounds float* %tmp3018, i64 1
+ %tmp3020 = getelementptr inbounds float* %tmp3019, i64 1
+ %tmp3021 = getelementptr inbounds float* %tmp3020, i64 1
+ %tmp3022 = getelementptr inbounds float* %tmp3021, i64 1
+ %tmp3023 = getelementptr inbounds float* %tmp3022, i64 1
+ %tmp3024 = getelementptr inbounds float* %tmp3023, i64 1
+ %tmp3025 = getelementptr inbounds float* %tmp3024, i64 1
+ %tmp3026 = getelementptr inbounds float* %tmp3025, i64 1
+ %tmp3027 = getelementptr inbounds float* %tmp3026, i64 1
+ %tmp3028 = getelementptr inbounds float* %tmp3027, i64 1
+ %tmp3029 = getelementptr inbounds float* %tmp3028, i64 1
+ %tmp3030 = getelementptr inbounds float* %tmp3029, i64 1
+ %tmp3031 = getelementptr inbounds float* %tmp3030, i64 1
+ %tmp3032 = getelementptr inbounds float* %tmp3031, i64 1
+ %tmp3033 = getelementptr inbounds float* %tmp3032, i64 1
+ %tmp3034 = getelementptr inbounds float* %tmp3033, i64 1
+ %tmp3035 = getelementptr inbounds float* %tmp3034, i64 1
+ %tmp3036 = getelementptr inbounds float* %tmp3035, i64 1
+ %tmp3037 = getelementptr inbounds float* %tmp3036, i64 1
+ %tmp3038 = getelementptr inbounds float* %tmp3037, i64 1
+ %tmp3039 = getelementptr inbounds float* %tmp3038, i64 1
+ %tmp3040 = getelementptr inbounds float* %tmp3039, i64 1
+ %tmp3041 = getelementptr inbounds float* %tmp3040, i64 1
+ %tmp3042 = getelementptr inbounds float* %tmp3041, i64 1
+ %tmp3043 = getelementptr inbounds float* %tmp3042, i64 1
+ %tmp3044 = getelementptr inbounds float* %tmp3043, i64 1
+ %tmp3045 = getelementptr inbounds float* %tmp3044, i64 1
+ %tmp3046 = getelementptr inbounds float* %tmp3045, i64 1
+ %tmp3047 = getelementptr inbounds float* %tmp3046, i64 1
+ %tmp3048 = getelementptr inbounds float* %tmp3047, i64 1
+ %tmp3049 = getelementptr inbounds float* %tmp3048, i64 1
+ %tmp3050 = getelementptr inbounds float* %tmp3049, i64 1
+ %tmp3051 = getelementptr inbounds float* %tmp3050, i64 1
+ %tmp3052 = getelementptr inbounds float* %tmp3051, i64 1
+ %tmp3053 = getelementptr inbounds float* %tmp3052, i64 1
+ %tmp3054 = getelementptr inbounds float* %tmp3053, i64 1
+ %tmp3055 = getelementptr inbounds float* %tmp3054, i64 1
+ %tmp3056 = getelementptr inbounds float* %tmp3055, i64 1
+ %tmp3057 = getelementptr inbounds float* %tmp3056, i64 1
+ %tmp3058 = getelementptr inbounds float* %tmp3057, i64 1
+ %tmp3059 = getelementptr inbounds float* %tmp3058, i64 1
+ %tmp3060 = getelementptr inbounds float* %tmp3059, i64 1
+ %tmp3061 = getelementptr inbounds float* %tmp3060, i64 1
+ %tmp3062 = getelementptr inbounds float* %tmp3061, i64 1
+ %tmp3063 = getelementptr inbounds float* %tmp3062, i64 1
+ %tmp3064 = getelementptr inbounds float* %tmp3063, i64 1
+ %tmp3065 = getelementptr inbounds float* %tmp3064, i64 1
+ %tmp3066 = getelementptr inbounds float* %tmp3065, i64 1
+ %tmp3067 = getelementptr inbounds float* %tmp3066, i64 1
+ %tmp3068 = getelementptr inbounds float* %tmp3067, i64 1
+ %tmp3069 = getelementptr inbounds float* %tmp3068, i64 1
+ %tmp3070 = getelementptr inbounds float* %tmp3069, i64 1
+ %tmp3071 = getelementptr inbounds float* %tmp3070, i64 1
+ %tmp3072 = getelementptr inbounds float* %tmp3071, i64 1
+ %tmp3073 = getelementptr inbounds float* %tmp3072, i64 1
+ %tmp3074 = getelementptr inbounds float* %tmp3073, i64 1
+ %tmp3075 = getelementptr inbounds float* %tmp3074, i64 1
+ %tmp3076 = getelementptr inbounds float* %tmp3075, i64 1
+ %tmp3077 = getelementptr inbounds float* %tmp3076, i64 1
+ %tmp3078 = getelementptr inbounds float* %tmp3077, i64 1
+ %tmp3079 = getelementptr inbounds float* %tmp3078, i64 1
+ %tmp3080 = getelementptr inbounds float* %tmp3079, i64 1
+ %tmp3081 = getelementptr inbounds float* %tmp3080, i64 1
+ %tmp3082 = getelementptr inbounds float* %tmp3081, i64 1
+ %tmp3083 = getelementptr inbounds float* %tmp3082, i64 1
+ %tmp3084 = getelementptr inbounds float* %tmp3083, i64 1
+ %tmp3085 = getelementptr inbounds float* %tmp3084, i64 1
+ %tmp3086 = getelementptr inbounds float* %tmp3085, i64 1
+ %tmp3087 = getelementptr inbounds float* %tmp3086, i64 1
+ %tmp3088 = getelementptr inbounds float* %tmp3087, i64 1
+ %tmp3089 = getelementptr inbounds float* %tmp3088, i64 1
+ %tmp3090 = getelementptr inbounds float* %tmp3089, i64 1
+ %tmp3091 = getelementptr inbounds float* %tmp3090, i64 1
+ %tmp3092 = getelementptr inbounds float* %tmp3091, i64 1
+ %tmp3093 = getelementptr inbounds float* %tmp3092, i64 1
+ %tmp3094 = getelementptr inbounds float* %tmp3093, i64 1
+ %tmp3095 = getelementptr inbounds float* %tmp3094, i64 1
+ %tmp3096 = getelementptr inbounds float* %tmp3095, i64 1
+ %tmp3097 = getelementptr inbounds float* %tmp3096, i64 1
+ %tmp3098 = getelementptr inbounds float* %tmp3097, i64 1
+ %tmp3099 = getelementptr inbounds float* %tmp3098, i64 1
+ %tmp3100 = getelementptr inbounds float* %tmp3099, i64 1
+ %tmp3101 = getelementptr inbounds float* %tmp3100, i64 1
+ %tmp3102 = getelementptr inbounds float* %tmp3101, i64 1
+ %tmp3103 = getelementptr inbounds float* %tmp3102, i64 1
+ %tmp3104 = getelementptr inbounds float* %tmp3103, i64 1
+ %tmp3105 = getelementptr inbounds float* %tmp3104, i64 1
+ %tmp3106 = getelementptr inbounds float* %tmp3105, i64 1
+ %tmp3107 = getelementptr inbounds float* %tmp3106, i64 1
+ %tmp3108 = getelementptr inbounds float* %tmp3107, i64 1
+ %tmp3109 = getelementptr inbounds float* %tmp3108, i64 1
+ %tmp3110 = getelementptr inbounds float* %tmp3109, i64 1
+ %tmp3111 = getelementptr inbounds float* %tmp3110, i64 1
+ %tmp3112 = getelementptr inbounds float* %tmp3111, i64 1
+ %tmp3113 = getelementptr inbounds float* %tmp3112, i64 1
+ %tmp3114 = getelementptr inbounds float* %tmp3113, i64 1
+ %tmp3115 = getelementptr inbounds float* %tmp3114, i64 1
+ %tmp3116 = getelementptr inbounds float* %tmp3115, i64 1
+ %tmp3117 = getelementptr inbounds float* %tmp3116, i64 1
+ %tmp3118 = getelementptr inbounds float* %tmp3117, i64 1
+ %tmp3119 = getelementptr inbounds float* %tmp3118, i64 1
+ %tmp3120 = getelementptr inbounds float* %tmp3119, i64 1
+ %tmp3121 = getelementptr inbounds float* %tmp3120, i64 1
+ %tmp3122 = getelementptr inbounds float* %tmp3121, i64 1
+ %tmp3123 = getelementptr inbounds float* %tmp3122, i64 1
+ %tmp3124 = getelementptr inbounds float* %tmp3123, i64 1
+ %tmp3125 = getelementptr inbounds float* %tmp3124, i64 1
+ %tmp3126 = getelementptr inbounds float* %tmp3125, i64 1
+ %tmp3127 = getelementptr inbounds float* %tmp3126, i64 1
+ %tmp3128 = getelementptr inbounds float* %tmp3127, i64 1
+ %tmp3129 = getelementptr inbounds float* %tmp3128, i64 1
+ %tmp3130 = getelementptr inbounds float* %tmp3129, i64 1
+ %tmp3131 = getelementptr inbounds float* %tmp3130, i64 1
+ %tmp3132 = getelementptr inbounds float* %tmp3131, i64 1
+ %tmp3133 = getelementptr inbounds float* %tmp3132, i64 1
+ %tmp3134 = getelementptr inbounds float* %tmp3133, i64 1
+ %tmp3135 = getelementptr inbounds float* %tmp3134, i64 1
+ %tmp3136 = getelementptr inbounds float* %tmp3135, i64 1
+ %tmp3137 = getelementptr inbounds float* %tmp3136, i64 1
+ %tmp3138 = getelementptr inbounds float* %tmp3137, i64 1
+ %tmp3139 = getelementptr inbounds float* %tmp3138, i64 1
+ %tmp3140 = getelementptr inbounds float* %tmp3139, i64 1
+ %tmp3141 = getelementptr inbounds float* %tmp3140, i64 1
+ %tmp3142 = getelementptr inbounds float* %tmp3141, i64 1
+ %tmp3143 = getelementptr inbounds float* %tmp3142, i64 1
+ %tmp3144 = getelementptr inbounds float* %tmp3143, i64 1
+ %tmp3145 = getelementptr inbounds float* %tmp3144, i64 1
+ %tmp3146 = getelementptr inbounds float* %tmp3145, i64 1
+ %tmp3147 = getelementptr inbounds float* %tmp3146, i64 1
+ %tmp3148 = getelementptr inbounds float* %tmp3147, i64 1
+ %tmp3149 = getelementptr inbounds float* %tmp3148, i64 1
+ %tmp3150 = getelementptr inbounds float* %tmp3149, i64 1
+ %tmp3151 = getelementptr inbounds float* %tmp3150, i64 1
+ %tmp3152 = getelementptr inbounds float* %tmp3151, i64 1
+ %tmp3153 = getelementptr inbounds float* %tmp3152, i64 1
+ %tmp3154 = getelementptr inbounds float* %tmp3153, i64 1
+ %tmp3155 = getelementptr inbounds float* %tmp3154, i64 1
+ %tmp3156 = getelementptr inbounds float* %tmp3155, i64 1
+ %tmp3157 = getelementptr inbounds float* %tmp3156, i64 1
+ %tmp3158 = getelementptr inbounds float* %tmp3157, i64 1
+ %tmp3159 = getelementptr inbounds float* %tmp3158, i64 1
+ %tmp3160 = getelementptr inbounds float* %tmp3159, i64 1
+ %tmp3161 = getelementptr inbounds float* %tmp3160, i64 1
+ %tmp3162 = getelementptr inbounds float* %tmp3161, i64 1
+ %tmp3163 = getelementptr inbounds float* %tmp3162, i64 1
+ %tmp3164 = getelementptr inbounds float* %tmp3163, i64 1
+ %tmp3165 = getelementptr inbounds float* %tmp3164, i64 1
+ %tmp3166 = getelementptr inbounds float* %tmp3165, i64 1
+ %tmp3167 = getelementptr inbounds float* %tmp3166, i64 1
+ %tmp3168 = getelementptr inbounds float* %tmp3167, i64 1
+ %tmp3169 = getelementptr inbounds float* %tmp3168, i64 1
+ %tmp3170 = getelementptr inbounds float* %tmp3169, i64 1
+ %tmp3171 = getelementptr inbounds float* %tmp3170, i64 1
+ %tmp3172 = getelementptr inbounds float* %tmp3171, i64 1
+ %tmp3173 = getelementptr inbounds float* %tmp3172, i64 1
+ %tmp3174 = getelementptr inbounds float* %tmp3173, i64 1
+ %tmp3175 = getelementptr inbounds float* %tmp3174, i64 1
+ %tmp3176 = getelementptr inbounds float* %tmp3175, i64 1
+ %tmp3177 = getelementptr inbounds float* %tmp3176, i64 1
+ %tmp3178 = getelementptr inbounds float* %tmp3177, i64 1
+ %tmp3179 = getelementptr inbounds float* %tmp3178, i64 1
+ %tmp3180 = getelementptr inbounds float* %tmp3179, i64 1
+ %tmp3181 = getelementptr inbounds float* %tmp3180, i64 1
+ %tmp3182 = getelementptr inbounds float* %tmp3181, i64 1
+ %tmp3183 = getelementptr inbounds float* %tmp3182, i64 1
+ %tmp3184 = getelementptr inbounds float* %tmp3183, i64 1
+ %tmp3185 = getelementptr inbounds float* %tmp3184, i64 1
+ %tmp3186 = getelementptr inbounds float* %tmp3185, i64 1
+ %tmp3187 = getelementptr inbounds float* %tmp3186, i64 1
+ %tmp3188 = getelementptr inbounds float* %tmp3187, i64 1
+ %tmp3189 = getelementptr inbounds float* %tmp3188, i64 1
+ %tmp3190 = getelementptr inbounds float* %tmp3189, i64 1
+ %tmp3191 = getelementptr inbounds float* %tmp3190, i64 1
+ %tmp3192 = getelementptr inbounds float* %tmp3191, i64 1
+ %tmp3193 = getelementptr inbounds float* %tmp3192, i64 1
+ %tmp3194 = getelementptr inbounds float* %tmp3193, i64 1
+ %tmp3195 = getelementptr inbounds float* %tmp3194, i64 1
+ %tmp3196 = getelementptr inbounds float* %tmp3195, i64 1
+ %tmp3197 = getelementptr inbounds float* %tmp3196, i64 1
+ %tmp3198 = getelementptr inbounds float* %tmp3197, i64 1
+ %tmp3199 = getelementptr inbounds float* %tmp3198, i64 1
+ %tmp3200 = getelementptr inbounds float* %tmp3199, i64 1
+ %tmp3201 = getelementptr inbounds float* %tmp3200, i64 1
+ %tmp3202 = getelementptr inbounds float* %tmp3201, i64 1
+ %tmp3203 = getelementptr inbounds float* %tmp3202, i64 1
+ %tmp3204 = getelementptr inbounds float* %tmp3203, i64 1
+ %tmp3205 = getelementptr inbounds float* %tmp3204, i64 1
+ %tmp3206 = getelementptr inbounds float* %tmp3205, i64 1
+ %tmp3207 = getelementptr inbounds float* %tmp3206, i64 1
+ %tmp3208 = getelementptr inbounds float* %tmp3207, i64 1
+ %tmp3209 = getelementptr inbounds float* %tmp3208, i64 1
+ %tmp3210 = getelementptr inbounds float* %tmp3209, i64 1
+ %tmp3211 = getelementptr inbounds float* %tmp3210, i64 1
+ %tmp3212 = getelementptr inbounds float* %tmp3211, i64 1
+ %tmp3213 = getelementptr inbounds float* %tmp3212, i64 1
+ %tmp3214 = getelementptr inbounds float* %tmp3213, i64 1
+ %tmp3215 = getelementptr inbounds float* %tmp3214, i64 1
+ %tmp3216 = getelementptr inbounds float* %tmp3215, i64 1
+ %tmp3217 = getelementptr inbounds float* %tmp3216, i64 1
+ %tmp3218 = getelementptr inbounds float* %tmp3217, i64 1
+ %tmp3219 = getelementptr inbounds float* %tmp3218, i64 1
+ %tmp3220 = getelementptr inbounds float* %tmp3219, i64 1
+ %tmp3221 = getelementptr inbounds float* %tmp3220, i64 1
+ %tmp3222 = getelementptr inbounds float* %tmp3221, i64 1
+ %tmp3223 = getelementptr inbounds float* %tmp3222, i64 1
+ %tmp3224 = getelementptr inbounds float* %tmp3223, i64 1
+ %tmp3225 = getelementptr inbounds float* %tmp3224, i64 1
+ %tmp3226 = getelementptr inbounds float* %tmp3225, i64 1
+ %tmp3227 = getelementptr inbounds float* %tmp3226, i64 1
+ %tmp3228 = getelementptr inbounds float* %tmp3227, i64 1
+ %tmp3229 = getelementptr inbounds float* %tmp3228, i64 1
+ %tmp3230 = getelementptr inbounds float* %tmp3229, i64 1
+ %tmp3231 = getelementptr inbounds float* %tmp3230, i64 1
+ %tmp3232 = getelementptr inbounds float* %tmp3231, i64 1
+ %tmp3233 = getelementptr inbounds float* %tmp3232, i64 1
+ %tmp3234 = getelementptr inbounds float* %tmp3233, i64 1
+ %tmp3235 = getelementptr inbounds float* %tmp3234, i64 1
+ %tmp3236 = getelementptr inbounds float* %tmp3235, i64 1
+ %tmp3237 = getelementptr inbounds float* %tmp3236, i64 1
+ %tmp3238 = getelementptr inbounds float* %tmp3237, i64 1
+ %tmp3239 = getelementptr inbounds float* %tmp3238, i64 1
+ %tmp3240 = getelementptr inbounds float* %tmp3239, i64 1
+ %tmp3241 = getelementptr inbounds float* %tmp3240, i64 1
+ %tmp3242 = getelementptr inbounds float* %tmp3241, i64 1
+ %tmp3243 = getelementptr inbounds float* %tmp3242, i64 1
+ %tmp3244 = getelementptr inbounds float* %tmp3243, i64 1
+ %tmp3245 = getelementptr inbounds float* %tmp3244, i64 1
+ %tmp3246 = getelementptr inbounds float* %tmp3245, i64 1
+ %tmp3247 = getelementptr inbounds float* %tmp3246, i64 1
+ %tmp3248 = getelementptr inbounds float* %tmp3247, i64 1
+ %tmp3249 = getelementptr inbounds float* %tmp3248, i64 1
+ %tmp3250 = getelementptr inbounds float* %tmp3249, i64 1
+ %tmp3251 = getelementptr inbounds float* %tmp3250, i64 1
+ %tmp3252 = getelementptr inbounds float* %tmp3251, i64 1
+ %tmp3253 = getelementptr inbounds float* %tmp3252, i64 1
+ %tmp3254 = getelementptr inbounds float* %tmp3253, i64 1
+ %tmp3255 = getelementptr inbounds float* %tmp3254, i64 1
+ %tmp3256 = getelementptr inbounds float* %tmp3255, i64 1
+ %tmp3257 = getelementptr inbounds float* %tmp3256, i64 1
+ %tmp3258 = getelementptr inbounds float* %tmp3257, i64 1
+ %tmp3259 = getelementptr inbounds float* %tmp3258, i64 1
+ %tmp3260 = getelementptr inbounds float* %tmp3259, i64 1
+ %tmp3261 = getelementptr inbounds float* %tmp3260, i64 1
+ %tmp3262 = getelementptr inbounds float* %tmp3261, i64 1
+ %tmp3263 = getelementptr inbounds float* %tmp3262, i64 1
+ %tmp3264 = getelementptr inbounds float* %tmp3263, i64 1
+ %tmp3265 = getelementptr inbounds float* %tmp3264, i64 1
+ %tmp3266 = getelementptr inbounds float* %tmp3265, i64 1
+ %tmp3267 = getelementptr inbounds float* %tmp3266, i64 1
+ %tmp3268 = getelementptr inbounds float* %tmp3267, i64 1
+ %tmp3269 = getelementptr inbounds float* %tmp3268, i64 1
+ %tmp3270 = getelementptr inbounds float* %tmp3269, i64 1
+ %tmp3271 = getelementptr inbounds float* %tmp3270, i64 1
+ %tmp3272 = getelementptr inbounds float* %tmp3271, i64 1
+ %tmp3273 = getelementptr inbounds float* %tmp3272, i64 1
+ %tmp3274 = getelementptr inbounds float* %tmp3273, i64 1
+ %tmp3275 = getelementptr inbounds float* %tmp3274, i64 1
+ %tmp3276 = getelementptr inbounds float* %tmp3275, i64 1
+ %tmp3277 = getelementptr inbounds float* %tmp3276, i64 1
+ %tmp3278 = getelementptr inbounds float* %tmp3277, i64 1
+ %tmp3279 = getelementptr inbounds float* %tmp3278, i64 1
+ %tmp3280 = getelementptr inbounds float* %tmp3279, i64 1
+ %tmp3281 = getelementptr inbounds float* %tmp3280, i64 1
+ %tmp3282 = getelementptr inbounds float* %tmp3281, i64 1
+ %tmp3283 = getelementptr inbounds float* %tmp3282, i64 1
+ %tmp3284 = getelementptr inbounds float* %tmp3283, i64 1
+ %tmp3285 = getelementptr inbounds float* %tmp3284, i64 1
+ %tmp3286 = getelementptr inbounds float* %tmp3285, i64 1
+ %tmp3287 = getelementptr inbounds float* %tmp3286, i64 1
+ %tmp3288 = getelementptr inbounds float* %tmp3287, i64 1
+ %tmp3289 = getelementptr inbounds float* %tmp3288, i64 1
+ %tmp3290 = getelementptr inbounds float* %tmp3289, i64 1
+ %tmp3291 = getelementptr inbounds float* %tmp3290, i64 1
+ %tmp3292 = getelementptr inbounds float* %tmp3291, i64 1
+ %tmp3293 = getelementptr inbounds float* %tmp3292, i64 1
+ %tmp3294 = getelementptr inbounds float* %tmp3293, i64 1
+ %tmp3295 = getelementptr inbounds float* %tmp3294, i64 1
+ %tmp3296 = getelementptr inbounds float* %tmp3295, i64 1
+ %tmp3297 = getelementptr inbounds float* %tmp3296, i64 1
+ %tmp3298 = getelementptr inbounds float* %tmp3297, i64 1
+ %tmp3299 = getelementptr inbounds float* %tmp3298, i64 1
+ %tmp3300 = getelementptr inbounds float* %tmp3299, i64 1
+ %tmp3301 = getelementptr inbounds float* %tmp3300, i64 1
+ %tmp3302 = getelementptr inbounds float* %tmp3301, i64 1
+ %tmp3303 = getelementptr inbounds float* %tmp3302, i64 1
+ %tmp3304 = getelementptr inbounds float* %tmp3303, i64 1
+ %tmp3305 = getelementptr inbounds float* %tmp3304, i64 1
+ %tmp3306 = getelementptr inbounds float* %tmp3305, i64 1
+ %tmp3307 = getelementptr inbounds float* %tmp3306, i64 1
+ %tmp3308 = getelementptr inbounds float* %tmp3307, i64 1
+ %tmp3309 = getelementptr inbounds float* %tmp3308, i64 1
+ %tmp3310 = getelementptr inbounds float* %tmp3309, i64 1
+ %tmp3311 = getelementptr inbounds float* %tmp3310, i64 1
+ %tmp3312 = getelementptr inbounds float* %tmp3311, i64 1
+ %tmp3313 = getelementptr inbounds float* %tmp3312, i64 1
+ %tmp3314 = getelementptr inbounds float* %tmp3313, i64 1
+ %tmp3315 = getelementptr inbounds float* %tmp3314, i64 1
+ %tmp3316 = getelementptr inbounds float* %tmp3315, i64 1
+ %tmp3317 = getelementptr inbounds float* %tmp3316, i64 1
+ %tmp3318 = getelementptr inbounds float* %tmp3317, i64 1
+ %tmp3319 = getelementptr inbounds float* %tmp3318, i64 1
+ %tmp3320 = getelementptr inbounds float* %tmp3319, i64 1
+ %tmp3321 = getelementptr inbounds float* %tmp3320, i64 1
+ %tmp3322 = getelementptr inbounds float* %tmp3321, i64 1
+ %tmp3323 = getelementptr inbounds float* %tmp3322, i64 1
+ %tmp3324 = getelementptr inbounds float* %tmp3323, i64 1
+ %tmp3325 = getelementptr inbounds float* %tmp3324, i64 1
+ %tmp3326 = getelementptr inbounds float* %tmp3325, i64 1
+ %tmp3327 = getelementptr inbounds float* %tmp3326, i64 1
+ %tmp3328 = getelementptr inbounds float* %tmp3327, i64 1
+ %tmp3329 = getelementptr inbounds float* %tmp3328, i64 1
+ %tmp3330 = getelementptr inbounds float* %tmp3329, i64 1
+ %tmp3331 = getelementptr inbounds float* %tmp3330, i64 1
+ %tmp3332 = getelementptr inbounds float* %tmp3331, i64 1
+ %tmp3333 = getelementptr inbounds float* %tmp3332, i64 1
+ %tmp3334 = getelementptr inbounds float* %tmp3333, i64 1
+ %tmp3335 = getelementptr inbounds float* %tmp3334, i64 1
+ %tmp3336 = getelementptr inbounds float* %tmp3335, i64 1
+ %tmp3337 = getelementptr inbounds float* %tmp3336, i64 1
+ %tmp3338 = getelementptr inbounds float* %tmp3337, i64 1
+ %tmp3339 = getelementptr inbounds float* %tmp3338, i64 1
+ %tmp3340 = getelementptr inbounds float* %tmp3339, i64 1
+ %tmp3341 = getelementptr inbounds float* %tmp3340, i64 1
+ %tmp3342 = getelementptr inbounds float* %tmp3341, i64 1
+ %tmp3343 = getelementptr inbounds float* %tmp3342, i64 1
+ %tmp3344 = getelementptr inbounds float* %tmp3343, i64 1
+ %tmp3345 = getelementptr inbounds float* %tmp3344, i64 1
+ %tmp3346 = getelementptr inbounds float* %tmp3345, i64 1
+ %tmp3347 = getelementptr inbounds float* %tmp3346, i64 1
+ %tmp3348 = getelementptr inbounds float* %tmp3347, i64 1
+ %tmp3349 = getelementptr inbounds float* %tmp3348, i64 1
+ %tmp3350 = getelementptr inbounds float* %tmp3349, i64 1
+ %tmp3351 = getelementptr inbounds float* %tmp3350, i64 1
+ %tmp3352 = getelementptr inbounds float* %tmp3351, i64 1
+ %tmp3353 = getelementptr inbounds float* %tmp3352, i64 1
+ %tmp3354 = getelementptr inbounds float* %tmp3353, i64 1
+ %tmp3355 = getelementptr inbounds float* %tmp3354, i64 1
+ %tmp3356 = getelementptr inbounds float* %tmp3355, i64 1
+ %tmp3357 = getelementptr inbounds float* %tmp3356, i64 1
+ %tmp3358 = getelementptr inbounds float* %tmp3357, i64 1
+ %tmp3359 = getelementptr inbounds float* %tmp3358, i64 1
+ %tmp3360 = getelementptr inbounds float* %tmp3359, i64 1
+ %tmp3361 = getelementptr inbounds float* %tmp3360, i64 1
+ %tmp3362 = getelementptr inbounds float* %tmp3361, i64 1
+ %tmp3363 = getelementptr inbounds float* %tmp3362, i64 1
+ %tmp3364 = getelementptr inbounds float* %tmp3363, i64 1
+ %tmp3365 = getelementptr inbounds float* %tmp3364, i64 1
+ %tmp3366 = getelementptr inbounds float* %tmp3365, i64 1
+ %tmp3367 = getelementptr inbounds float* %tmp3366, i64 1
+ %tmp3368 = getelementptr inbounds float* %tmp3367, i64 1
+ %tmp3369 = getelementptr inbounds float* %tmp3368, i64 1
+ %tmp3370 = getelementptr inbounds float* %tmp3369, i64 1
+ %tmp3371 = getelementptr inbounds float* %tmp3370, i64 1
+ %tmp3372 = getelementptr inbounds float* %tmp3371, i64 1
+ %tmp3373 = getelementptr inbounds float* %tmp3372, i64 1
+ %tmp3374 = getelementptr inbounds float* %tmp3373, i64 1
+ %tmp3375 = getelementptr inbounds float* %tmp3374, i64 1
+ %tmp3376 = getelementptr inbounds float* %tmp3375, i64 1
+ %tmp3377 = getelementptr inbounds float* %tmp3376, i64 1
+ %tmp3378 = getelementptr inbounds float* %tmp3377, i64 1
+ %tmp3379 = getelementptr inbounds float* %tmp3378, i64 1
+ %tmp3380 = getelementptr inbounds float* %tmp3379, i64 1
+ %tmp3381 = getelementptr inbounds float* %tmp3380, i64 1
+ %tmp3382 = getelementptr inbounds float* %tmp3381, i64 1
+ %tmp3383 = getelementptr inbounds float* %tmp3382, i64 1
+ %tmp3384 = getelementptr inbounds float* %tmp3383, i64 1
+ %tmp3385 = getelementptr inbounds float* %tmp3384, i64 1
+ %tmp3386 = getelementptr inbounds float* %tmp3385, i64 1
+ %tmp3387 = getelementptr inbounds float* %tmp3386, i64 1
+ %tmp3388 = getelementptr inbounds float* %tmp3387, i64 1
+ %tmp3389 = getelementptr inbounds float* %tmp3388, i64 1
+ %tmp3390 = getelementptr inbounds float* %tmp3389, i64 1
+ %tmp3391 = getelementptr inbounds float* %tmp3390, i64 1
+ %tmp3392 = getelementptr inbounds float* %tmp3391, i64 1
+ %tmp3393 = getelementptr inbounds float* %tmp3392, i64 1
+ %tmp3394 = getelementptr inbounds float* %tmp3393, i64 1
+ %tmp3395 = getelementptr inbounds float* %tmp3394, i64 1
+ %tmp3396 = getelementptr inbounds float* %tmp3395, i64 1
+ %tmp3397 = getelementptr inbounds float* %tmp3396, i64 1
+ %tmp3398 = getelementptr inbounds float* %tmp3397, i64 1
+ %tmp3399 = getelementptr inbounds float* %tmp3398, i64 1
+ %tmp3400 = getelementptr inbounds float* %tmp3399, i64 1
+ %tmp3401 = getelementptr inbounds float* %tmp3400, i64 1
+ %tmp3402 = getelementptr inbounds float* %tmp3401, i64 1
+ %tmp3403 = getelementptr inbounds float* %tmp3402, i64 1
+ %tmp3404 = getelementptr inbounds float* %tmp3403, i64 1
+ %tmp3405 = getelementptr inbounds float* %tmp3404, i64 1
+ %tmp3406 = getelementptr inbounds float* %tmp3405, i64 1
+ %tmp3407 = getelementptr inbounds float* %tmp3406, i64 1
+ %tmp3408 = getelementptr inbounds float* %tmp3407, i64 1
+ %tmp3409 = getelementptr inbounds float* %tmp3408, i64 1
+ %tmp3410 = getelementptr inbounds float* %tmp3409, i64 1
+ %tmp3411 = getelementptr inbounds float* %tmp3410, i64 1
+ %tmp3412 = getelementptr inbounds float* %tmp3411, i64 1
+ %tmp3413 = getelementptr inbounds float* %tmp3412, i64 1
+ %tmp3414 = getelementptr inbounds float* %tmp3413, i64 1
+ %tmp3415 = getelementptr inbounds float* %tmp3414, i64 1
+ %tmp3416 = getelementptr inbounds float* %tmp3415, i64 1
+ %tmp3417 = getelementptr inbounds float* %tmp3416, i64 1
+ %tmp3418 = getelementptr inbounds float* %tmp3417, i64 1
+ %tmp3419 = getelementptr inbounds float* %tmp3418, i64 1
+ %tmp3420 = getelementptr inbounds float* %tmp3419, i64 1
+ %tmp3421 = getelementptr inbounds float* %tmp3420, i64 1
+ %tmp3422 = getelementptr inbounds float* %tmp3421, i64 1
+ %tmp3423 = getelementptr inbounds float* %tmp3422, i64 1
+ %tmp3424 = getelementptr inbounds float* %tmp3423, i64 1
+ %tmp3425 = getelementptr inbounds float* %tmp3424, i64 1
+ %tmp3426 = getelementptr inbounds float* %tmp3425, i64 1
+ %tmp3427 = getelementptr inbounds float* %tmp3426, i64 1
+ %tmp3428 = getelementptr inbounds float* %tmp3427, i64 1
+ %tmp3429 = getelementptr inbounds float* %tmp3428, i64 1
+ %tmp3430 = getelementptr inbounds float* %tmp3429, i64 1
+ %tmp3431 = getelementptr inbounds float* %tmp3430, i64 1
+ %tmp3432 = getelementptr inbounds float* %tmp3431, i64 1
+ %tmp3433 = getelementptr inbounds float* %tmp3432, i64 1
+ %tmp3434 = getelementptr inbounds float* %tmp3433, i64 1
+ %tmp3435 = getelementptr inbounds float* %tmp3434, i64 1
+ %tmp3436 = getelementptr inbounds float* %tmp3435, i64 1
+ %tmp3437 = getelementptr inbounds float* %tmp3436, i64 1
+ %tmp3438 = getelementptr inbounds float* %tmp3437, i64 1
+ %tmp3439 = getelementptr inbounds float* %tmp3438, i64 1
+ %tmp3440 = getelementptr inbounds float* %tmp3439, i64 1
+ %tmp3441 = getelementptr inbounds float* %tmp3440, i64 1
+ %tmp3442 = getelementptr inbounds float* %tmp3441, i64 1
+ %tmp3443 = getelementptr inbounds float* %tmp3442, i64 1
+ %tmp3444 = getelementptr inbounds float* %tmp3443, i64 1
+ %tmp3445 = getelementptr inbounds float* %tmp3444, i64 1
+ %tmp3446 = getelementptr inbounds float* %tmp3445, i64 1
+ %tmp3447 = getelementptr inbounds float* %tmp3446, i64 1
+ %tmp3448 = getelementptr inbounds float* %tmp3447, i64 1
+ %tmp3449 = getelementptr inbounds float* %tmp3448, i64 1
+ %tmp3450 = getelementptr inbounds float* %tmp3449, i64 1
+ %tmp3451 = getelementptr inbounds float* %tmp3450, i64 1
+ %tmp3452 = getelementptr inbounds float* %tmp3451, i64 1
+ %tmp3453 = getelementptr inbounds float* %tmp3452, i64 1
+ %tmp3454 = getelementptr inbounds float* %tmp3453, i64 1
+ %tmp3455 = getelementptr inbounds float* %tmp3454, i64 1
+ %tmp3456 = getelementptr inbounds float* %tmp3455, i64 1
+ %tmp3457 = getelementptr inbounds float* %tmp3456, i64 1
+ %tmp3458 = getelementptr inbounds float* %tmp3457, i64 1
+ %tmp3459 = getelementptr inbounds float* %tmp3458, i64 1
+ %tmp3460 = getelementptr inbounds float* %tmp3459, i64 1
+ %tmp3461 = getelementptr inbounds float* %tmp3460, i64 1
+ %tmp3462 = getelementptr inbounds float* %tmp3461, i64 1
+ %tmp3463 = getelementptr inbounds float* %tmp3462, i64 1
+ %tmp3464 = getelementptr inbounds float* %tmp3463, i64 1
+ %tmp3465 = getelementptr inbounds float* %tmp3464, i64 1
+ %tmp3466 = getelementptr inbounds float* %tmp3465, i64 1
+ %tmp3467 = getelementptr inbounds float* %tmp3466, i64 1
+ %tmp3468 = getelementptr inbounds float* %tmp3467, i64 1
+ %tmp3469 = getelementptr inbounds float* %tmp3468, i64 1
+ %tmp3470 = getelementptr inbounds float* %tmp3469, i64 1
+ %tmp3471 = getelementptr inbounds float* %tmp3470, i64 1
+ %tmp3472 = getelementptr inbounds float* %tmp3471, i64 1
+ %tmp3473 = getelementptr inbounds float* %tmp3472, i64 1
+ %tmp3474 = getelementptr inbounds float* %tmp3473, i64 1
+ %tmp3475 = getelementptr inbounds float* %tmp3474, i64 1
+ %tmp3476 = getelementptr inbounds float* %tmp3475, i64 1
+ %tmp3477 = getelementptr inbounds float* %tmp3476, i64 1
+ %tmp3478 = getelementptr inbounds float* %tmp3477, i64 1
+ %tmp3479 = getelementptr inbounds float* %tmp3478, i64 1
+ %tmp3480 = getelementptr inbounds float* %tmp3479, i64 1
+ %tmp3481 = getelementptr inbounds float* %tmp3480, i64 1
+ %tmp3482 = getelementptr inbounds float* %tmp3481, i64 1
+ %tmp3483 = getelementptr inbounds float* %tmp3482, i64 1
+ %tmp3484 = getelementptr inbounds float* %tmp3483, i64 1
+ %tmp3485 = getelementptr inbounds float* %tmp3484, i64 1
+ %tmp3486 = getelementptr inbounds float* %tmp3485, i64 1
+ %tmp3487 = getelementptr inbounds float* %tmp3486, i64 1
+ %tmp3488 = getelementptr inbounds float* %tmp3487, i64 1
+ %tmp3489 = getelementptr inbounds float* %tmp3488, i64 1
+ %tmp3490 = getelementptr inbounds float* %tmp3489, i64 1
+ %tmp3491 = getelementptr inbounds float* %tmp3490, i64 1
+ %tmp3492 = getelementptr inbounds float* %tmp3491, i64 1
+ %tmp3493 = getelementptr inbounds float* %tmp3492, i64 1
+ %tmp3494 = getelementptr inbounds float* %tmp3493, i64 1
+ %tmp3495 = getelementptr inbounds float* %tmp3494, i64 1
+ %tmp3496 = getelementptr inbounds float* %tmp3495, i64 1
+ %tmp3497 = getelementptr inbounds float* %tmp3496, i64 1
+ %tmp3498 = getelementptr inbounds float* %tmp3497, i64 1
+ %tmp3499 = getelementptr inbounds float* %tmp3498, i64 1
+ %tmp3500 = getelementptr inbounds float* %tmp3499, i64 1
+ %tmp3501 = getelementptr inbounds float* %tmp3500, i64 1
+ %tmp3502 = getelementptr inbounds float* %tmp3501, i64 1
+ %tmp3503 = getelementptr inbounds float* %tmp3502, i64 1
+ %tmp3504 = getelementptr inbounds float* %tmp3503, i64 1
+ %tmp3505 = getelementptr inbounds float* %tmp3504, i64 1
+ %tmp3506 = getelementptr inbounds float* %tmp3505, i64 1
+ %tmp3507 = getelementptr inbounds float* %tmp3506, i64 1
+ %tmp3508 = getelementptr inbounds float* %tmp3507, i64 1
+ %tmp3509 = getelementptr inbounds float* %tmp3508, i64 1
+ %tmp3510 = getelementptr inbounds float* %tmp3509, i64 1
+ %tmp3511 = getelementptr inbounds float* %tmp3510, i64 1
+ %tmp3512 = getelementptr inbounds float* %tmp3511, i64 1
+ %tmp3513 = getelementptr inbounds float* %tmp3512, i64 1
+ %tmp3514 = getelementptr inbounds float* %tmp3513, i64 1
+ %tmp3515 = getelementptr inbounds float* %tmp3514, i64 1
+ %tmp3516 = getelementptr inbounds float* %tmp3515, i64 1
+ %tmp3517 = getelementptr inbounds float* %tmp3516, i64 1
+ %tmp3518 = getelementptr inbounds float* %tmp3517, i64 1
+ %tmp3519 = getelementptr inbounds float* %tmp3518, i64 1
+ %tmp3520 = getelementptr inbounds float* %tmp3519, i64 1
+ %tmp3521 = getelementptr inbounds float* %tmp3520, i64 1
+ %tmp3522 = getelementptr inbounds float* %tmp3521, i64 1
+ %tmp3523 = getelementptr inbounds float* %tmp3522, i64 1
+ %tmp3524 = getelementptr inbounds float* %tmp3523, i64 1
+ %tmp3525 = getelementptr inbounds float* %tmp3524, i64 1
+ %tmp3526 = getelementptr inbounds float* %tmp3525, i64 1
+ %tmp3527 = getelementptr inbounds float* %tmp3526, i64 1
+ %tmp3528 = getelementptr inbounds float* %tmp3527, i64 1
+ %tmp3529 = getelementptr inbounds float* %tmp3528, i64 1
+ %tmp3530 = getelementptr inbounds float* %tmp3529, i64 1
+ %tmp3531 = getelementptr inbounds float* %tmp3530, i64 1
+ %tmp3532 = getelementptr inbounds float* %tmp3531, i64 1
+ %tmp3533 = getelementptr inbounds float* %tmp3532, i64 1
+ %tmp3534 = getelementptr inbounds float* %tmp3533, i64 1
+ %tmp3535 = getelementptr inbounds float* %tmp3534, i64 1
+ %tmp3536 = getelementptr inbounds float* %tmp3535, i64 1
+ %tmp3537 = getelementptr inbounds float* %tmp3536, i64 1
+ %tmp3538 = getelementptr inbounds float* %tmp3537, i64 1
+ %tmp3539 = getelementptr inbounds float* %tmp3538, i64 1
+ %tmp3540 = getelementptr inbounds float* %tmp3539, i64 1
+ %tmp3541 = getelementptr inbounds float* %tmp3540, i64 1
+ %tmp3542 = getelementptr inbounds float* %tmp3541, i64 1
+ %tmp3543 = getelementptr inbounds float* %tmp3542, i64 1
+ %tmp3544 = getelementptr inbounds float* %tmp3543, i64 1
+ %tmp3545 = getelementptr inbounds float* %tmp3544, i64 1
+ %tmp3546 = getelementptr inbounds float* %tmp3545, i64 1
+ %tmp3547 = getelementptr inbounds float* %tmp3546, i64 1
+ %tmp3548 = getelementptr inbounds float* %tmp3547, i64 1
+ %tmp3549 = getelementptr inbounds float* %tmp3548, i64 1
+ %tmp3550 = getelementptr inbounds float* %tmp3549, i64 1
+ %tmp3551 = getelementptr inbounds float* %tmp3550, i64 1
+ %tmp3552 = getelementptr inbounds float* %tmp3551, i64 1
+ %tmp3553 = getelementptr inbounds float* %tmp3552, i64 1
+ %tmp3554 = getelementptr inbounds float* %tmp3553, i64 1
+ %tmp3555 = getelementptr inbounds float* %tmp3554, i64 1
+ %tmp3556 = getelementptr inbounds float* %tmp3555, i64 1
+ %tmp3557 = getelementptr inbounds float* %tmp3556, i64 1
+ %tmp3558 = getelementptr inbounds float* %tmp3557, i64 1
+ %tmp3559 = getelementptr inbounds float* %tmp3558, i64 1
+ %tmp3560 = getelementptr inbounds float* %tmp3559, i64 1
+ %tmp3561 = getelementptr inbounds float* %tmp3560, i64 1
+ %tmp3562 = getelementptr inbounds float* %tmp3561, i64 1
+ %tmp3563 = getelementptr inbounds float* %tmp3562, i64 1
+ %tmp3564 = getelementptr inbounds float* %tmp3563, i64 1
+ %tmp3565 = getelementptr inbounds float* %tmp3564, i64 1
+ %tmp3566 = getelementptr inbounds float* %tmp3565, i64 1
+ %tmp3567 = getelementptr inbounds float* %tmp3566, i64 1
+ %tmp3568 = getelementptr inbounds float* %tmp3567, i64 1
+ %tmp3569 = getelementptr inbounds float* %tmp3568, i64 1
+ %tmp3570 = getelementptr inbounds float* %tmp3569, i64 1
+ %tmp3571 = getelementptr inbounds float* %tmp3570, i64 1
+ %tmp3572 = getelementptr inbounds float* %tmp3571, i64 1
+ %tmp3573 = getelementptr inbounds float* %tmp3572, i64 1
+ %tmp3574 = getelementptr inbounds float* %tmp3573, i64 1
+ %tmp3575 = getelementptr inbounds float* %tmp3574, i64 1
+ %tmp3576 = getelementptr inbounds float* %tmp3575, i64 1
+ %tmp3577 = getelementptr inbounds float* %tmp3576, i64 1
+ %tmp3578 = getelementptr inbounds float* %tmp3577, i64 1
+ %tmp3579 = getelementptr inbounds float* %tmp3578, i64 1
+ %tmp3580 = getelementptr inbounds float* %tmp3579, i64 1
+ %tmp3581 = getelementptr inbounds float* %tmp3580, i64 1
+ %tmp3582 = getelementptr inbounds float* %tmp3581, i64 1
+ %tmp3583 = getelementptr inbounds float* %tmp3582, i64 1
+ %tmp3584 = getelementptr inbounds float* %tmp3583, i64 1
+ %tmp3585 = getelementptr inbounds float* %tmp3584, i64 1
+ %tmp3586 = getelementptr inbounds float* %tmp3585, i64 1
+ %tmp3587 = getelementptr inbounds float* %tmp3586, i64 1
+ %tmp3588 = getelementptr inbounds float* %tmp3587, i64 1
+ %tmp3589 = getelementptr inbounds float* %tmp3588, i64 1
+ %tmp3590 = getelementptr inbounds float* %tmp3589, i64 1
+ %tmp3591 = getelementptr inbounds float* %tmp3590, i64 1
+ %tmp3592 = getelementptr inbounds float* %tmp3591, i64 1
+ %tmp3593 = getelementptr inbounds float* %tmp3592, i64 1
+ %tmp3594 = getelementptr inbounds float* %tmp3593, i64 1
+ %tmp3595 = getelementptr inbounds float* %tmp3594, i64 1
+ %tmp3596 = getelementptr inbounds float* %tmp3595, i64 1
+ %tmp3597 = getelementptr inbounds float* %tmp3596, i64 1
+ %tmp3598 = getelementptr inbounds float* %tmp3597, i64 1
+ %tmp3599 = getelementptr inbounds float* %tmp3598, i64 1
+ %tmp3600 = getelementptr inbounds float* %tmp3599, i64 1
+ %tmp3601 = getelementptr inbounds float* %tmp3600, i64 1
+ %tmp3602 = getelementptr inbounds float* %tmp3601, i64 1
+ %tmp3603 = getelementptr inbounds float* %tmp3602, i64 1
+ %tmp3604 = getelementptr inbounds float* %tmp3603, i64 1
+ %tmp3605 = getelementptr inbounds float* %tmp3604, i64 1
+ %tmp3606 = getelementptr inbounds float* %tmp3605, i64 1
+ %tmp3607 = getelementptr inbounds float* %tmp3606, i64 1
+ %tmp3608 = getelementptr inbounds float* %tmp3607, i64 1
+ %tmp3609 = getelementptr inbounds float* %tmp3608, i64 1
+ %tmp3610 = getelementptr inbounds float* %tmp3609, i64 1
+ %tmp3611 = getelementptr inbounds float* %tmp3610, i64 1
+ %tmp3612 = getelementptr inbounds float* %tmp3611, i64 1
+ %tmp3613 = getelementptr inbounds float* %tmp3612, i64 1
+ %tmp3614 = getelementptr inbounds float* %tmp3613, i64 1
+ %tmp3615 = getelementptr inbounds float* %tmp3614, i64 1
+ %tmp3616 = getelementptr inbounds float* %tmp3615, i64 1
+ %tmp3617 = getelementptr inbounds float* %tmp3616, i64 1
+ %tmp3618 = getelementptr inbounds float* %tmp3617, i64 1
+ %tmp3619 = getelementptr inbounds float* %tmp3618, i64 1
+ %tmp3620 = getelementptr inbounds float* %tmp3619, i64 1
+ %tmp3621 = getelementptr inbounds float* %tmp3620, i64 1
+ %tmp3622 = getelementptr inbounds float* %tmp3621, i64 1
+ %tmp3623 = getelementptr inbounds float* %tmp3622, i64 1
+ %tmp3624 = getelementptr inbounds float* %tmp3623, i64 1
+ %tmp3625 = getelementptr inbounds float* %tmp3624, i64 1
+ %tmp3626 = getelementptr inbounds float* %tmp3625, i64 1
+ %tmp3627 = getelementptr inbounds float* %tmp3626, i64 1
+ %tmp3628 = getelementptr inbounds float* %tmp3627, i64 1
+ %tmp3629 = getelementptr inbounds float* %tmp3628, i64 1
+ %tmp3630 = getelementptr inbounds float* %tmp3629, i64 1
+ %tmp3631 = getelementptr inbounds float* %tmp3630, i64 1
+ %tmp3632 = getelementptr inbounds float* %tmp3631, i64 1
+ %tmp3633 = getelementptr inbounds float* %tmp3632, i64 1
+ %tmp3634 = getelementptr inbounds float* %tmp3633, i64 1
+ %tmp3635 = getelementptr inbounds float* %tmp3634, i64 1
+ %tmp3636 = getelementptr inbounds float* %tmp3635, i64 1
+ %tmp3637 = getelementptr inbounds float* %tmp3636, i64 1
+ %tmp3638 = getelementptr inbounds float* %tmp3637, i64 1
+ %tmp3639 = getelementptr inbounds float* %tmp3638, i64 1
+ %tmp3640 = getelementptr inbounds float* %tmp3639, i64 1
+ %tmp3641 = getelementptr inbounds float* %tmp3640, i64 1
+ %tmp3642 = getelementptr inbounds float* %tmp3641, i64 1
+ %tmp3643 = getelementptr inbounds float* %tmp3642, i64 1
+ %tmp3644 = getelementptr inbounds float* %tmp3643, i64 1
+ %tmp3645 = getelementptr inbounds float* %tmp3644, i64 1
+ %tmp3646 = getelementptr inbounds float* %tmp3645, i64 1
+ %tmp3647 = getelementptr inbounds float* %tmp3646, i64 1
+ %tmp3648 = getelementptr inbounds float* %tmp3647, i64 1
+ %tmp3649 = getelementptr inbounds float* %tmp3648, i64 1
+ %tmp3650 = getelementptr inbounds float* %tmp3649, i64 1
+ %tmp3651 = getelementptr inbounds float* %tmp3650, i64 1
+ %tmp3652 = getelementptr inbounds float* %tmp3651, i64 1
+ %tmp3653 = getelementptr inbounds float* %tmp3652, i64 1
+ %tmp3654 = getelementptr inbounds float* %tmp3653, i64 1
+ %tmp3655 = getelementptr inbounds float* %tmp3654, i64 1
+ %tmp3656 = getelementptr inbounds float* %tmp3655, i64 1
+ %tmp3657 = getelementptr inbounds float* %tmp3656, i64 1
+ %tmp3658 = getelementptr inbounds float* %tmp3657, i64 1
+ %tmp3659 = getelementptr inbounds float* %tmp3658, i64 1
+ %tmp3660 = getelementptr inbounds float* %tmp3659, i64 1
+ %tmp3661 = getelementptr inbounds float* %tmp3660, i64 1
+ %tmp3662 = getelementptr inbounds float* %tmp3661, i64 1
+ %tmp3663 = getelementptr inbounds float* %tmp3662, i64 1
+ %tmp3664 = getelementptr inbounds float* %tmp3663, i64 1
+ %tmp3665 = getelementptr inbounds float* %tmp3664, i64 1
+ %tmp3666 = getelementptr inbounds float* %tmp3665, i64 1
+ %tmp3667 = getelementptr inbounds float* %tmp3666, i64 1
+ %tmp3668 = getelementptr inbounds float* %tmp3667, i64 1
+ %tmp3669 = getelementptr inbounds float* %tmp3668, i64 1
+ %tmp3670 = getelementptr inbounds float* %tmp3669, i64 1
+ %tmp3671 = getelementptr inbounds float* %tmp3670, i64 1
+ %tmp3672 = getelementptr inbounds float* %tmp3671, i64 1
+ %tmp3673 = getelementptr inbounds float* %tmp3672, i64 1
+ %tmp3674 = getelementptr inbounds float* %tmp3673, i64 1
+ %tmp3675 = getelementptr inbounds float* %tmp3674, i64 1
+ %tmp3676 = getelementptr inbounds float* %tmp3675, i64 1
+ %tmp3677 = getelementptr inbounds float* %tmp3676, i64 1
+ %tmp3678 = getelementptr inbounds float* %tmp3677, i64 1
+ %tmp3679 = getelementptr inbounds float* %tmp3678, i64 1
+ %tmp3680 = getelementptr inbounds float* %tmp3679, i64 1
+ %tmp3681 = getelementptr inbounds float* %tmp3680, i64 1
+ %tmp3682 = getelementptr inbounds float* %tmp3681, i64 1
+ %tmp3683 = getelementptr inbounds float* %tmp3682, i64 1
+ %tmp3684 = getelementptr inbounds float* %tmp3683, i64 1
+ %tmp3685 = getelementptr inbounds float* %tmp3684, i64 1
+ %tmp3686 = getelementptr inbounds float* %tmp3685, i64 1
+ %tmp3687 = getelementptr inbounds float* %tmp3686, i64 1
+ %tmp3688 = getelementptr inbounds float* %tmp3687, i64 1
+ %tmp3689 = getelementptr inbounds float* %tmp3688, i64 1
+ %tmp3690 = getelementptr inbounds float* %tmp3689, i64 1
+ %tmp3691 = getelementptr inbounds float* %tmp3690, i64 1
+ %tmp3692 = getelementptr inbounds float* %tmp3691, i64 1
+ %tmp3693 = getelementptr inbounds float* %tmp3692, i64 1
+ %tmp3694 = getelementptr inbounds float* %tmp3693, i64 1
+ %tmp3695 = getelementptr inbounds float* %tmp3694, i64 1
+ %tmp3696 = getelementptr inbounds float* %tmp3695, i64 1
+ %tmp3697 = getelementptr inbounds float* %tmp3696, i64 1
+ %tmp3698 = getelementptr inbounds float* %tmp3697, i64 1
+ %tmp3699 = getelementptr inbounds float* %tmp3698, i64 1
+ %tmp3700 = getelementptr inbounds float* %tmp3699, i64 1
+ %tmp3701 = getelementptr inbounds float* %tmp3700, i64 1
+ %tmp3702 = getelementptr inbounds float* %tmp3701, i64 1
+ %tmp3703 = getelementptr inbounds float* %tmp3702, i64 1
+ %tmp3704 = getelementptr inbounds float* %tmp3703, i64 1
+ %tmp3705 = getelementptr inbounds float* %tmp3704, i64 1
+ %tmp3706 = getelementptr inbounds float* %tmp3705, i64 1
+ %tmp3707 = getelementptr inbounds float* %tmp3706, i64 1
+ %tmp3708 = getelementptr inbounds float* %tmp3707, i64 1
+ %tmp3709 = getelementptr inbounds float* %tmp3708, i64 1
+ %tmp3710 = getelementptr inbounds float* %tmp3709, i64 1
+ %tmp3711 = getelementptr inbounds float* %tmp3710, i64 1
+ %tmp3712 = getelementptr inbounds float* %tmp3711, i64 1
+ %tmp3713 = getelementptr inbounds float* %tmp3712, i64 1
+ %tmp3714 = getelementptr inbounds float* %tmp3713, i64 1
+ %tmp3715 = getelementptr inbounds float* %tmp3714, i64 1
+ %tmp3716 = getelementptr inbounds float* %tmp3715, i64 1
+ %tmp3717 = getelementptr inbounds float* %tmp3716, i64 1
+ %tmp3718 = getelementptr inbounds float* %tmp3717, i64 1
+ %tmp3719 = getelementptr inbounds float* %tmp3718, i64 1
+ %tmp3720 = getelementptr inbounds float* %tmp3719, i64 1
+ %tmp3721 = getelementptr inbounds float* %tmp3720, i64 1
+ %tmp3722 = getelementptr inbounds float* %tmp3721, i64 1
+ %tmp3723 = getelementptr inbounds float* %tmp3722, i64 1
+ %tmp3724 = getelementptr inbounds float* %tmp3723, i64 1
+ %tmp3725 = getelementptr inbounds float* %tmp3724, i64 1
+ %tmp3726 = getelementptr inbounds float* %tmp3725, i64 1
+ %tmp3727 = getelementptr inbounds float* %tmp3726, i64 1
+ %tmp3728 = getelementptr inbounds float* %tmp3727, i64 1
+ %tmp3729 = getelementptr inbounds float* %tmp3728, i64 1
+ %tmp3730 = getelementptr inbounds float* %tmp3729, i64 1
+ %tmp3731 = getelementptr inbounds float* %tmp3730, i64 1
+ %tmp3732 = getelementptr inbounds float* %tmp3731, i64 1
+ %tmp3733 = getelementptr inbounds float* %tmp3732, i64 1
+ %tmp3734 = getelementptr inbounds float* %tmp3733, i64 1
+ %tmp3735 = getelementptr inbounds float* %tmp3734, i64 1
+ %tmp3736 = getelementptr inbounds float* %tmp3735, i64 1
+ %tmp3737 = getelementptr inbounds float* %tmp3736, i64 1
+ %tmp3738 = getelementptr inbounds float* %tmp3737, i64 1
+ %tmp3739 = getelementptr inbounds float* %tmp3738, i64 1
+ %tmp3740 = getelementptr inbounds float* %tmp3739, i64 1
+ %tmp3741 = getelementptr inbounds float* %tmp3740, i64 1
+ %tmp3742 = getelementptr inbounds float* %tmp3741, i64 1
+ %tmp3743 = getelementptr inbounds float* %tmp3742, i64 1
+ %tmp3744 = getelementptr inbounds float* %tmp3743, i64 1
+ %tmp3745 = getelementptr inbounds float* %tmp3744, i64 1
+ %tmp3746 = getelementptr inbounds float* %tmp3745, i64 1
+ %tmp3747 = getelementptr inbounds float* %tmp3746, i64 1
+ %tmp3748 = getelementptr inbounds float* %tmp3747, i64 1
+ %tmp3749 = getelementptr inbounds float* %tmp3748, i64 1
+ %tmp3750 = getelementptr inbounds float* %tmp3749, i64 1
+ %tmp3751 = getelementptr inbounds float* %tmp3750, i64 1
+ %tmp3752 = getelementptr inbounds float* %tmp3751, i64 1
+ %tmp3753 = getelementptr inbounds float* %tmp3752, i64 1
+ %tmp3754 = getelementptr inbounds float* %tmp3753, i64 1
+ %tmp3755 = getelementptr inbounds float* %tmp3754, i64 1
+ %tmp3756 = getelementptr inbounds float* %tmp3755, i64 1
+ %tmp3757 = getelementptr inbounds float* %tmp3756, i64 1
+ %tmp3758 = getelementptr inbounds float* %tmp3757, i64 1
+ %tmp3759 = getelementptr inbounds float* %tmp3758, i64 1
+ %tmp3760 = getelementptr inbounds float* %tmp3759, i64 1
+ %tmp3761 = getelementptr inbounds float* %tmp3760, i64 1
+ %tmp3762 = getelementptr inbounds float* %tmp3761, i64 1
+ %tmp3763 = getelementptr inbounds float* %tmp3762, i64 1
+ %tmp3764 = getelementptr inbounds float* %tmp3763, i64 1
+ %tmp3765 = getelementptr inbounds float* %tmp3764, i64 1
+ %tmp3766 = getelementptr inbounds float* %tmp3765, i64 1
+ %tmp3767 = getelementptr inbounds float* %tmp3766, i64 1
+ %tmp3768 = getelementptr inbounds float* %tmp3767, i64 1
+ %tmp3769 = getelementptr inbounds float* %tmp3768, i64 1
+ %tmp3770 = getelementptr inbounds float* %tmp3769, i64 1
+ %tmp3771 = getelementptr inbounds float* %tmp3770, i64 1
+ %tmp3772 = getelementptr inbounds float* %tmp3771, i64 1
+ %tmp3773 = getelementptr inbounds float* %tmp3772, i64 1
+ %tmp3774 = getelementptr inbounds float* %tmp3773, i64 1
+ %tmp3775 = getelementptr inbounds float* %tmp3774, i64 1
+ %tmp3776 = getelementptr inbounds float* %tmp3775, i64 1
+ %tmp3777 = getelementptr inbounds float* %tmp3776, i64 1
+ %tmp3778 = getelementptr inbounds float* %tmp3777, i64 1
+ %tmp3779 = getelementptr inbounds float* %tmp3778, i64 1
+ %tmp3780 = getelementptr inbounds float* %tmp3779, i64 1
+ %tmp3781 = getelementptr inbounds float* %tmp3780, i64 1
+ %tmp3782 = getelementptr inbounds float* %tmp3781, i64 1
+ %tmp3783 = getelementptr inbounds float* %tmp3782, i64 1
+ %tmp3784 = getelementptr inbounds float* %tmp3783, i64 1
+ %tmp3785 = getelementptr inbounds float* %tmp3784, i64 1
+ %tmp3786 = getelementptr inbounds float* %tmp3785, i64 1
+ %tmp3787 = getelementptr inbounds float* %tmp3786, i64 1
+ %tmp3788 = getelementptr inbounds float* %tmp3787, i64 1
+ %tmp3789 = getelementptr inbounds float* %tmp3788, i64 1
+ %tmp3790 = getelementptr inbounds float* %tmp3789, i64 1
+ %tmp3791 = getelementptr inbounds float* %tmp3790, i64 1
+ %tmp3792 = getelementptr inbounds float* %tmp3791, i64 1
+ %tmp3793 = getelementptr inbounds float* %tmp3792, i64 1
+ %tmp3794 = getelementptr inbounds float* %tmp3793, i64 1
+ %tmp3795 = getelementptr inbounds float* %tmp3794, i64 1
+ %tmp3796 = getelementptr inbounds float* %tmp3795, i64 1
+ %tmp3797 = getelementptr inbounds float* %tmp3796, i64 1
+ %tmp3798 = getelementptr inbounds float* %tmp3797, i64 1
+ %tmp3799 = getelementptr inbounds float* %tmp3798, i64 1
+ %tmp3800 = getelementptr inbounds float* %tmp3799, i64 1
+ %tmp3801 = getelementptr inbounds float* %tmp3800, i64 1
+ %tmp3802 = getelementptr inbounds float* %tmp3801, i64 1
+ %tmp3803 = getelementptr inbounds float* %tmp3802, i64 1
+ %tmp3804 = getelementptr inbounds float* %tmp3803, i64 1
+ %tmp3805 = getelementptr inbounds float* %tmp3804, i64 1
+ %tmp3806 = getelementptr inbounds float* %tmp3805, i64 1
+ %tmp3807 = getelementptr inbounds float* %tmp3806, i64 1
+ %tmp3808 = getelementptr inbounds float* %tmp3807, i64 1
+ %tmp3809 = getelementptr inbounds float* %tmp3808, i64 1
+ %tmp3810 = getelementptr inbounds float* %tmp3809, i64 1
+ %tmp3811 = getelementptr inbounds float* %tmp3810, i64 1
+ %tmp3812 = getelementptr inbounds float* %tmp3811, i64 1
+ %tmp3813 = getelementptr inbounds float* %tmp3812, i64 1
+ %tmp3814 = getelementptr inbounds float* %tmp3813, i64 1
+ %tmp3815 = getelementptr inbounds float* %tmp3814, i64 1
+ %tmp3816 = getelementptr inbounds float* %tmp3815, i64 1
+ %tmp3817 = getelementptr inbounds float* %tmp3816, i64 1
+ %tmp3818 = getelementptr inbounds float* %tmp3817, i64 1
+ %tmp3819 = getelementptr inbounds float* %tmp3818, i64 1
+ %tmp3820 = getelementptr inbounds float* %tmp3819, i64 1
+ %tmp3821 = getelementptr inbounds float* %tmp3820, i64 1
+ %tmp3822 = getelementptr inbounds float* %tmp3821, i64 1
+ %tmp3823 = getelementptr inbounds float* %tmp3822, i64 1
+ %tmp3824 = getelementptr inbounds float* %tmp3823, i64 1
+ %tmp3825 = getelementptr inbounds float* %tmp3824, i64 1
+ %tmp3826 = getelementptr inbounds float* %tmp3825, i64 1
+ %tmp3827 = getelementptr inbounds float* %tmp3826, i64 1
+ %tmp3828 = getelementptr inbounds float* %tmp3827, i64 1
+ %tmp3829 = getelementptr inbounds float* %tmp3828, i64 1
+ %tmp3830 = getelementptr inbounds float* %tmp3829, i64 1
+ %tmp3831 = getelementptr inbounds float* %tmp3830, i64 1
+ %tmp3832 = getelementptr inbounds float* %tmp3831, i64 1
+ %tmp3833 = getelementptr inbounds float* %tmp3832, i64 1
+ %tmp3834 = getelementptr inbounds float* %tmp3833, i64 1
+ %tmp3835 = getelementptr inbounds float* %tmp3834, i64 1
+ %tmp3836 = getelementptr inbounds float* %tmp3835, i64 1
+ %tmp3837 = getelementptr inbounds float* %tmp3836, i64 1
+ %tmp3838 = getelementptr inbounds float* %tmp3837, i64 1
+ %tmp3839 = getelementptr inbounds float* %tmp3838, i64 1
+ %tmp3840 = getelementptr inbounds float* %tmp3839, i64 1
+ %tmp3841 = getelementptr inbounds float* %tmp3840, i64 1
+ %tmp3842 = getelementptr inbounds float* %tmp3841, i64 1
+ %tmp3843 = getelementptr inbounds float* %tmp3842, i64 1
+ %tmp3844 = getelementptr inbounds float* %tmp3843, i64 1
+ %tmp3845 = getelementptr inbounds float* %tmp3844, i64 1
+ %tmp3846 = getelementptr inbounds float* %tmp3845, i64 1
+ %tmp3847 = getelementptr inbounds float* %tmp3846, i64 1
+ %tmp3848 = getelementptr inbounds float* %tmp3847, i64 1
+ %tmp3849 = getelementptr inbounds float* %tmp3848, i64 1
+ %tmp3850 = getelementptr inbounds float* %tmp3849, i64 1
+ %tmp3851 = getelementptr inbounds float* %tmp3850, i64 1
+ %tmp3852 = getelementptr inbounds float* %tmp3851, i64 1
+ %tmp3853 = getelementptr inbounds float* %tmp3852, i64 1
+ %tmp3854 = getelementptr inbounds float* %tmp3853, i64 1
+ %tmp3855 = getelementptr inbounds float* %tmp3854, i64 1
+ %tmp3856 = getelementptr inbounds float* %tmp3855, i64 1
+ %tmp3857 = getelementptr inbounds float* %tmp3856, i64 1
+ %tmp3858 = getelementptr inbounds float* %tmp3857, i64 1
+ %tmp3859 = getelementptr inbounds float* %tmp3858, i64 1
+ %tmp3860 = getelementptr inbounds float* %tmp3859, i64 1
+ %tmp3861 = getelementptr inbounds float* %tmp3860, i64 1
+ %tmp3862 = getelementptr inbounds float* %tmp3861, i64 1
+ %tmp3863 = getelementptr inbounds float* %tmp3862, i64 1
+ %tmp3864 = getelementptr inbounds float* %tmp3863, i64 1
+ %tmp3865 = getelementptr inbounds float* %tmp3864, i64 1
+ %tmp3866 = getelementptr inbounds float* %tmp3865, i64 1
+ %tmp3867 = getelementptr inbounds float* %tmp3866, i64 1
+ %tmp3868 = getelementptr inbounds float* %tmp3867, i64 1
+ %tmp3869 = getelementptr inbounds float* %tmp3868, i64 1
+ %tmp3870 = getelementptr inbounds float* %tmp3869, i64 1
+ %tmp3871 = getelementptr inbounds float* %tmp3870, i64 1
+ %tmp3872 = getelementptr inbounds float* %tmp3871, i64 1
+ %tmp3873 = getelementptr inbounds float* %tmp3872, i64 1
+ %tmp3874 = getelementptr inbounds float* %tmp3873, i64 1
+ %tmp3875 = getelementptr inbounds float* %tmp3874, i64 1
+ %tmp3876 = getelementptr inbounds float* %tmp3875, i64 1
+ %tmp3877 = getelementptr inbounds float* %tmp3876, i64 1
+ %tmp3878 = getelementptr inbounds float* %tmp3877, i64 1
+ %tmp3879 = getelementptr inbounds float* %tmp3878, i64 1
+ %tmp3880 = getelementptr inbounds float* %tmp3879, i64 1
+ %tmp3881 = getelementptr inbounds float* %tmp3880, i64 1
+ %tmp3882 = getelementptr inbounds float* %tmp3881, i64 1
+ %tmp3883 = getelementptr inbounds float* %tmp3882, i64 1
+ %tmp3884 = getelementptr inbounds float* %tmp3883, i64 1
+ %tmp3885 = getelementptr inbounds float* %tmp3884, i64 1
+ %tmp3886 = getelementptr inbounds float* %tmp3885, i64 1
+ %tmp3887 = getelementptr inbounds float* %tmp3886, i64 1
+ %tmp3888 = getelementptr inbounds float* %tmp3887, i64 1
+ %tmp3889 = getelementptr inbounds float* %tmp3888, i64 1
+ %tmp3890 = getelementptr inbounds float* %tmp3889, i64 1
+ %tmp3891 = getelementptr inbounds float* %tmp3890, i64 1
+ %tmp3892 = getelementptr inbounds float* %tmp3891, i64 1
+ %tmp3893 = getelementptr inbounds float* %tmp3892, i64 1
+ %tmp3894 = getelementptr inbounds float* %tmp3893, i64 1
+ %tmp3895 = getelementptr inbounds float* %tmp3894, i64 1
+ %tmp3896 = getelementptr inbounds float* %tmp3895, i64 1
+ %tmp3897 = getelementptr inbounds float* %tmp3896, i64 1
+ %tmp3898 = getelementptr inbounds float* %tmp3897, i64 1
+ %tmp3899 = getelementptr inbounds float* %tmp3898, i64 1
+ %tmp3900 = getelementptr inbounds float* %tmp3899, i64 1
+ %tmp3901 = getelementptr inbounds float* %tmp3900, i64 1
+ %tmp3902 = getelementptr inbounds float* %tmp3901, i64 1
+ %tmp3903 = getelementptr inbounds float* %tmp3902, i64 1
+ %tmp3904 = getelementptr inbounds float* %tmp3903, i64 1
+ %tmp3905 = getelementptr inbounds float* %tmp3904, i64 1
+ %tmp3906 = getelementptr inbounds float* %tmp3905, i64 1
+ %tmp3907 = getelementptr inbounds float* %tmp3906, i64 1
+ %tmp3908 = getelementptr inbounds float* %tmp3907, i64 1
+ %tmp3909 = getelementptr inbounds float* %tmp3908, i64 1
+ %tmp3910 = getelementptr inbounds float* %tmp3909, i64 1
+ %tmp3911 = getelementptr inbounds float* %tmp3910, i64 1
+ %tmp3912 = getelementptr inbounds float* %tmp3911, i64 1
+ %tmp3913 = getelementptr inbounds float* %tmp3912, i64 1
+ %tmp3914 = getelementptr inbounds float* %tmp3913, i64 1
+ %tmp3915 = getelementptr inbounds float* %tmp3914, i64 1
+ %tmp3916 = getelementptr inbounds float* %tmp3915, i64 1
+ %tmp3917 = getelementptr inbounds float* %tmp3916, i64 1
+ %tmp3918 = getelementptr inbounds float* %tmp3917, i64 1
+ %tmp3919 = getelementptr inbounds float* %tmp3918, i64 1
+ %tmp3920 = getelementptr inbounds float* %tmp3919, i64 1
+ %tmp3921 = getelementptr inbounds float* %tmp3920, i64 1
+ %tmp3922 = getelementptr inbounds float* %tmp3921, i64 1
+ %tmp3923 = getelementptr inbounds float* %tmp3922, i64 1
+ %tmp3924 = getelementptr inbounds float* %tmp3923, i64 1
+ %tmp3925 = getelementptr inbounds float* %tmp3924, i64 1
+ %tmp3926 = getelementptr inbounds float* %tmp3925, i64 1
+ %tmp3927 = getelementptr inbounds float* %tmp3926, i64 1
+ %tmp3928 = getelementptr inbounds float* %tmp3927, i64 1
+ %tmp3929 = getelementptr inbounds float* %tmp3928, i64 1
+ %tmp3930 = getelementptr inbounds float* %tmp3929, i64 1
+ %tmp3931 = getelementptr inbounds float* %tmp3930, i64 1
+ %tmp3932 = getelementptr inbounds float* %tmp3931, i64 1
+ %tmp3933 = getelementptr inbounds float* %tmp3932, i64 1
+ %tmp3934 = getelementptr inbounds float* %tmp3933, i64 1
+ %tmp3935 = getelementptr inbounds float* %tmp3934, i64 1
+ %tmp3936 = getelementptr inbounds float* %tmp3935, i64 1
+ %tmp3937 = getelementptr inbounds float* %tmp3936, i64 1
+ %tmp3938 = getelementptr inbounds float* %tmp3937, i64 1
+ %tmp3939 = getelementptr inbounds float* %tmp3938, i64 1
+ %tmp3940 = getelementptr inbounds float* %tmp3939, i64 1
+ %tmp3941 = getelementptr inbounds float* %tmp3940, i64 1
+ %tmp3942 = getelementptr inbounds float* %tmp3941, i64 1
+ %tmp3943 = getelementptr inbounds float* %tmp3942, i64 1
+ %tmp3944 = getelementptr inbounds float* %tmp3943, i64 1
+ %tmp3945 = getelementptr inbounds float* %tmp3944, i64 1
+ %tmp3946 = getelementptr inbounds float* %tmp3945, i64 1
+ %tmp3947 = getelementptr inbounds float* %tmp3946, i64 1
+ %tmp3948 = getelementptr inbounds float* %tmp3947, i64 1
+ %tmp3949 = getelementptr inbounds float* %tmp3948, i64 1
+ %tmp3950 = getelementptr inbounds float* %tmp3949, i64 1
+ %tmp3951 = getelementptr inbounds float* %tmp3950, i64 1
+ %tmp3952 = getelementptr inbounds float* %tmp3951, i64 1
+ %tmp3953 = getelementptr inbounds float* %tmp3952, i64 1
+ %tmp3954 = getelementptr inbounds float* %tmp3953, i64 1
+ %tmp3955 = getelementptr inbounds float* %tmp3954, i64 1
+ %tmp3956 = getelementptr inbounds float* %tmp3955, i64 1
+ %tmp3957 = getelementptr inbounds float* %tmp3956, i64 1
+ %tmp3958 = getelementptr inbounds float* %tmp3957, i64 1
+ %tmp3959 = getelementptr inbounds float* %tmp3958, i64 1
+ %tmp3960 = getelementptr inbounds float* %tmp3959, i64 1
+ %tmp3961 = getelementptr inbounds float* %tmp3960, i64 1
+ %tmp3962 = getelementptr inbounds float* %tmp3961, i64 1
+ %tmp3963 = getelementptr inbounds float* %tmp3962, i64 1
+ %tmp3964 = getelementptr inbounds float* %tmp3963, i64 1
+ %tmp3965 = getelementptr inbounds float* %tmp3964, i64 1
+ %tmp3966 = getelementptr inbounds float* %tmp3965, i64 1
+ %tmp3967 = getelementptr inbounds float* %tmp3966, i64 1
+ %tmp3968 = getelementptr inbounds float* %tmp3967, i64 1
+ %tmp3969 = getelementptr inbounds float* %tmp3968, i64 1
+ %tmp3970 = getelementptr inbounds float* %tmp3969, i64 1
+ %tmp3971 = getelementptr inbounds float* %tmp3970, i64 1
+ %tmp3972 = getelementptr inbounds float* %tmp3971, i64 1
+ %tmp3973 = getelementptr inbounds float* %tmp3972, i64 1
+ %tmp3974 = getelementptr inbounds float* %tmp3973, i64 1
+ %tmp3975 = getelementptr inbounds float* %tmp3974, i64 1
+ %tmp3976 = getelementptr inbounds float* %tmp3975, i64 1
+ %tmp3977 = getelementptr inbounds float* %tmp3976, i64 1
+ %tmp3978 = getelementptr inbounds float* %tmp3977, i64 1
+ %tmp3979 = getelementptr inbounds float* %tmp3978, i64 1
+ %tmp3980 = getelementptr inbounds float* %tmp3979, i64 1
+ %tmp3981 = getelementptr inbounds float* %tmp3980, i64 1
+ %tmp3982 = getelementptr inbounds float* %tmp3981, i64 1
+ %tmp3983 = getelementptr inbounds float* %tmp3982, i64 1
+ %tmp3984 = getelementptr inbounds float* %tmp3983, i64 1
+ %tmp3985 = getelementptr inbounds float* %tmp3984, i64 1
+ %tmp3986 = getelementptr inbounds float* %tmp3985, i64 1
+ %tmp3987 = getelementptr inbounds float* %tmp3986, i64 1
+ %tmp3988 = getelementptr inbounds float* %tmp3987, i64 1
+ %tmp3989 = getelementptr inbounds float* %tmp3988, i64 1
+ %tmp3990 = getelementptr inbounds float* %tmp3989, i64 1
+ %tmp3991 = getelementptr inbounds float* %tmp3990, i64 1
+ %tmp3992 = getelementptr inbounds float* %tmp3991, i64 1
+ %tmp3993 = getelementptr inbounds float* %tmp3992, i64 1
+ %tmp3994 = getelementptr inbounds float* %tmp3993, i64 1
+ %tmp3995 = getelementptr inbounds float* %tmp3994, i64 1
+ %tmp3996 = getelementptr inbounds float* %tmp3995, i64 1
+ %tmp3997 = getelementptr inbounds float* %tmp3996, i64 1
+ %tmp3998 = getelementptr inbounds float* %tmp3997, i64 1
+ %tmp3999 = getelementptr inbounds float* %tmp3998, i64 1
+ %tmp4000 = getelementptr inbounds float* %tmp3999, i64 1
+ %tmp4001 = getelementptr inbounds float* %tmp4000, i64 1
+ %tmp4002 = getelementptr inbounds float* %tmp4001, i64 1
+ %tmp4003 = getelementptr inbounds float* %tmp4002, i64 1
+ %tmp4004 = getelementptr inbounds float* %tmp4003, i64 1
+ %tmp4005 = getelementptr inbounds float* %tmp4004, i64 1
+ %tmp4006 = getelementptr inbounds float* %tmp4005, i64 1
+ %tmp4007 = getelementptr inbounds float* %tmp4006, i64 1
+ %tmp4008 = getelementptr inbounds float* %tmp4007, i64 1
+ %tmp4009 = getelementptr inbounds float* %tmp4008, i64 1
+ %tmp4010 = getelementptr inbounds float* %tmp4009, i64 1
+ %tmp4011 = getelementptr inbounds float* %tmp4010, i64 1
+ %tmp4012 = getelementptr inbounds float* %tmp4011, i64 1
+ %tmp4013 = getelementptr inbounds float* %tmp4012, i64 1
+ %tmp4014 = getelementptr inbounds float* %tmp4013, i64 1
+ %tmp4015 = getelementptr inbounds float* %tmp4014, i64 1
+ %tmp4016 = getelementptr inbounds float* %tmp4015, i64 1
+ %tmp4017 = getelementptr inbounds float* %tmp4016, i64 1
+ %tmp4018 = getelementptr inbounds float* %tmp4017, i64 1
+ %tmp4019 = getelementptr inbounds float* %tmp4018, i64 1
+ %tmp4020 = getelementptr inbounds float* %tmp4019, i64 1
+ %tmp4021 = getelementptr inbounds float* %tmp4020, i64 1
+ %tmp4022 = getelementptr inbounds float* %tmp4021, i64 1
+ %tmp4023 = getelementptr inbounds float* %tmp4022, i64 1
+ %tmp4024 = getelementptr inbounds float* %tmp4023, i64 1
+ %tmp4025 = getelementptr inbounds float* %tmp4024, i64 1
+ %tmp4026 = getelementptr inbounds float* %tmp4025, i64 1
+ %tmp4027 = getelementptr inbounds float* %tmp4026, i64 1
+ %tmp4028 = getelementptr inbounds float* %tmp4027, i64 1
+ %tmp4029 = getelementptr inbounds float* %tmp4028, i64 1
+ %tmp4030 = getelementptr inbounds float* %tmp4029, i64 1
+ %tmp4031 = getelementptr inbounds float* %tmp4030, i64 1
+ %tmp4032 = getelementptr inbounds float* %tmp4031, i64 1
+ %tmp4033 = getelementptr inbounds float* %tmp4032, i64 1
+ %tmp4034 = getelementptr inbounds float* %tmp4033, i64 1
+ %tmp4035 = getelementptr inbounds float* %tmp4034, i64 1
+ %tmp4036 = getelementptr inbounds float* %tmp4035, i64 1
+ %tmp4037 = getelementptr inbounds float* %tmp4036, i64 1
+ %tmp4038 = getelementptr inbounds float* %tmp4037, i64 1
+ %tmp4039 = getelementptr inbounds float* %tmp4038, i64 1
+ %tmp4040 = getelementptr inbounds float* %tmp4039, i64 1
+ %tmp4041 = getelementptr inbounds float* %tmp4040, i64 1
+ %tmp4042 = getelementptr inbounds float* %tmp4041, i64 1
+ %tmp4043 = getelementptr inbounds float* %tmp4042, i64 1
+ %tmp4044 = getelementptr inbounds float* %tmp4043, i64 1
+ %tmp4045 = getelementptr inbounds float* %tmp4044, i64 1
+ %tmp4046 = getelementptr inbounds float* %tmp4045, i64 1
+ %tmp4047 = getelementptr inbounds float* %tmp4046, i64 1
+ %tmp4048 = getelementptr inbounds float* %tmp4047, i64 1
+ %tmp4049 = getelementptr inbounds float* %tmp4048, i64 1
+ %tmp4050 = getelementptr inbounds float* %tmp4049, i64 1
+ %tmp4051 = getelementptr inbounds float* %tmp4050, i64 1
+ %tmp4052 = getelementptr inbounds float* %tmp4051, i64 1
+ %tmp4053 = getelementptr inbounds float* %tmp4052, i64 1
+ %tmp4054 = getelementptr inbounds float* %tmp4053, i64 1
+ %tmp4055 = getelementptr inbounds float* %tmp4054, i64 1
+ %tmp4056 = getelementptr inbounds float* %tmp4055, i64 1
+ %tmp4057 = getelementptr inbounds float* %tmp4056, i64 1
+ %tmp4058 = getelementptr inbounds float* %tmp4057, i64 1
+ %tmp4059 = getelementptr inbounds float* %tmp4058, i64 1
+ %tmp4060 = getelementptr inbounds float* %tmp4059, i64 1
+ %tmp4061 = getelementptr inbounds float* %tmp4060, i64 1
+ %tmp4062 = getelementptr inbounds float* %tmp4061, i64 1
+ %tmp4063 = getelementptr inbounds float* %tmp4062, i64 1
+ %tmp4064 = getelementptr inbounds float* %tmp4063, i64 1
+ %tmp4065 = getelementptr inbounds float* %tmp4064, i64 1
+ %tmp4066 = getelementptr inbounds float* %tmp4065, i64 1
+ %tmp4067 = getelementptr inbounds float* %tmp4066, i64 1
+ %tmp4068 = getelementptr inbounds float* %tmp4067, i64 1
+ %tmp4069 = getelementptr inbounds float* %tmp4068, i64 1
+ %tmp4070 = getelementptr inbounds float* %tmp4069, i64 1
+ %tmp4071 = getelementptr inbounds float* %tmp4070, i64 1
+ %tmp4072 = getelementptr inbounds float* %tmp4071, i64 1
+ %tmp4073 = getelementptr inbounds float* %tmp4072, i64 1
+ %tmp4074 = getelementptr inbounds float* %tmp4073, i64 1
+ %tmp4075 = getelementptr inbounds float* %tmp4074, i64 1
+ %tmp4076 = getelementptr inbounds float* %tmp4075, i64 1
+ %tmp4077 = getelementptr inbounds float* %tmp4076, i64 1
+ %tmp4078 = getelementptr inbounds float* %tmp4077, i64 1
+ %tmp4079 = getelementptr inbounds float* %tmp4078, i64 1
+ %tmp4080 = getelementptr inbounds float* %tmp4079, i64 1
+ %tmp4081 = getelementptr inbounds float* %tmp4080, i64 1
+ %tmp4082 = getelementptr inbounds float* %tmp4081, i64 1
+ %tmp4083 = getelementptr inbounds float* %tmp4082, i64 1
+ %tmp4084 = getelementptr inbounds float* %tmp4083, i64 1
+ %tmp4085 = getelementptr inbounds float* %tmp4084, i64 1
+ %tmp4086 = getelementptr inbounds float* %tmp4085, i64 1
+ %tmp4087 = getelementptr inbounds float* %tmp4086, i64 1
+ %tmp4088 = getelementptr inbounds float* %tmp4087, i64 1
+ %tmp4089 = getelementptr inbounds float* %tmp4088, i64 1
+ %tmp4090 = getelementptr inbounds float* %tmp4089, i64 1
+ %tmp4091 = getelementptr inbounds float* %tmp4090, i64 1
+ %tmp4092 = getelementptr inbounds float* %tmp4091, i64 1
+ %tmp4093 = getelementptr inbounds float* %tmp4092, i64 1
+ %tmp4094 = getelementptr inbounds float* %tmp4093, i64 1
+ %tmp4095 = getelementptr inbounds float* %tmp4094, i64 1
+ %tmp4096 = getelementptr inbounds float* %tmp4095, i64 1
+ %tmp4097 = getelementptr inbounds float* %tmp4096, i64 1
+ %tmp4098 = getelementptr inbounds float* %tmp4097, i64 1
+ %tmp4099 = getelementptr inbounds float* %tmp4098, i64 1
+ %tmp4100 = getelementptr inbounds float* %tmp4099, i64 1
+ %tmp4101 = getelementptr inbounds float* %tmp4100, i64 1
+ %tmp4102 = getelementptr inbounds float* %tmp4101, i64 1
+ %tmp4103 = getelementptr inbounds float* %tmp4102, i64 1
+ %tmp4104 = getelementptr inbounds float* %tmp4103, i64 1
+ %tmp4105 = getelementptr inbounds float* %tmp4104, i64 1
+ %tmp4106 = getelementptr inbounds float* %tmp4105, i64 1
+ %tmp4107 = getelementptr inbounds float* %tmp4106, i64 1
+ %tmp4108 = getelementptr inbounds float* %tmp4107, i64 1
+ %tmp4109 = getelementptr inbounds float* %tmp4108, i64 1
+ %tmp4110 = getelementptr inbounds float* %tmp4109, i64 1
+ %tmp4111 = getelementptr inbounds float* %tmp4110, i64 1
+ %tmp4112 = getelementptr inbounds float* %tmp4111, i64 1
+ %tmp4113 = getelementptr inbounds float* %tmp4112, i64 1
+ %tmp4114 = getelementptr inbounds float* %tmp4113, i64 1
+ %tmp4115 = getelementptr inbounds float* %tmp4114, i64 1
+ %tmp4116 = getelementptr inbounds float* %tmp4115, i64 1
+ %tmp4117 = getelementptr inbounds float* %tmp4116, i64 1
+ %tmp4118 = getelementptr inbounds float* %tmp4117, i64 1
+ %tmp4119 = getelementptr inbounds float* %tmp4118, i64 1
+ %tmp4120 = getelementptr inbounds float* %tmp4119, i64 1
+ %tmp4121 = getelementptr inbounds float* %tmp4120, i64 1
+ %tmp4122 = getelementptr inbounds float* %tmp4121, i64 1
+ %tmp4123 = getelementptr inbounds float* %tmp4122, i64 1
+ %tmp4124 = getelementptr inbounds float* %tmp4123, i64 1
+ %tmp4125 = getelementptr inbounds float* %tmp4124, i64 1
+ %tmp4126 = getelementptr inbounds float* %tmp4125, i64 1
+ %tmp4127 = getelementptr inbounds float* %tmp4126, i64 1
+ %tmp4128 = getelementptr inbounds float* %tmp4127, i64 1
+ %tmp4129 = getelementptr inbounds float* %tmp4128, i64 1
+ %tmp4130 = getelementptr inbounds float* %tmp4129, i64 1
+ %tmp4131 = getelementptr inbounds float* %tmp4130, i64 1
+ %tmp4132 = getelementptr inbounds float* %tmp4131, i64 1
+ %tmp4133 = getelementptr inbounds float* %tmp4132, i64 1
+ %tmp4134 = getelementptr inbounds float* %tmp4133, i64 1
+ %tmp4135 = getelementptr inbounds float* %tmp4134, i64 1
+ %tmp4136 = getelementptr inbounds float* %tmp4135, i64 1
+ %tmp4137 = getelementptr inbounds float* %tmp4136, i64 1
+ %tmp4138 = getelementptr inbounds float* %tmp4137, i64 1
+ %tmp4139 = getelementptr inbounds float* %tmp4138, i64 1
+ %tmp4140 = getelementptr inbounds float* %tmp4139, i64 1
+ %tmp4141 = getelementptr inbounds float* %tmp4140, i64 1
+ %tmp4142 = getelementptr inbounds float* %tmp4141, i64 1
+ %tmp4143 = getelementptr inbounds float* %tmp4142, i64 1
+ %tmp4144 = getelementptr inbounds float* %tmp4143, i64 1
+ %tmp4145 = getelementptr inbounds float* %tmp4144, i64 1
+ %tmp4146 = getelementptr inbounds float* %tmp4145, i64 1
+ %tmp4147 = getelementptr inbounds float* %tmp4146, i64 1
+ %tmp4148 = getelementptr inbounds float* %tmp4147, i64 1
+ %tmp4149 = getelementptr inbounds float* %tmp4148, i64 1
+ %tmp4150 = getelementptr inbounds float* %tmp4149, i64 1
+ %tmp4151 = getelementptr inbounds float* %tmp4150, i64 1
+ %tmp4152 = getelementptr inbounds float* %tmp4151, i64 1
+ %tmp4153 = getelementptr inbounds float* %tmp4152, i64 1
+ %tmp4154 = getelementptr inbounds float* %tmp4153, i64 1
+ %tmp4155 = getelementptr inbounds float* %tmp4154, i64 1
+ %tmp4156 = getelementptr inbounds float* %tmp4155, i64 1
+ %tmp4157 = getelementptr inbounds float* %tmp4156, i64 1
+ %tmp4158 = getelementptr inbounds float* %tmp4157, i64 1
+ %tmp4159 = getelementptr inbounds float* %tmp4158, i64 1
+ %tmp4160 = getelementptr inbounds float* %tmp4159, i64 1
+ %tmp4161 = getelementptr inbounds float* %tmp4160, i64 1
+ %tmp4162 = getelementptr inbounds float* %tmp4161, i64 1
+ %tmp4163 = getelementptr inbounds float* %tmp4162, i64 1
+ %tmp4164 = getelementptr inbounds float* %tmp4163, i64 1
+ %tmp4165 = getelementptr inbounds float* %tmp4164, i64 1
+ %tmp4166 = getelementptr inbounds float* %tmp4165, i64 1
+ %tmp4167 = getelementptr inbounds float* %tmp4166, i64 1
+ %tmp4168 = getelementptr inbounds float* %tmp4167, i64 1
+ %tmp4169 = getelementptr inbounds float* %tmp4168, i64 1
+ %tmp4170 = getelementptr inbounds float* %tmp4169, i64 1
+ %tmp4171 = getelementptr inbounds float* %tmp4170, i64 1
+ %tmp4172 = getelementptr inbounds float* %tmp4171, i64 1
+ %tmp4173 = getelementptr inbounds float* %tmp4172, i64 1
+ %tmp4174 = getelementptr inbounds float* %tmp4173, i64 1
+ %tmp4175 = getelementptr inbounds float* %tmp4174, i64 1
+ %tmp4176 = getelementptr inbounds float* %tmp4175, i64 1
+ %tmp4177 = getelementptr inbounds float* %tmp4176, i64 1
+ %tmp4178 = getelementptr inbounds float* %tmp4177, i64 1
+ %tmp4179 = getelementptr inbounds float* %tmp4178, i64 1
+ %tmp4180 = getelementptr inbounds float* %tmp4179, i64 1
+ %tmp4181 = getelementptr inbounds float* %tmp4180, i64 1
+ %tmp4182 = getelementptr inbounds float* %tmp4181, i64 1
+ %tmp4183 = getelementptr inbounds float* %tmp4182, i64 1
+ %tmp4184 = getelementptr inbounds float* %tmp4183, i64 1
+ %tmp4185 = getelementptr inbounds float* %tmp4184, i64 1
+ %tmp4186 = getelementptr inbounds float* %tmp4185, i64 1
+ %tmp4187 = getelementptr inbounds float* %tmp4186, i64 1
+ %tmp4188 = getelementptr inbounds float* %tmp4187, i64 1
+ %tmp4189 = getelementptr inbounds float* %tmp4188, i64 1
+ %tmp4190 = getelementptr inbounds float* %tmp4189, i64 1
+ %tmp4191 = getelementptr inbounds float* %tmp4190, i64 1
+ %tmp4192 = getelementptr inbounds float* %tmp4191, i64 1
+ %tmp4193 = getelementptr inbounds float* %tmp4192, i64 1
+ %tmp4194 = getelementptr inbounds float* %tmp4193, i64 1
+ %tmp4195 = getelementptr inbounds float* %tmp4194, i64 1
+ %tmp4196 = getelementptr inbounds float* %tmp4195, i64 1
+ %tmp4197 = getelementptr inbounds float* %tmp4196, i64 1
+ %tmp4198 = getelementptr inbounds float* %tmp4197, i64 1
+ %tmp4199 = getelementptr inbounds float* %tmp4198, i64 1
+ %tmp4200 = getelementptr inbounds float* %tmp4199, i64 1
+ %tmp4201 = getelementptr inbounds float* %tmp4200, i64 1
+ %tmp4202 = getelementptr inbounds float* %tmp4201, i64 1
+ %tmp4203 = getelementptr inbounds float* %tmp4202, i64 1
+ %tmp4204 = getelementptr inbounds float* %tmp4203, i64 1
+ %tmp4205 = getelementptr inbounds float* %tmp4204, i64 1
+ %tmp4206 = getelementptr inbounds float* %tmp4205, i64 1
+ %tmp4207 = getelementptr inbounds float* %tmp4206, i64 1
+ %tmp4208 = getelementptr inbounds float* %tmp4207, i64 1
+ %tmp4209 = getelementptr inbounds float* %tmp4208, i64 1
+ %tmp4210 = getelementptr inbounds float* %tmp4209, i64 1
+ %tmp4211 = getelementptr inbounds float* %tmp4210, i64 1
+ %tmp4212 = getelementptr inbounds float* %tmp4211, i64 1
+ %tmp4213 = getelementptr inbounds float* %tmp4212, i64 1
+ %tmp4214 = getelementptr inbounds float* %tmp4213, i64 1
+ %tmp4215 = getelementptr inbounds float* %tmp4214, i64 1
+ %tmp4216 = getelementptr inbounds float* %tmp4215, i64 1
+ %tmp4217 = getelementptr inbounds float* %tmp4216, i64 1
+ %tmp4218 = getelementptr inbounds float* %tmp4217, i64 1
+ %tmp4219 = getelementptr inbounds float* %tmp4218, i64 1
+ %tmp4220 = getelementptr inbounds float* %tmp4219, i64 1
+ %tmp4221 = getelementptr inbounds float* %tmp4220, i64 1
+ %tmp4222 = getelementptr inbounds float* %tmp4221, i64 1
+ %tmp4223 = getelementptr inbounds float* %tmp4222, i64 1
+ %tmp4224 = getelementptr inbounds float* %tmp4223, i64 1
+ %tmp4225 = getelementptr inbounds float* %tmp4224, i64 1
+ %tmp4226 = getelementptr inbounds float* %tmp4225, i64 1
+ %tmp4227 = getelementptr inbounds float* %tmp4226, i64 1
+ %tmp4228 = getelementptr inbounds float* %tmp4227, i64 1
+ %tmp4229 = getelementptr inbounds float* %tmp4228, i64 1
+ %tmp4230 = getelementptr inbounds float* %tmp4229, i64 1
+ %tmp4231 = getelementptr inbounds float* %tmp4230, i64 1
+ %tmp4232 = getelementptr inbounds float* %tmp4231, i64 1
+ %tmp4233 = getelementptr inbounds float* %tmp4232, i64 1
+ %tmp4234 = getelementptr inbounds float* %tmp4233, i64 1
+ %tmp4235 = getelementptr inbounds float* %tmp4234, i64 1
+ %tmp4236 = getelementptr inbounds float* %tmp4235, i64 1
+ %tmp4237 = getelementptr inbounds float* %tmp4236, i64 1
+ %tmp4238 = getelementptr inbounds float* %tmp4237, i64 1
+ %tmp4239 = getelementptr inbounds float* %tmp4238, i64 1
+ %tmp4240 = getelementptr inbounds float* %tmp4239, i64 1
+ %tmp4241 = getelementptr inbounds float* %tmp4240, i64 1
+ %tmp4242 = getelementptr inbounds float* %tmp4241, i64 1
+ %tmp4243 = getelementptr inbounds float* %tmp4242, i64 1
+ %tmp4244 = getelementptr inbounds float* %tmp4243, i64 1
+ %tmp4245 = getelementptr inbounds float* %tmp4244, i64 1
+ %tmp4246 = getelementptr inbounds float* %tmp4245, i64 1
+ %tmp4247 = getelementptr inbounds float* %tmp4246, i64 1
+ %tmp4248 = getelementptr inbounds float* %tmp4247, i64 1
+ %tmp4249 = getelementptr inbounds float* %tmp4248, i64 1
+ %tmp4250 = getelementptr inbounds float* %tmp4249, i64 1
+ %tmp4251 = getelementptr inbounds float* %tmp4250, i64 1
+ %tmp4252 = getelementptr inbounds float* %tmp4251, i64 1
+ %tmp4253 = getelementptr inbounds float* %tmp4252, i64 1
+ %tmp4254 = getelementptr inbounds float* %tmp4253, i64 1
+ %tmp4255 = getelementptr inbounds float* %tmp4254, i64 1
+ %tmp4256 = getelementptr inbounds float* %tmp4255, i64 1
+ %tmp4257 = getelementptr inbounds float* %tmp4256, i64 1
+ %tmp4258 = getelementptr inbounds float* %tmp4257, i64 1
+ %tmp4259 = getelementptr inbounds float* %tmp4258, i64 1
+ %tmp4260 = getelementptr inbounds float* %tmp4259, i64 1
+ %tmp4261 = getelementptr inbounds float* %tmp4260, i64 1
+ %tmp4262 = getelementptr inbounds float* %tmp4261, i64 1
+ %tmp4263 = getelementptr inbounds float* %tmp4262, i64 1
+ %tmp4264 = getelementptr inbounds float* %tmp4263, i64 1
+ %tmp4265 = getelementptr inbounds float* %tmp4264, i64 1
+ %tmp4266 = getelementptr inbounds float* %tmp4265, i64 1
+ %tmp4267 = getelementptr inbounds float* %tmp4266, i64 1
+ %tmp4268 = getelementptr inbounds float* %tmp4267, i64 1
+ %tmp4269 = getelementptr inbounds float* %tmp4268, i64 1
+ %tmp4270 = getelementptr inbounds float* %tmp4269, i64 1
+ %tmp4271 = getelementptr inbounds float* %tmp4270, i64 1
+ %tmp4272 = getelementptr inbounds float* %tmp4271, i64 1
+ %tmp4273 = getelementptr inbounds float* %tmp4272, i64 1
+ %tmp4274 = getelementptr inbounds float* %tmp4273, i64 1
+ %tmp4275 = getelementptr inbounds float* %tmp4274, i64 1
+ %tmp4276 = getelementptr inbounds float* %tmp4275, i64 1
+ %tmp4277 = getelementptr inbounds float* %tmp4276, i64 1
+ %tmp4278 = getelementptr inbounds float* %tmp4277, i64 1
+ %tmp4279 = getelementptr inbounds float* %tmp4278, i64 1
+ %tmp4280 = getelementptr inbounds float* %tmp4279, i64 1
+ %tmp4281 = getelementptr inbounds float* %tmp4280, i64 1
+ %tmp4282 = getelementptr inbounds float* %tmp4281, i64 1
+ %tmp4283 = getelementptr inbounds float* %tmp4282, i64 1
+ %tmp4284 = getelementptr inbounds float* %tmp4283, i64 1
+ %tmp4285 = getelementptr inbounds float* %tmp4284, i64 1
+ %tmp4286 = getelementptr inbounds float* %tmp4285, i64 1
+ %tmp4287 = getelementptr inbounds float* %tmp4286, i64 1
+ %tmp4288 = getelementptr inbounds float* %tmp4287, i64 1
+ %tmp4289 = getelementptr inbounds float* %tmp4288, i64 1
+ %tmp4290 = getelementptr inbounds float* %tmp4289, i64 1
+ %tmp4291 = getelementptr inbounds float* %tmp4290, i64 1
+ %tmp4292 = getelementptr inbounds float* %tmp4291, i64 1
+ %tmp4293 = getelementptr inbounds float* %tmp4292, i64 1
+ %tmp4294 = getelementptr inbounds float* %tmp4293, i64 1
+ %tmp4295 = getelementptr inbounds float* %tmp4294, i64 1
+ %tmp4296 = getelementptr inbounds float* %tmp4295, i64 1
+ %tmp4297 = getelementptr inbounds float* %tmp4296, i64 1
+ %tmp4298 = getelementptr inbounds float* %tmp4297, i64 1
+ %tmp4299 = getelementptr inbounds float* %tmp4298, i64 1
+ %tmp4300 = getelementptr inbounds float* %tmp4299, i64 1
+ %tmp4301 = getelementptr inbounds float* %tmp4300, i64 1
+ %tmp4302 = getelementptr inbounds float* %tmp4301, i64 1
+ %tmp4303 = getelementptr inbounds float* %tmp4302, i64 1
+ %tmp4304 = getelementptr inbounds float* %tmp4303, i64 1
+ %tmp4305 = getelementptr inbounds float* %tmp4304, i64 1
+ %tmp4306 = getelementptr inbounds float* %tmp4305, i64 1
+ %tmp4307 = getelementptr inbounds float* %tmp4306, i64 1
+ %tmp4308 = getelementptr inbounds float* %tmp4307, i64 1
+ %tmp4309 = getelementptr inbounds float* %tmp4308, i64 1
+ %tmp4310 = getelementptr inbounds float* %tmp4309, i64 1
+ %tmp4311 = getelementptr inbounds float* %tmp4310, i64 1
+ %tmp4312 = getelementptr inbounds float* %tmp4311, i64 1
+ %tmp4313 = getelementptr inbounds float* %tmp4312, i64 1
+ %tmp4314 = getelementptr inbounds float* %tmp4313, i64 1
+ %tmp4315 = getelementptr inbounds float* %tmp4314, i64 1
+ %tmp4316 = getelementptr inbounds float* %tmp4315, i64 1
+ %tmp4317 = getelementptr inbounds float* %tmp4316, i64 1
+ %tmp4318 = getelementptr inbounds float* %tmp4317, i64 1
+ %tmp4319 = getelementptr inbounds float* %tmp4318, i64 1
+ %tmp4320 = getelementptr inbounds float* %tmp4319, i64 1
+ %tmp4321 = getelementptr inbounds float* %tmp4320, i64 1
+ %tmp4322 = getelementptr inbounds float* %tmp4321, i64 1
+ %tmp4323 = getelementptr inbounds float* %tmp4322, i64 1
+ %tmp4324 = getelementptr inbounds float* %tmp4323, i64 1
+ %tmp4325 = getelementptr inbounds float* %tmp4324, i64 1
+ %tmp4326 = getelementptr inbounds float* %tmp4325, i64 1
+ %tmp4327 = getelementptr inbounds float* %tmp4326, i64 1
+ %tmp4328 = getelementptr inbounds float* %tmp4327, i64 1
+ %tmp4329 = getelementptr inbounds float* %tmp4328, i64 1
+ %tmp4330 = getelementptr inbounds float* %tmp4329, i64 1
+ %tmp4331 = getelementptr inbounds float* %tmp4330, i64 1
+ %tmp4332 = getelementptr inbounds float* %tmp4331, i64 1
+ %tmp4333 = getelementptr inbounds float* %tmp4332, i64 1
+ %tmp4334 = getelementptr inbounds float* %tmp4333, i64 1
+ %tmp4335 = getelementptr inbounds float* %tmp4334, i64 1
+ %tmp4336 = getelementptr inbounds float* %tmp4335, i64 1
+ %tmp4337 = getelementptr inbounds float* %tmp4336, i64 1
+ %tmp4338 = getelementptr inbounds float* %tmp4337, i64 1
+ %tmp4339 = getelementptr inbounds float* %tmp4338, i64 1
+ %tmp4340 = getelementptr inbounds float* %tmp4339, i64 1
+ %tmp4341 = getelementptr inbounds float* %tmp4340, i64 1
+ %tmp4342 = getelementptr inbounds float* %tmp4341, i64 1
+ %tmp4343 = getelementptr inbounds float* %tmp4342, i64 1
+ %tmp4344 = getelementptr inbounds float* %tmp4343, i64 1
+ %tmp4345 = getelementptr inbounds float* %tmp4344, i64 1
+ %tmp4346 = getelementptr inbounds float* %tmp4345, i64 1
+ %tmp4347 = getelementptr inbounds float* %tmp4346, i64 1
+ %tmp4348 = getelementptr inbounds float* %tmp4347, i64 1
+ %tmp4349 = getelementptr inbounds float* %tmp4348, i64 1
+ %tmp4350 = getelementptr inbounds float* %tmp4349, i64 1
+ %tmp4351 = getelementptr inbounds float* %tmp4350, i64 1
+ %tmp4352 = getelementptr inbounds float* %tmp4351, i64 1
+ %tmp4353 = getelementptr inbounds float* %tmp4352, i64 1
+ %tmp4354 = getelementptr inbounds float* %tmp4353, i64 1
+ %tmp4355 = getelementptr inbounds float* %tmp4354, i64 1
+ %tmp4356 = getelementptr inbounds float* %tmp4355, i64 1
+ %tmp4357 = getelementptr inbounds float* %tmp4356, i64 1
+ %tmp4358 = getelementptr inbounds float* %tmp4357, i64 1
+ %tmp4359 = getelementptr inbounds float* %tmp4358, i64 1
+ %tmp4360 = getelementptr inbounds float* %tmp4359, i64 1
+ %tmp4361 = getelementptr inbounds float* %tmp4360, i64 1
+ %tmp4362 = getelementptr inbounds float* %tmp4361, i64 1
+ %tmp4363 = getelementptr inbounds float* %tmp4362, i64 1
+ %tmp4364 = getelementptr inbounds float* %tmp4363, i64 1
+ %tmp4365 = getelementptr inbounds float* %tmp4364, i64 1
+ %tmp4366 = getelementptr inbounds float* %tmp4365, i64 1
+ %tmp4367 = getelementptr inbounds float* %tmp4366, i64 1
+ %tmp4368 = getelementptr inbounds float* %tmp4367, i64 1
+ %tmp4369 = getelementptr inbounds float* %tmp4368, i64 1
+ %tmp4370 = getelementptr inbounds float* %tmp4369, i64 1
+ %tmp4371 = getelementptr inbounds float* %tmp4370, i64 1
+ %tmp4372 = getelementptr inbounds float* %tmp4371, i64 1
+ %tmp4373 = getelementptr inbounds float* %tmp4372, i64 1
+ %tmp4374 = getelementptr inbounds float* %tmp4373, i64 1
+ %tmp4375 = getelementptr inbounds float* %tmp4374, i64 1
+ %tmp4376 = getelementptr inbounds float* %tmp4375, i64 1
+ %tmp4377 = getelementptr inbounds float* %tmp4376, i64 1
+ %tmp4378 = getelementptr inbounds float* %tmp4377, i64 1
+ %tmp4379 = getelementptr inbounds float* %tmp4378, i64 1
+ %tmp4380 = getelementptr inbounds float* %tmp4379, i64 1
+ %tmp4381 = getelementptr inbounds float* %tmp4380, i64 1
+ %tmp4382 = getelementptr inbounds float* %tmp4381, i64 1
+ %tmp4383 = getelementptr inbounds float* %tmp4382, i64 1
+ %tmp4384 = getelementptr inbounds float* %tmp4383, i64 1
+ %tmp4385 = getelementptr inbounds float* %tmp4384, i64 1
+ %tmp4386 = getelementptr inbounds float* %tmp4385, i64 1
+ %tmp4387 = getelementptr inbounds float* %tmp4386, i64 1
+ %tmp4388 = getelementptr inbounds float* %tmp4387, i64 1
+ %tmp4389 = getelementptr inbounds float* %tmp4388, i64 1
+ %tmp4390 = getelementptr inbounds float* %tmp4389, i64 1
+ %tmp4391 = getelementptr inbounds float* %tmp4390, i64 1
+ %tmp4392 = getelementptr inbounds float* %tmp4391, i64 1
+ %tmp4393 = getelementptr inbounds float* %tmp4392, i64 1
+ %tmp4394 = getelementptr inbounds float* %tmp4393, i64 1
+ %tmp4395 = getelementptr inbounds float* %tmp4394, i64 1
+ %tmp4396 = getelementptr inbounds float* %tmp4395, i64 1
+ %tmp4397 = getelementptr inbounds float* %tmp4396, i64 1
+ %tmp4398 = getelementptr inbounds float* %tmp4397, i64 1
+ %tmp4399 = getelementptr inbounds float* %tmp4398, i64 1
+ %tmp4400 = getelementptr inbounds float* %tmp4399, i64 1
+ %tmp4401 = getelementptr inbounds float* %tmp4400, i64 1
+ %tmp4402 = getelementptr inbounds float* %tmp4401, i64 1
+ %tmp4403 = getelementptr inbounds float* %tmp4402, i64 1
+ %tmp4404 = getelementptr inbounds float* %tmp4403, i64 1
+ %tmp4405 = getelementptr inbounds float* %tmp4404, i64 1
+ %tmp4406 = getelementptr inbounds float* %tmp4405, i64 1
+ %tmp4407 = getelementptr inbounds float* %tmp4406, i64 1
+ %tmp4408 = getelementptr inbounds float* %tmp4407, i64 1
+ %tmp4409 = getelementptr inbounds float* %tmp4408, i64 1
+ %tmp4410 = getelementptr inbounds float* %tmp4409, i64 1
+ %tmp4411 = getelementptr inbounds float* %tmp4410, i64 1
+ %tmp4412 = getelementptr inbounds float* %tmp4411, i64 1
+ %tmp4413 = getelementptr inbounds float* %tmp4412, i64 1
+ %tmp4414 = getelementptr inbounds float* %tmp4413, i64 1
+ %tmp4415 = getelementptr inbounds float* %tmp4414, i64 1
+ %tmp4416 = getelementptr inbounds float* %tmp4415, i64 1
+ %tmp4417 = getelementptr inbounds float* %tmp4416, i64 1
+ %tmp4418 = getelementptr inbounds float* %tmp4417, i64 1
+ %tmp4419 = getelementptr inbounds float* %tmp4418, i64 1
+ %tmp4420 = getelementptr inbounds float* %tmp4419, i64 1
+ %tmp4421 = getelementptr inbounds float* %tmp4420, i64 1
+ %tmp4422 = getelementptr inbounds float* %tmp4421, i64 1
+ %tmp4423 = getelementptr inbounds float* %tmp4422, i64 1
+ %tmp4424 = getelementptr inbounds float* %tmp4423, i64 1
+ %tmp4425 = getelementptr inbounds float* %tmp4424, i64 1
+ %tmp4426 = getelementptr inbounds float* %tmp4425, i64 1
+ %tmp4427 = getelementptr inbounds float* %tmp4426, i64 1
+ %tmp4428 = getelementptr inbounds float* %tmp4427, i64 1
+ %tmp4429 = getelementptr inbounds float* %tmp4428, i64 1
+ %tmp4430 = getelementptr inbounds float* %tmp4429, i64 1
+ %tmp4431 = getelementptr inbounds float* %tmp4430, i64 1
+ %tmp4432 = getelementptr inbounds float* %tmp4431, i64 1
+ %tmp4433 = getelementptr inbounds float* %tmp4432, i64 1
+ %tmp4434 = getelementptr inbounds float* %tmp4433, i64 1
+ %tmp4435 = getelementptr inbounds float* %tmp4434, i64 1
+ %tmp4436 = getelementptr inbounds float* %tmp4435, i64 1
+ %tmp4437 = getelementptr inbounds float* %tmp4436, i64 1
+ %tmp4438 = getelementptr inbounds float* %tmp4437, i64 1
+ %tmp4439 = getelementptr inbounds float* %tmp4438, i64 1
+ %tmp4440 = getelementptr inbounds float* %tmp4439, i64 1
+ %tmp4441 = getelementptr inbounds float* %tmp4440, i64 1
+ %tmp4442 = getelementptr inbounds float* %tmp4441, i64 1
+ %tmp4443 = getelementptr inbounds float* %tmp4442, i64 1
+ %tmp4444 = getelementptr inbounds float* %tmp4443, i64 1
+ %tmp4445 = getelementptr inbounds float* %tmp4444, i64 1
+ %tmp4446 = getelementptr inbounds float* %tmp4445, i64 1
+ %tmp4447 = getelementptr inbounds float* %tmp4446, i64 1
+ %tmp4448 = getelementptr inbounds float* %tmp4447, i64 1
+ %tmp4449 = getelementptr inbounds float* %tmp4448, i64 1
+ %tmp4450 = getelementptr inbounds float* %tmp4449, i64 1
+ %tmp4451 = getelementptr inbounds float* %tmp4450, i64 1
+ %tmp4452 = getelementptr inbounds float* %tmp4451, i64 1
+ %tmp4453 = getelementptr inbounds float* %tmp4452, i64 1
+ %tmp4454 = getelementptr inbounds float* %tmp4453, i64 1
+ %tmp4455 = getelementptr inbounds float* %tmp4454, i64 1
+ %tmp4456 = getelementptr inbounds float* %tmp4455, i64 1
+ %tmp4457 = getelementptr inbounds float* %tmp4456, i64 1
+ %tmp4458 = getelementptr inbounds float* %tmp4457, i64 1
+ %tmp4459 = getelementptr inbounds float* %tmp4458, i64 1
+ %tmp4460 = getelementptr inbounds float* %tmp4459, i64 1
+ %tmp4461 = getelementptr inbounds float* %tmp4460, i64 1
+ %tmp4462 = getelementptr inbounds float* %tmp4461, i64 1
+ %tmp4463 = getelementptr inbounds float* %tmp4462, i64 1
+ %tmp4464 = getelementptr inbounds float* %tmp4463, i64 1
+ %tmp4465 = getelementptr inbounds float* %tmp4464, i64 1
+ %tmp4466 = getelementptr inbounds float* %tmp4465, i64 1
+ %tmp4467 = getelementptr inbounds float* %tmp4466, i64 1
+ %tmp4468 = getelementptr inbounds float* %tmp4467, i64 1
+ %tmp4469 = getelementptr inbounds float* %tmp4468, i64 1
+ %tmp4470 = getelementptr inbounds float* %tmp4469, i64 1
+ %tmp4471 = getelementptr inbounds float* %tmp4470, i64 1
+ %tmp4472 = getelementptr inbounds float* %tmp4471, i64 1
+ %tmp4473 = getelementptr inbounds float* %tmp4472, i64 1
+ %tmp4474 = getelementptr inbounds float* %tmp4473, i64 1
+ %tmp4475 = getelementptr inbounds float* %tmp4474, i64 1
+ %tmp4476 = getelementptr inbounds float* %tmp4475, i64 1
+ %tmp4477 = getelementptr inbounds float* %tmp4476, i64 1
+ %tmp4478 = getelementptr inbounds float* %tmp4477, i64 1
+ %tmp4479 = getelementptr inbounds float* %tmp4478, i64 1
+ %tmp4480 = getelementptr inbounds float* %tmp4479, i64 1
+ %tmp4481 = getelementptr inbounds float* %tmp4480, i64 1
+ %tmp4482 = getelementptr inbounds float* %tmp4481, i64 1
+ %tmp4483 = getelementptr inbounds float* %tmp4482, i64 1
+ %tmp4484 = getelementptr inbounds float* %tmp4483, i64 1
+ %tmp4485 = getelementptr inbounds float* %tmp4484, i64 1
+ %tmp4486 = getelementptr inbounds float* %tmp4485, i64 1
+ %tmp4487 = getelementptr inbounds float* %tmp4486, i64 1
+ %tmp4488 = getelementptr inbounds float* %tmp4487, i64 1
+ %tmp4489 = getelementptr inbounds float* %tmp4488, i64 1
+ %tmp4490 = getelementptr inbounds float* %tmp4489, i64 1
+ %tmp4491 = getelementptr inbounds float* %tmp4490, i64 1
+ %tmp4492 = getelementptr inbounds float* %tmp4491, i64 1
+ %tmp4493 = getelementptr inbounds float* %tmp4492, i64 1
+ %tmp4494 = getelementptr inbounds float* %tmp4493, i64 1
+ %tmp4495 = getelementptr inbounds float* %tmp4494, i64 1
+ %tmp4496 = getelementptr inbounds float* %tmp4495, i64 1
+ %tmp4497 = getelementptr inbounds float* %tmp4496, i64 1
+ %tmp4498 = getelementptr inbounds float* %tmp4497, i64 1
+ %tmp4499 = getelementptr inbounds float* %tmp4498, i64 1
+ %tmp4500 = getelementptr inbounds float* %tmp4499, i64 1
+ %tmp4501 = getelementptr inbounds float* %tmp4500, i64 1
+ %tmp4502 = getelementptr inbounds float* %tmp4501, i64 1
+ %tmp4503 = getelementptr inbounds float* %tmp4502, i64 1
+ %tmp4504 = getelementptr inbounds float* %tmp4503, i64 1
+ %tmp4505 = getelementptr inbounds float* %tmp4504, i64 1
+ %tmp4506 = getelementptr inbounds float* %tmp4505, i64 1
+ %tmp4507 = getelementptr inbounds float* %tmp4506, i64 1
+ %tmp4508 = getelementptr inbounds float* %tmp4507, i64 1
+ %tmp4509 = getelementptr inbounds float* %tmp4508, i64 1
+ %tmp4510 = getelementptr inbounds float* %tmp4509, i64 1
+ %tmp4511 = getelementptr inbounds float* %tmp4510, i64 1
+ %tmp4512 = getelementptr inbounds float* %tmp4511, i64 1
+ %tmp4513 = getelementptr inbounds float* %tmp4512, i64 1
+ %tmp4514 = getelementptr inbounds float* %tmp4513, i64 1
+ %tmp4515 = getelementptr inbounds float* %tmp4514, i64 1
+ %tmp4516 = getelementptr inbounds float* %tmp4515, i64 1
+ %tmp4517 = getelementptr inbounds float* %tmp4516, i64 1
+ %tmp4518 = getelementptr inbounds float* %tmp4517, i64 1
+ %tmp4519 = getelementptr inbounds float* %tmp4518, i64 1
+ %tmp4520 = getelementptr inbounds float* %tmp4519, i64 1
+ %tmp4521 = getelementptr inbounds float* %tmp4520, i64 1
+ %tmp4522 = getelementptr inbounds float* %tmp4521, i64 1
+ %tmp4523 = getelementptr inbounds float* %tmp4522, i64 1
+ %tmp4524 = getelementptr inbounds float* %tmp4523, i64 1
+ %tmp4525 = getelementptr inbounds float* %tmp4524, i64 1
+ %tmp4526 = getelementptr inbounds float* %tmp4525, i64 1
+ %tmp4527 = getelementptr inbounds float* %tmp4526, i64 1
+ %tmp4528 = getelementptr inbounds float* %tmp4527, i64 1
+ %tmp4529 = getelementptr inbounds float* %tmp4528, i64 1
+ %tmp4530 = getelementptr inbounds float* %tmp4529, i64 1
+ %tmp4531 = getelementptr inbounds float* %tmp4530, i64 1
+ %tmp4532 = getelementptr inbounds float* %tmp4531, i64 1
+ %tmp4533 = getelementptr inbounds float* %tmp4532, i64 1
+ %tmp4534 = getelementptr inbounds float* %tmp4533, i64 1
+ %tmp4535 = getelementptr inbounds float* %tmp4534, i64 1
+ %tmp4536 = getelementptr inbounds float* %tmp4535, i64 1
+ %tmp4537 = getelementptr inbounds float* %tmp4536, i64 1
+ %tmp4538 = getelementptr inbounds float* %tmp4537, i64 1
+ %tmp4539 = getelementptr inbounds float* %tmp4538, i64 1
+ %tmp4540 = getelementptr inbounds float* %tmp4539, i64 1
+ %tmp4541 = getelementptr inbounds float* %tmp4540, i64 1
+ %tmp4542 = getelementptr inbounds float* %tmp4541, i64 1
+ %tmp4543 = getelementptr inbounds float* %tmp4542, i64 1
+ %tmp4544 = getelementptr inbounds float* %tmp4543, i64 1
+ %tmp4545 = getelementptr inbounds float* %tmp4544, i64 1
+ %tmp4546 = getelementptr inbounds float* %tmp4545, i64 1
+ %tmp4547 = getelementptr inbounds float* %tmp4546, i64 1
+ %tmp4548 = getelementptr inbounds float* %tmp4547, i64 1
+ %tmp4549 = getelementptr inbounds float* %tmp4548, i64 1
+ %tmp4550 = getelementptr inbounds float* %tmp4549, i64 1
+ %tmp4551 = getelementptr inbounds float* %tmp4550, i64 1
+ %tmp4552 = getelementptr inbounds float* %tmp4551, i64 1
+ %tmp4553 = getelementptr inbounds float* %tmp4552, i64 1
+ %tmp4554 = getelementptr inbounds float* %tmp4553, i64 1
+ %tmp4555 = getelementptr inbounds float* %tmp4554, i64 1
+ %tmp4556 = getelementptr inbounds float* %tmp4555, i64 1
+ %tmp4557 = getelementptr inbounds float* %tmp4556, i64 1
+ %tmp4558 = getelementptr inbounds float* %tmp4557, i64 1
+ %tmp4559 = getelementptr inbounds float* %tmp4558, i64 1
+ %tmp4560 = getelementptr inbounds float* %tmp4559, i64 1
+ %tmp4561 = getelementptr inbounds float* %tmp4560, i64 1
+ %tmp4562 = getelementptr inbounds float* %tmp4561, i64 1
+ %tmp4563 = getelementptr inbounds float* %tmp4562, i64 1
+ %tmp4564 = getelementptr inbounds float* %tmp4563, i64 1
+ %tmp4565 = getelementptr inbounds float* %tmp4564, i64 1
+ %tmp4566 = getelementptr inbounds float* %tmp4565, i64 1
+ %tmp4567 = getelementptr inbounds float* %tmp4566, i64 1
+ %tmp4568 = getelementptr inbounds float* %tmp4567, i64 1
+ %tmp4569 = getelementptr inbounds float* %tmp4568, i64 1
+ %tmp4570 = getelementptr inbounds float* %tmp4569, i64 1
+ %tmp4571 = getelementptr inbounds float* %tmp4570, i64 1
+ %tmp4572 = getelementptr inbounds float* %tmp4571, i64 1
+ %tmp4573 = getelementptr inbounds float* %tmp4572, i64 1
+ %tmp4574 = getelementptr inbounds float* %tmp4573, i64 1
+ %tmp4575 = getelementptr inbounds float* %tmp4574, i64 1
+ %tmp4576 = getelementptr inbounds float* %tmp4575, i64 1
+ %tmp4577 = getelementptr inbounds float* %tmp4576, i64 1
+ %tmp4578 = getelementptr inbounds float* %tmp4577, i64 1
+ %tmp4579 = getelementptr inbounds float* %tmp4578, i64 1
+ %tmp4580 = getelementptr inbounds float* %tmp4579, i64 1
+ %tmp4581 = getelementptr inbounds float* %tmp4580, i64 1
+ %tmp4582 = getelementptr inbounds float* %tmp4581, i64 1
+ %tmp4583 = getelementptr inbounds float* %tmp4582, i64 1
+ %tmp4584 = getelementptr inbounds float* %tmp4583, i64 1
+ %tmp4585 = getelementptr inbounds float* %tmp4584, i64 1
+ %tmp4586 = getelementptr inbounds float* %tmp4585, i64 1
+ %tmp4587 = getelementptr inbounds float* %tmp4586, i64 1
+ %tmp4588 = getelementptr inbounds float* %tmp4587, i64 1
+ %tmp4589 = getelementptr inbounds float* %tmp4588, i64 1
+ %tmp4590 = getelementptr inbounds float* %tmp4589, i64 1
+ %tmp4591 = getelementptr inbounds float* %tmp4590, i64 1
+ %tmp4592 = getelementptr inbounds float* %tmp4591, i64 1
+ %tmp4593 = getelementptr inbounds float* %tmp4592, i64 1
+ %tmp4594 = getelementptr inbounds float* %tmp4593, i64 1
+ %tmp4595 = getelementptr inbounds float* %tmp4594, i64 1
+ %tmp4596 = getelementptr inbounds float* %tmp4595, i64 1
+ %tmp4597 = getelementptr inbounds float* %tmp4596, i64 1
+ %tmp4598 = getelementptr inbounds float* %tmp4597, i64 1
+ %tmp4599 = getelementptr inbounds float* %tmp4598, i64 1
+ %tmp4600 = getelementptr inbounds float* %tmp4599, i64 1
+ %tmp4601 = getelementptr inbounds float* %tmp4600, i64 1
+ %tmp4602 = getelementptr inbounds float* %tmp4601, i64 1
+ %tmp4603 = getelementptr inbounds float* %tmp4602, i64 1
+ %tmp4604 = getelementptr inbounds float* %tmp4603, i64 1
+ %tmp4605 = getelementptr inbounds float* %tmp4604, i64 1
+ %tmp4606 = getelementptr inbounds float* %tmp4605, i64 1
+ %tmp4607 = getelementptr inbounds float* %tmp4606, i64 1
+ %tmp4608 = getelementptr inbounds float* %tmp4607, i64 1
+ %tmp4609 = getelementptr inbounds float* %tmp4608, i64 1
+ %tmp4610 = getelementptr inbounds float* %tmp4609, i64 1
+ %tmp4611 = getelementptr inbounds float* %tmp4610, i64 1
+ %tmp4612 = getelementptr inbounds float* %tmp4611, i64 1
+ %tmp4613 = getelementptr inbounds float* %tmp4612, i64 1
+ %tmp4614 = getelementptr inbounds float* %tmp4613, i64 1
+ %tmp4615 = getelementptr inbounds float* %tmp4614, i64 1
+ %tmp4616 = getelementptr inbounds float* %tmp4615, i64 1
+ %tmp4617 = getelementptr inbounds float* %tmp4616, i64 1
+ %tmp4618 = getelementptr inbounds float* %tmp4617, i64 1
+ %tmp4619 = getelementptr inbounds float* %tmp4618, i64 1
+ %tmp4620 = getelementptr inbounds float* %tmp4619, i64 1
+ %tmp4621 = getelementptr inbounds float* %tmp4620, i64 1
+ %tmp4622 = getelementptr inbounds float* %tmp4621, i64 1
+ %tmp4623 = getelementptr inbounds float* %tmp4622, i64 1
+ %tmp4624 = getelementptr inbounds float* %tmp4623, i64 1
+ %tmp4625 = getelementptr inbounds float* %tmp4624, i64 1
+ %tmp4626 = getelementptr inbounds float* %tmp4625, i64 1
+ %tmp4627 = getelementptr inbounds float* %tmp4626, i64 1
+ %tmp4628 = getelementptr inbounds float* %tmp4627, i64 1
+ %tmp4629 = getelementptr inbounds float* %tmp4628, i64 1
+ %tmp4630 = getelementptr inbounds float* %tmp4629, i64 1
+ %tmp4631 = getelementptr inbounds float* %tmp4630, i64 1
+ %tmp4632 = getelementptr inbounds float* %tmp4631, i64 1
+ %tmp4633 = getelementptr inbounds float* %tmp4632, i64 1
+ %tmp4634 = getelementptr inbounds float* %tmp4633, i64 1
+ %tmp4635 = getelementptr inbounds float* %tmp4634, i64 1
+ %tmp4636 = getelementptr inbounds float* %tmp4635, i64 1
+ %tmp4637 = getelementptr inbounds float* %tmp4636, i64 1
+ %tmp4638 = getelementptr inbounds float* %tmp4637, i64 1
+ %tmp4639 = getelementptr inbounds float* %tmp4638, i64 1
+ %tmp4640 = getelementptr inbounds float* %tmp4639, i64 1
+ %tmp4641 = getelementptr inbounds float* %tmp4640, i64 1
+ %tmp4642 = getelementptr inbounds float* %tmp4641, i64 1
+ %tmp4643 = getelementptr inbounds float* %tmp4642, i64 1
+ %tmp4644 = getelementptr inbounds float* %tmp4643, i64 1
+ %tmp4645 = getelementptr inbounds float* %tmp4644, i64 1
+ %tmp4646 = getelementptr inbounds float* %tmp4645, i64 1
+ %tmp4647 = getelementptr inbounds float* %tmp4646, i64 1
+ %tmp4648 = getelementptr inbounds float* %tmp4647, i64 1
+ %tmp4649 = getelementptr inbounds float* %tmp4648, i64 1
+ %tmp4650 = getelementptr inbounds float* %tmp4649, i64 1
+ %tmp4651 = getelementptr inbounds float* %tmp4650, i64 1
+ %tmp4652 = getelementptr inbounds float* %tmp4651, i64 1
+ %tmp4653 = getelementptr inbounds float* %tmp4652, i64 1
+ %tmp4654 = getelementptr inbounds float* %tmp4653, i64 1
+ %tmp4655 = getelementptr inbounds float* %tmp4654, i64 1
+ %tmp4656 = getelementptr inbounds float* %tmp4655, i64 1
+ %tmp4657 = getelementptr inbounds float* %tmp4656, i64 1
+ %tmp4658 = getelementptr inbounds float* %tmp4657, i64 1
+ %tmp4659 = getelementptr inbounds float* %tmp4658, i64 1
+ %tmp4660 = getelementptr inbounds float* %tmp4659, i64 1
+ %tmp4661 = getelementptr inbounds float* %tmp4660, i64 1
+ %tmp4662 = getelementptr inbounds float* %tmp4661, i64 1
+ %tmp4663 = getelementptr inbounds float* %tmp4662, i64 1
+ %tmp4664 = getelementptr inbounds float* %tmp4663, i64 1
+ %tmp4665 = getelementptr inbounds float* %tmp4664, i64 1
+ %tmp4666 = getelementptr inbounds float* %tmp4665, i64 1
+ %tmp4667 = getelementptr inbounds float* %tmp4666, i64 1
+ %tmp4668 = getelementptr inbounds float* %tmp4667, i64 1
+ %tmp4669 = getelementptr inbounds float* %tmp4668, i64 1
+ %tmp4670 = getelementptr inbounds float* %tmp4669, i64 1
+ %tmp4671 = getelementptr inbounds float* %tmp4670, i64 1
+ %tmp4672 = getelementptr inbounds float* %tmp4671, i64 1
+ %tmp4673 = getelementptr inbounds float* %tmp4672, i64 1
+ %tmp4674 = getelementptr inbounds float* %tmp4673, i64 1
+ %tmp4675 = getelementptr inbounds float* %tmp4674, i64 1
+ %tmp4676 = getelementptr inbounds float* %tmp4675, i64 1
+ %tmp4677 = getelementptr inbounds float* %tmp4676, i64 1
+ %tmp4678 = getelementptr inbounds float* %tmp4677, i64 1
+ %tmp4679 = getelementptr inbounds float* %tmp4678, i64 1
+ %tmp4680 = getelementptr inbounds float* %tmp4679, i64 1
+ %tmp4681 = getelementptr inbounds float* %tmp4680, i64 1
+ %tmp4682 = getelementptr inbounds float* %tmp4681, i64 1
+ %tmp4683 = getelementptr inbounds float* %tmp4682, i64 1
+ %tmp4684 = getelementptr inbounds float* %tmp4683, i64 1
+ %tmp4685 = getelementptr inbounds float* %tmp4684, i64 1
+ %tmp4686 = getelementptr inbounds float* %tmp4685, i64 1
+ %tmp4687 = getelementptr inbounds float* %tmp4686, i64 1
+ %tmp4688 = getelementptr inbounds float* %tmp4687, i64 1
+ %tmp4689 = getelementptr inbounds float* %tmp4688, i64 1
+ %tmp4690 = getelementptr inbounds float* %tmp4689, i64 1
+ %tmp4691 = getelementptr inbounds float* %tmp4690, i64 1
+ %tmp4692 = getelementptr inbounds float* %tmp4691, i64 1
+ %tmp4693 = getelementptr inbounds float* %tmp4692, i64 1
+ %tmp4694 = getelementptr inbounds float* %tmp4693, i64 1
+ %tmp4695 = getelementptr inbounds float* %tmp4694, i64 1
+ %tmp4696 = getelementptr inbounds float* %tmp4695, i64 1
+ %tmp4697 = getelementptr inbounds float* %tmp4696, i64 1
+ %tmp4698 = getelementptr inbounds float* %tmp4697, i64 1
+ %tmp4699 = getelementptr inbounds float* %tmp4698, i64 1
+ %tmp4700 = getelementptr inbounds float* %tmp4699, i64 1
+ %tmp4701 = getelementptr inbounds float* %tmp4700, i64 1
+ %tmp4702 = getelementptr inbounds float* %tmp4701, i64 1
+ %tmp4703 = getelementptr inbounds float* %tmp4702, i64 1
+ %tmp4704 = getelementptr inbounds float* %tmp4703, i64 1
+ %tmp4705 = getelementptr inbounds float* %tmp4704, i64 1
+ %tmp4706 = getelementptr inbounds float* %tmp4705, i64 1
+ %tmp4707 = getelementptr inbounds float* %tmp4706, i64 1
+ %tmp4708 = getelementptr inbounds float* %tmp4707, i64 1
+ %tmp4709 = getelementptr inbounds float* %tmp4708, i64 1
+ %tmp4710 = getelementptr inbounds float* %tmp4709, i64 1
+ %tmp4711 = getelementptr inbounds float* %tmp4710, i64 1
+ %tmp4712 = getelementptr inbounds float* %tmp4711, i64 1
+ %tmp4713 = getelementptr inbounds float* %tmp4712, i64 1
+ %tmp4714 = getelementptr inbounds float* %tmp4713, i64 1
+ %tmp4715 = getelementptr inbounds float* %tmp4714, i64 1
+ %tmp4716 = getelementptr inbounds float* %tmp4715, i64 1
+ %tmp4717 = getelementptr inbounds float* %tmp4716, i64 1
+ %tmp4718 = getelementptr inbounds float* %tmp4717, i64 1
+ %tmp4719 = getelementptr inbounds float* %tmp4718, i64 1
+ %tmp4720 = getelementptr inbounds float* %tmp4719, i64 1
+ %tmp4721 = getelementptr inbounds float* %tmp4720, i64 1
+ %tmp4722 = getelementptr inbounds float* %tmp4721, i64 1
+ %tmp4723 = getelementptr inbounds float* %tmp4722, i64 1
+ %tmp4724 = getelementptr inbounds float* %tmp4723, i64 1
+ %tmp4725 = getelementptr inbounds float* %tmp4724, i64 1
+ %tmp4726 = getelementptr inbounds float* %tmp4725, i64 1
+ %tmp4727 = getelementptr inbounds float* %tmp4726, i64 1
+ %tmp4728 = getelementptr inbounds float* %tmp4727, i64 1
+ %tmp4729 = getelementptr inbounds float* %tmp4728, i64 1
+ %tmp4730 = getelementptr inbounds float* %tmp4729, i64 1
+ %tmp4731 = getelementptr inbounds float* %tmp4730, i64 1
+ %tmp4732 = getelementptr inbounds float* %tmp4731, i64 1
+ %tmp4733 = getelementptr inbounds float* %tmp4732, i64 1
+ %tmp4734 = getelementptr inbounds float* %tmp4733, i64 1
+ %tmp4735 = getelementptr inbounds float* %tmp4734, i64 1
+ %tmp4736 = getelementptr inbounds float* %tmp4735, i64 1
+ %tmp4737 = getelementptr inbounds float* %tmp4736, i64 1
+ %tmp4738 = getelementptr inbounds float* %tmp4737, i64 1
+ %tmp4739 = getelementptr inbounds float* %tmp4738, i64 1
+ %tmp4740 = getelementptr inbounds float* %tmp4739, i64 1
+ %tmp4741 = getelementptr inbounds float* %tmp4740, i64 1
+ %tmp4742 = getelementptr inbounds float* %tmp4741, i64 1
+ %tmp4743 = getelementptr inbounds float* %tmp4742, i64 1
+ %tmp4744 = getelementptr inbounds float* %tmp4743, i64 1
+ %tmp4745 = getelementptr inbounds float* %tmp4744, i64 1
+ %tmp4746 = getelementptr inbounds float* %tmp4745, i64 1
+ %tmp4747 = getelementptr inbounds float* %tmp4746, i64 1
+ %tmp4748 = getelementptr inbounds float* %tmp4747, i64 1
+ %tmp4749 = getelementptr inbounds float* %tmp4748, i64 1
+ %tmp4750 = getelementptr inbounds float* %tmp4749, i64 1
+ %tmp4751 = getelementptr inbounds float* %tmp4750, i64 1
+ %tmp4752 = getelementptr inbounds float* %tmp4751, i64 1
+ %tmp4753 = getelementptr inbounds float* %tmp4752, i64 1
+ %tmp4754 = getelementptr inbounds float* %tmp4753, i64 1
+ %tmp4755 = getelementptr inbounds float* %tmp4754, i64 1
+ %tmp4756 = getelementptr inbounds float* %tmp4755, i64 1
+ %tmp4757 = getelementptr inbounds float* %tmp4756, i64 1
+ %tmp4758 = getelementptr inbounds float* %tmp4757, i64 1
+ %tmp4759 = getelementptr inbounds float* %tmp4758, i64 1
+ %tmp4760 = getelementptr inbounds float* %tmp4759, i64 1
+ %tmp4761 = getelementptr inbounds float* %tmp4760, i64 1
+ %tmp4762 = getelementptr inbounds float* %tmp4761, i64 1
+ %tmp4763 = getelementptr inbounds float* %tmp4762, i64 1
+ %tmp4764 = getelementptr inbounds float* %tmp4763, i64 1
+ %tmp4765 = getelementptr inbounds float* %tmp4764, i64 1
+ %tmp4766 = getelementptr inbounds float* %tmp4765, i64 1
+ %tmp4767 = getelementptr inbounds float* %tmp4766, i64 1
+ %tmp4768 = getelementptr inbounds float* %tmp4767, i64 1
+ %tmp4769 = getelementptr inbounds float* %tmp4768, i64 1
+ %tmp4770 = getelementptr inbounds float* %tmp4769, i64 1
+ %tmp4771 = getelementptr inbounds float* %tmp4770, i64 1
+ %tmp4772 = getelementptr inbounds float* %tmp4771, i64 1
+ %tmp4773 = getelementptr inbounds float* %tmp4772, i64 1
+ %tmp4774 = getelementptr inbounds float* %tmp4773, i64 1
+ %tmp4775 = getelementptr inbounds float* %tmp4774, i64 1
+ %tmp4776 = getelementptr inbounds float* %tmp4775, i64 1
+ %tmp4777 = getelementptr inbounds float* %tmp4776, i64 1
+ %tmp4778 = getelementptr inbounds float* %tmp4777, i64 1
+ %tmp4779 = getelementptr inbounds float* %tmp4778, i64 1
+ %tmp4780 = getelementptr inbounds float* %tmp4779, i64 1
+ %tmp4781 = getelementptr inbounds float* %tmp4780, i64 1
+ %tmp4782 = getelementptr inbounds float* %tmp4781, i64 1
+ %tmp4783 = getelementptr inbounds float* %tmp4782, i64 1
+ %tmp4784 = getelementptr inbounds float* %tmp4783, i64 1
+ %tmp4785 = getelementptr inbounds float* %tmp4784, i64 1
+ %tmp4786 = getelementptr inbounds float* %tmp4785, i64 1
+ %tmp4787 = getelementptr inbounds float* %tmp4786, i64 1
+ %tmp4788 = getelementptr inbounds float* %tmp4787, i64 1
+ %tmp4789 = getelementptr inbounds float* %tmp4788, i64 1
+ %tmp4790 = getelementptr inbounds float* %tmp4789, i64 1
+ %tmp4791 = getelementptr inbounds float* %tmp4790, i64 1
+ %tmp4792 = getelementptr inbounds float* %tmp4791, i64 1
+ %tmp4793 = getelementptr inbounds float* %tmp4792, i64 1
+ %tmp4794 = getelementptr inbounds float* %tmp4793, i64 1
+ %tmp4795 = getelementptr inbounds float* %tmp4794, i64 1
+ %tmp4796 = getelementptr inbounds float* %tmp4795, i64 1
+ %tmp4797 = getelementptr inbounds float* %tmp4796, i64 1
+ %tmp4798 = getelementptr inbounds float* %tmp4797, i64 1
+ %tmp4799 = getelementptr inbounds float* %tmp4798, i64 1
+ %tmp4800 = getelementptr inbounds float* %tmp4799, i64 1
+ %tmp4801 = getelementptr inbounds float* %tmp4800, i64 1
+ %tmp4802 = getelementptr inbounds float* %tmp4801, i64 1
+ %tmp4803 = getelementptr inbounds float* %tmp4802, i64 1
+ %tmp4804 = getelementptr inbounds float* %tmp4803, i64 1
+ %tmp4805 = getelementptr inbounds float* %tmp4804, i64 1
+ %tmp4806 = getelementptr inbounds float* %tmp4805, i64 1
+ %tmp4807 = getelementptr inbounds float* %tmp4806, i64 1
+ %tmp4808 = getelementptr inbounds float* %tmp4807, i64 1
+ %tmp4809 = getelementptr inbounds float* %tmp4808, i64 1
+ %tmp4810 = getelementptr inbounds float* %tmp4809, i64 1
+ %tmp4811 = getelementptr inbounds float* %tmp4810, i64 1
+ %tmp4812 = getelementptr inbounds float* %tmp4811, i64 1
+ %tmp4813 = getelementptr inbounds float* %tmp4812, i64 1
+ %tmp4814 = getelementptr inbounds float* %tmp4813, i64 1
+ %tmp4815 = getelementptr inbounds float* %tmp4814, i64 1
+ %tmp4816 = getelementptr inbounds float* %tmp4815, i64 1
+ %tmp4817 = getelementptr inbounds float* %tmp4816, i64 1
+ %tmp4818 = getelementptr inbounds float* %tmp4817, i64 1
+ %tmp4819 = getelementptr inbounds float* %tmp4818, i64 1
+ %tmp4820 = getelementptr inbounds float* %tmp4819, i64 1
+ %tmp4821 = getelementptr inbounds float* %tmp4820, i64 1
+ %tmp4822 = getelementptr inbounds float* %tmp4821, i64 1
+ %tmp4823 = getelementptr inbounds float* %tmp4822, i64 1
+ %tmp4824 = getelementptr inbounds float* %tmp4823, i64 1
+ %tmp4825 = getelementptr inbounds float* %tmp4824, i64 1
+ %tmp4826 = getelementptr inbounds float* %tmp4825, i64 1
+ %tmp4827 = getelementptr inbounds float* %tmp4826, i64 1
+ %tmp4828 = getelementptr inbounds float* %tmp4827, i64 1
+ %tmp4829 = getelementptr inbounds float* %tmp4828, i64 1
+ %tmp4830 = getelementptr inbounds float* %tmp4829, i64 1
+ %tmp4831 = getelementptr inbounds float* %tmp4830, i64 1
+ %tmp4832 = getelementptr inbounds float* %tmp4831, i64 1
+ %tmp4833 = getelementptr inbounds float* %tmp4832, i64 1
+ %tmp4834 = getelementptr inbounds float* %tmp4833, i64 1
+ %tmp4835 = getelementptr inbounds float* %tmp4834, i64 1
+ %tmp4836 = getelementptr inbounds float* %tmp4835, i64 1
+ %tmp4837 = getelementptr inbounds float* %tmp4836, i64 1
+ %tmp4838 = getelementptr inbounds float* %tmp4837, i64 1
+ %tmp4839 = getelementptr inbounds float* %tmp4838, i64 1
+ %tmp4840 = getelementptr inbounds float* %tmp4839, i64 1
+ %tmp4841 = getelementptr inbounds float* %tmp4840, i64 1
+ %tmp4842 = getelementptr inbounds float* %tmp4841, i64 1
+ %tmp4843 = getelementptr inbounds float* %tmp4842, i64 1
+ %tmp4844 = getelementptr inbounds float* %tmp4843, i64 1
+ %tmp4845 = getelementptr inbounds float* %tmp4844, i64 1
+ %tmp4846 = getelementptr inbounds float* %tmp4845, i64 1
+ %tmp4847 = getelementptr inbounds float* %tmp4846, i64 1
+ %tmp4848 = getelementptr inbounds float* %tmp4847, i64 1
+ %tmp4849 = getelementptr inbounds float* %tmp4848, i64 1
+ %tmp4850 = getelementptr inbounds float* %tmp4849, i64 1
+ %tmp4851 = getelementptr inbounds float* %tmp4850, i64 1
+ %tmp4852 = getelementptr inbounds float* %tmp4851, i64 1
+ %tmp4853 = getelementptr inbounds float* %tmp4852, i64 1
+ %tmp4854 = getelementptr inbounds float* %tmp4853, i64 1
+ %tmp4855 = getelementptr inbounds float* %tmp4854, i64 1
+ %tmp4856 = getelementptr inbounds float* %tmp4855, i64 1
+ %tmp4857 = getelementptr inbounds float* %tmp4856, i64 1
+ %tmp4858 = getelementptr inbounds float* %tmp4857, i64 1
+ %tmp4859 = getelementptr inbounds float* %tmp4858, i64 1
+ %tmp4860 = getelementptr inbounds float* %tmp4859, i64 1
+ %tmp4861 = getelementptr inbounds float* %tmp4860, i64 1
+ %tmp4862 = getelementptr inbounds float* %tmp4861, i64 1
+ %tmp4863 = getelementptr inbounds float* %tmp4862, i64 1
+ %tmp4864 = getelementptr inbounds float* %tmp4863, i64 1
+ %tmp4865 = getelementptr inbounds float* %tmp4864, i64 1
+ %tmp4866 = getelementptr inbounds float* %tmp4865, i64 1
+ %tmp4867 = getelementptr inbounds float* %tmp4866, i64 1
+ %tmp4868 = getelementptr inbounds float* %tmp4867, i64 1
+ %tmp4869 = getelementptr inbounds float* %tmp4868, i64 1
+ %tmp4870 = getelementptr inbounds float* %tmp4869, i64 1
+ %tmp4871 = getelementptr inbounds float* %tmp4870, i64 1
+ %tmp4872 = getelementptr inbounds float* %tmp4871, i64 1
+ %tmp4873 = getelementptr inbounds float* %tmp4872, i64 1
+ %tmp4874 = getelementptr inbounds float* %tmp4873, i64 1
+ %tmp4875 = getelementptr inbounds float* %tmp4874, i64 1
+ %tmp4876 = getelementptr inbounds float* %tmp4875, i64 1
+ %tmp4877 = getelementptr inbounds float* %tmp4876, i64 1
+ %tmp4878 = getelementptr inbounds float* %tmp4877, i64 1
+ %tmp4879 = getelementptr inbounds float* %tmp4878, i64 1
+ %tmp4880 = getelementptr inbounds float* %tmp4879, i64 1
+ %tmp4881 = getelementptr inbounds float* %tmp4880, i64 1
+ %tmp4882 = getelementptr inbounds float* %tmp4881, i64 1
+ %tmp4883 = getelementptr inbounds float* %tmp4882, i64 1
+ %tmp4884 = getelementptr inbounds float* %tmp4883, i64 1
+ %tmp4885 = getelementptr inbounds float* %tmp4884, i64 1
+ %tmp4886 = getelementptr inbounds float* %tmp4885, i64 1
+ %tmp4887 = getelementptr inbounds float* %tmp4886, i64 1
+ %tmp4888 = getelementptr inbounds float* %tmp4887, i64 1
+ %tmp4889 = getelementptr inbounds float* %tmp4888, i64 1
+ %tmp4890 = getelementptr inbounds float* %tmp4889, i64 1
+ %tmp4891 = getelementptr inbounds float* %tmp4890, i64 1
+ %tmp4892 = getelementptr inbounds float* %tmp4891, i64 1
+ %tmp4893 = getelementptr inbounds float* %tmp4892, i64 1
+ %tmp4894 = getelementptr inbounds float* %tmp4893, i64 1
+ %tmp4895 = getelementptr inbounds float* %tmp4894, i64 1
+ %tmp4896 = getelementptr inbounds float* %tmp4895, i64 1
+ %tmp4897 = getelementptr inbounds float* %tmp4896, i64 1
+ %tmp4898 = getelementptr inbounds float* %tmp4897, i64 1
+ %tmp4899 = getelementptr inbounds float* %tmp4898, i64 1
+ %tmp4900 = getelementptr inbounds float* %tmp4899, i64 1
+ %tmp4901 = getelementptr inbounds float* %tmp4900, i64 1
+ %tmp4902 = getelementptr inbounds float* %tmp4901, i64 1
+ %tmp4903 = getelementptr inbounds float* %tmp4902, i64 1
+ %tmp4904 = getelementptr inbounds float* %tmp4903, i64 1
+ %tmp4905 = getelementptr inbounds float* %tmp4904, i64 1
+ %tmp4906 = getelementptr inbounds float* %tmp4905, i64 1
+ %tmp4907 = getelementptr inbounds float* %tmp4906, i64 1
+ %tmp4908 = getelementptr inbounds float* %tmp4907, i64 1
+ %tmp4909 = getelementptr inbounds float* %tmp4908, i64 1
+ %tmp4910 = getelementptr inbounds float* %tmp4909, i64 1
+ %tmp4911 = getelementptr inbounds float* %tmp4910, i64 1
+ %tmp4912 = getelementptr inbounds float* %tmp4911, i64 1
+ %tmp4913 = getelementptr inbounds float* %tmp4912, i64 1
+ %tmp4914 = getelementptr inbounds float* %tmp4913, i64 1
+ %tmp4915 = getelementptr inbounds float* %tmp4914, i64 1
+ %tmp4916 = getelementptr inbounds float* %tmp4915, i64 1
+ %tmp4917 = getelementptr inbounds float* %tmp4916, i64 1
+ %tmp4918 = getelementptr inbounds float* %tmp4917, i64 1
+ %tmp4919 = getelementptr inbounds float* %tmp4918, i64 1
+ %tmp4920 = getelementptr inbounds float* %tmp4919, i64 1
+ %tmp4921 = getelementptr inbounds float* %tmp4920, i64 1
+ %tmp4922 = getelementptr inbounds float* %tmp4921, i64 1
+ %tmp4923 = getelementptr inbounds float* %tmp4922, i64 1
+ %tmp4924 = getelementptr inbounds float* %tmp4923, i64 1
+ %tmp4925 = getelementptr inbounds float* %tmp4924, i64 1
+ %tmp4926 = getelementptr inbounds float* %tmp4925, i64 1
+ %tmp4927 = getelementptr inbounds float* %tmp4926, i64 1
+ %tmp4928 = getelementptr inbounds float* %tmp4927, i64 1
+ %tmp4929 = getelementptr inbounds float* %tmp4928, i64 1
+ %tmp4930 = getelementptr inbounds float* %tmp4929, i64 1
+ %tmp4931 = getelementptr inbounds float* %tmp4930, i64 1
+ %tmp4932 = getelementptr inbounds float* %tmp4931, i64 1
+ %tmp4933 = getelementptr inbounds float* %tmp4932, i64 1
+ %tmp4934 = getelementptr inbounds float* %tmp4933, i64 1
+ %tmp4935 = getelementptr inbounds float* %tmp4934, i64 1
+ %tmp4936 = getelementptr inbounds float* %tmp4935, i64 1
+ %tmp4937 = getelementptr inbounds float* %tmp4936, i64 1
+ %tmp4938 = getelementptr inbounds float* %tmp4937, i64 1
+ %tmp4939 = getelementptr inbounds float* %tmp4938, i64 1
+ %tmp4940 = getelementptr inbounds float* %tmp4939, i64 1
+ %tmp4941 = getelementptr inbounds float* %tmp4940, i64 1
+ %tmp4942 = getelementptr inbounds float* %tmp4941, i64 1
+ %tmp4943 = getelementptr inbounds float* %tmp4942, i64 1
+ %tmp4944 = getelementptr inbounds float* %tmp4943, i64 1
+ %tmp4945 = getelementptr inbounds float* %tmp4944, i64 1
+ %tmp4946 = getelementptr inbounds float* %tmp4945, i64 1
+ %tmp4947 = getelementptr inbounds float* %tmp4946, i64 1
+ %tmp4948 = getelementptr inbounds float* %tmp4947, i64 1
+ %tmp4949 = getelementptr inbounds float* %tmp4948, i64 1
+ %tmp4950 = getelementptr inbounds float* %tmp4949, i64 1
+ %tmp4951 = getelementptr inbounds float* %tmp4950, i64 1
+ %tmp4952 = getelementptr inbounds float* %tmp4951, i64 1
+ %tmp4953 = getelementptr inbounds float* %tmp4952, i64 1
+ %tmp4954 = getelementptr inbounds float* %tmp4953, i64 1
+ %tmp4955 = getelementptr inbounds float* %tmp4954, i64 1
+ %tmp4956 = getelementptr inbounds float* %tmp4955, i64 1
+ %tmp4957 = getelementptr inbounds float* %tmp4956, i64 1
+ %tmp4958 = getelementptr inbounds float* %tmp4957, i64 1
+ %tmp4959 = getelementptr inbounds float* %tmp4958, i64 1
+ %tmp4960 = getelementptr inbounds float* %tmp4959, i64 1
+ %tmp4961 = getelementptr inbounds float* %tmp4960, i64 1
+ %tmp4962 = getelementptr inbounds float* %tmp4961, i64 1
+ %tmp4963 = getelementptr inbounds float* %tmp4962, i64 1
+ %tmp4964 = getelementptr inbounds float* %tmp4963, i64 1
+ %tmp4965 = getelementptr inbounds float* %tmp4964, i64 1
+ %tmp4966 = getelementptr inbounds float* %tmp4965, i64 1
+ %tmp4967 = getelementptr inbounds float* %tmp4966, i64 1
+ %tmp4968 = getelementptr inbounds float* %tmp4967, i64 1
+ %tmp4969 = getelementptr inbounds float* %tmp4968, i64 1
+ %tmp4970 = getelementptr inbounds float* %tmp4969, i64 1
+ %tmp4971 = getelementptr inbounds float* %tmp4970, i64 1
+ %tmp4972 = getelementptr inbounds float* %tmp4971, i64 1
+ %tmp4973 = getelementptr inbounds float* %tmp4972, i64 1
+ %tmp4974 = getelementptr inbounds float* %tmp4973, i64 1
+ %tmp4975 = getelementptr inbounds float* %tmp4974, i64 1
+ %tmp4976 = getelementptr inbounds float* %tmp4975, i64 1
+ %tmp4977 = getelementptr inbounds float* %tmp4976, i64 1
+ %tmp4978 = getelementptr inbounds float* %tmp4977, i64 1
+ %tmp4979 = getelementptr inbounds float* %tmp4978, i64 1
+ %tmp4980 = getelementptr inbounds float* %tmp4979, i64 1
+ %tmp4981 = getelementptr inbounds float* %tmp4980, i64 1
+ %tmp4982 = getelementptr inbounds float* %tmp4981, i64 1
+ %tmp4983 = getelementptr inbounds float* %tmp4982, i64 1
+ %tmp4984 = getelementptr inbounds float* %tmp4983, i64 1
+ %tmp4985 = getelementptr inbounds float* %tmp4984, i64 1
+ %tmp4986 = getelementptr inbounds float* %tmp4985, i64 1
+ %tmp4987 = getelementptr inbounds float* %tmp4986, i64 1
+ %tmp4988 = getelementptr inbounds float* %tmp4987, i64 1
+ %tmp4989 = getelementptr inbounds float* %tmp4988, i64 1
+ %tmp4990 = getelementptr inbounds float* %tmp4989, i64 1
+ %tmp4991 = getelementptr inbounds float* %tmp4990, i64 1
+ %tmp4992 = getelementptr inbounds float* %tmp4991, i64 1
+ %tmp4993 = getelementptr inbounds float* %tmp4992, i64 1
+ %tmp4994 = getelementptr inbounds float* %tmp4993, i64 1
+ %tmp4995 = getelementptr inbounds float* %tmp4994, i64 1
+ %tmp4996 = getelementptr inbounds float* %tmp4995, i64 1
+ %tmp4997 = getelementptr inbounds float* %tmp4996, i64 1
+ %tmp4998 = getelementptr inbounds float* %tmp4997, i64 1
+ %tmp4999 = getelementptr inbounds float* %tmp4998, i64 1
+ %tmp5000 = getelementptr inbounds float* %tmp4999, i64 1
+ %tmp5001 = getelementptr inbounds float* %tmp5000, i64 1
+ %tmp5002 = getelementptr inbounds float* %tmp5001, i64 1
+ %tmp5003 = getelementptr inbounds float* %tmp5002, i64 1
+ %tmp5004 = getelementptr inbounds float* %tmp5003, i64 1
+ %tmp5005 = getelementptr inbounds float* %tmp5004, i64 1
+ %tmp5006 = getelementptr inbounds float* %tmp5005, i64 1
+ %tmp5007 = getelementptr inbounds float* %tmp5006, i64 1
+ %tmp5008 = getelementptr inbounds float* %tmp5007, i64 1
+ %tmp5009 = getelementptr inbounds float* %tmp5008, i64 1
+ %tmp5010 = getelementptr inbounds float* %tmp5009, i64 1
+ %tmp5011 = getelementptr inbounds float* %tmp5010, i64 1
+ %tmp5012 = getelementptr inbounds float* %tmp5011, i64 1
+ %tmp5013 = getelementptr inbounds float* %tmp5012, i64 1
+ %tmp5014 = getelementptr inbounds float* %tmp5013, i64 1
+ %tmp5015 = getelementptr inbounds float* %tmp5014, i64 1
+ %tmp5016 = getelementptr inbounds float* %tmp5015, i64 1
+ %tmp5017 = getelementptr inbounds float* %tmp5016, i64 1
+ %tmp5018 = getelementptr inbounds float* %tmp5017, i64 1
+ %tmp5019 = getelementptr inbounds float* %tmp5018, i64 1
+ %tmp5020 = getelementptr inbounds float* %tmp5019, i64 1
+ %tmp5021 = getelementptr inbounds float* %tmp5020, i64 1
+ %tmp5022 = getelementptr inbounds float* %tmp5021, i64 1
+ %tmp5023 = getelementptr inbounds float* %tmp5022, i64 1
+ %tmp5024 = getelementptr inbounds float* %tmp5023, i64 1
+ %tmp5025 = getelementptr inbounds float* %tmp5024, i64 1
+ %tmp5026 = getelementptr inbounds float* %tmp5025, i64 1
+ %tmp5027 = getelementptr inbounds float* %tmp5026, i64 1
+ %tmp5028 = getelementptr inbounds float* %tmp5027, i64 1
+ %tmp5029 = getelementptr inbounds float* %tmp5028, i64 1
+ %tmp5030 = getelementptr inbounds float* %tmp5029, i64 1
+ %tmp5031 = getelementptr inbounds float* %tmp5030, i64 1
+ %tmp5032 = getelementptr inbounds float* %tmp5031, i64 1
+ %tmp5033 = getelementptr inbounds float* %tmp5032, i64 1
+ %tmp5034 = getelementptr inbounds float* %tmp5033, i64 1
+ %tmp5035 = getelementptr inbounds float* %tmp5034, i64 1
+ %tmp5036 = getelementptr inbounds float* %tmp5035, i64 1
+ %tmp5037 = getelementptr inbounds float* %tmp5036, i64 1
+ %tmp5038 = getelementptr inbounds float* %tmp5037, i64 1
+ %tmp5039 = getelementptr inbounds float* %tmp5038, i64 1
+ %tmp5040 = getelementptr inbounds float* %tmp5039, i64 1
+ %tmp5041 = getelementptr inbounds float* %tmp5040, i64 1
+ %tmp5042 = getelementptr inbounds float* %tmp5041, i64 1
+ %tmp5043 = getelementptr inbounds float* %tmp5042, i64 1
+ %tmp5044 = getelementptr inbounds float* %tmp5043, i64 1
+ %tmp5045 = getelementptr inbounds float* %tmp5044, i64 1
+ %tmp5046 = getelementptr inbounds float* %tmp5045, i64 1
+ %tmp5047 = getelementptr inbounds float* %tmp5046, i64 1
+ %tmp5048 = getelementptr inbounds float* %tmp5047, i64 1
+ %tmp5049 = getelementptr inbounds float* %tmp5048, i64 1
+ %tmp5050 = getelementptr inbounds float* %tmp5049, i64 1
+ %tmp5051 = getelementptr inbounds float* %tmp5050, i64 1
+ %tmp5052 = getelementptr inbounds float* %tmp5051, i64 1
+ %tmp5053 = getelementptr inbounds float* %tmp5052, i64 1
+ %tmp5054 = getelementptr inbounds float* %tmp5053, i64 1
+ %tmp5055 = getelementptr inbounds float* %tmp5054, i64 1
+ %tmp5056 = getelementptr inbounds float* %tmp5055, i64 1
+ %tmp5057 = getelementptr inbounds float* %tmp5056, i64 1
+ %tmp5058 = getelementptr inbounds float* %tmp5057, i64 1
+ %tmp5059 = getelementptr inbounds float* %tmp5058, i64 1
+ %tmp5060 = getelementptr inbounds float* %tmp5059, i64 1
+ %tmp5061 = getelementptr inbounds float* %tmp5060, i64 1
+ %tmp5062 = getelementptr inbounds float* %tmp5061, i64 1
+ %tmp5063 = getelementptr inbounds float* %tmp5062, i64 1
+ %tmp5064 = getelementptr inbounds float* %tmp5063, i64 1
+ %tmp5065 = getelementptr inbounds float* %tmp5064, i64 1
+ %tmp5066 = getelementptr inbounds float* %tmp5065, i64 1
+ %tmp5067 = getelementptr inbounds float* %tmp5066, i64 1
+ %tmp5068 = getelementptr inbounds float* %tmp5067, i64 1
+ %tmp5069 = getelementptr inbounds float* %tmp5068, i64 1
+ %tmp5070 = getelementptr inbounds float* %tmp5069, i64 1
+ %tmp5071 = getelementptr inbounds float* %tmp5070, i64 1
+ %tmp5072 = getelementptr inbounds float* %tmp5071, i64 1
+ %tmp5073 = getelementptr inbounds float* %tmp5072, i64 1
+ %tmp5074 = getelementptr inbounds float* %tmp5073, i64 1
+ %tmp5075 = getelementptr inbounds float* %tmp5074, i64 1
+ %tmp5076 = getelementptr inbounds float* %tmp5075, i64 1
+ %tmp5077 = getelementptr inbounds float* %tmp5076, i64 1
+ %tmp5078 = getelementptr inbounds float* %tmp5077, i64 1
+ %tmp5079 = getelementptr inbounds float* %tmp5078, i64 1
+ %tmp5080 = getelementptr inbounds float* %tmp5079, i64 1
+ %tmp5081 = getelementptr inbounds float* %tmp5080, i64 1
+ %tmp5082 = getelementptr inbounds float* %tmp5081, i64 1
+ %tmp5083 = getelementptr inbounds float* %tmp5082, i64 1
+ %tmp5084 = getelementptr inbounds float* %tmp5083, i64 1
+ %tmp5085 = getelementptr inbounds float* %tmp5084, i64 1
+ %tmp5086 = getelementptr inbounds float* %tmp5085, i64 1
+ %tmp5087 = getelementptr inbounds float* %tmp5086, i64 1
+ %tmp5088 = getelementptr inbounds float* %tmp5087, i64 1
+ %tmp5089 = getelementptr inbounds float* %tmp5088, i64 1
+ %tmp5090 = getelementptr inbounds float* %tmp5089, i64 1
+ %tmp5091 = getelementptr inbounds float* %tmp5090, i64 1
+ %tmp5092 = getelementptr inbounds float* %tmp5091, i64 1
+ %tmp5093 = getelementptr inbounds float* %tmp5092, i64 1
+ %tmp5094 = getelementptr inbounds float* %tmp5093, i64 1
+ %tmp5095 = getelementptr inbounds float* %tmp5094, i64 1
+ %tmp5096 = getelementptr inbounds float* %tmp5095, i64 1
+ %tmp5097 = getelementptr inbounds float* %tmp5096, i64 1
+ %tmp5098 = getelementptr inbounds float* %tmp5097, i64 1
+ %tmp5099 = getelementptr inbounds float* %tmp5098, i64 1
+ %tmp5100 = getelementptr inbounds float* %tmp5099, i64 1
+ %tmp5101 = getelementptr inbounds float* %tmp5100, i64 1
+ %tmp5102 = getelementptr inbounds float* %tmp5101, i64 1
+ %tmp5103 = getelementptr inbounds float* %tmp5102, i64 1
+ %tmp5104 = getelementptr inbounds float* %tmp5103, i64 1
+ %tmp5105 = getelementptr inbounds float* %tmp5104, i64 1
+ %tmp5106 = getelementptr inbounds float* %tmp5105, i64 1
+ %tmp5107 = getelementptr inbounds float* %tmp5106, i64 1
+ %tmp5108 = getelementptr inbounds float* %tmp5107, i64 1
+ %tmp5109 = getelementptr inbounds float* %tmp5108, i64 1
+ %tmp5110 = getelementptr inbounds float* %tmp5109, i64 1
+ %tmp5111 = getelementptr inbounds float* %tmp5110, i64 1
+ %tmp5112 = getelementptr inbounds float* %tmp5111, i64 1
+ %tmp5113 = getelementptr inbounds float* %tmp5112, i64 1
+ %tmp5114 = getelementptr inbounds float* %tmp5113, i64 1
+ %tmp5115 = getelementptr inbounds float* %tmp5114, i64 1
+ %tmp5116 = getelementptr inbounds float* %tmp5115, i64 1
+ %tmp5117 = getelementptr inbounds float* %tmp5116, i64 1
+ %tmp5118 = getelementptr inbounds float* %tmp5117, i64 1
+ %tmp5119 = getelementptr inbounds float* %tmp5118, i64 1
+ %tmp5120 = getelementptr inbounds float* %tmp5119, i64 1
+ %tmp5121 = getelementptr inbounds float* %tmp5120, i64 1
+ %tmp5122 = getelementptr inbounds float* %tmp5121, i64 1
+ %tmp5123 = getelementptr inbounds float* %tmp5122, i64 1
+ %tmp5124 = getelementptr inbounds float* %tmp5123, i64 1
+ %tmp5125 = getelementptr inbounds float* %tmp5124, i64 1
+ %tmp5126 = getelementptr inbounds float* %tmp5125, i64 1
+ %tmp5127 = getelementptr inbounds float* %tmp5126, i64 1
+ %tmp5128 = getelementptr inbounds float* %tmp5127, i64 1
+ %tmp5129 = getelementptr inbounds float* %tmp5128, i64 1
+ %tmp5130 = getelementptr inbounds float* %tmp5129, i64 1
+ %tmp5131 = getelementptr inbounds float* %tmp5130, i64 1
+ %tmp5132 = getelementptr inbounds float* %tmp5131, i64 1
+ %tmp5133 = getelementptr inbounds float* %tmp5132, i64 1
+ %tmp5134 = getelementptr inbounds float* %tmp5133, i64 1
+ %tmp5135 = getelementptr inbounds float* %tmp5134, i64 1
+ %tmp5136 = getelementptr inbounds float* %tmp5135, i64 1
+ %tmp5137 = getelementptr inbounds float* %tmp5136, i64 1
+ %tmp5138 = getelementptr inbounds float* %tmp5137, i64 1
+ %tmp5139 = getelementptr inbounds float* %tmp5138, i64 1
+ %tmp5140 = getelementptr inbounds float* %tmp5139, i64 1
+ %tmp5141 = getelementptr inbounds float* %tmp5140, i64 1
+ %tmp5142 = getelementptr inbounds float* %tmp5141, i64 1
+ %tmp5143 = getelementptr inbounds float* %tmp5142, i64 1
+ %tmp5144 = getelementptr inbounds float* %tmp5143, i64 1
+ %tmp5145 = getelementptr inbounds float* %tmp5144, i64 1
+ %tmp5146 = getelementptr inbounds float* %tmp5145, i64 1
+ %tmp5147 = getelementptr inbounds float* %tmp5146, i64 1
+ %tmp5148 = getelementptr inbounds float* %tmp5147, i64 1
+ %tmp5149 = getelementptr inbounds float* %tmp5148, i64 1
+ %tmp5150 = getelementptr inbounds float* %tmp5149, i64 1
+ %tmp5151 = getelementptr inbounds float* %tmp5150, i64 1
+ %tmp5152 = getelementptr inbounds float* %tmp5151, i64 1
+ %tmp5153 = getelementptr inbounds float* %tmp5152, i64 1
+ %tmp5154 = getelementptr inbounds float* %tmp5153, i64 1
+ %tmp5155 = getelementptr inbounds float* %tmp5154, i64 1
+ %tmp5156 = getelementptr inbounds float* %tmp5155, i64 1
+ %tmp5157 = getelementptr inbounds float* %tmp5156, i64 1
+ %tmp5158 = getelementptr inbounds float* %tmp5157, i64 1
+ %tmp5159 = getelementptr inbounds float* %tmp5158, i64 1
+ %tmp5160 = getelementptr inbounds float* %tmp5159, i64 1
+ %tmp5161 = getelementptr inbounds float* %tmp5160, i64 1
+ %tmp5162 = getelementptr inbounds float* %tmp5161, i64 1
+ %tmp5163 = getelementptr inbounds float* %tmp5162, i64 1
+ %tmp5164 = getelementptr inbounds float* %tmp5163, i64 1
+ %tmp5165 = getelementptr inbounds float* %tmp5164, i64 1
+ %tmp5166 = getelementptr inbounds float* %tmp5165, i64 1
+ %tmp5167 = getelementptr inbounds float* %tmp5166, i64 1
+ %tmp5168 = getelementptr inbounds float* %tmp5167, i64 1
+ %tmp5169 = getelementptr inbounds float* %tmp5168, i64 1
+ %tmp5170 = getelementptr inbounds float* %tmp5169, i64 1
+ %tmp5171 = getelementptr inbounds float* %tmp5170, i64 1
+ %tmp5172 = getelementptr inbounds float* %tmp5171, i64 1
+ %tmp5173 = getelementptr inbounds float* %tmp5172, i64 1
+ %tmp5174 = getelementptr inbounds float* %tmp5173, i64 1
+ %tmp5175 = getelementptr inbounds float* %tmp5174, i64 1
+ %tmp5176 = getelementptr inbounds float* %tmp5175, i64 1
+ %tmp5177 = getelementptr inbounds float* %tmp5176, i64 1
+ %tmp5178 = getelementptr inbounds float* %tmp5177, i64 1
+ %tmp5179 = getelementptr inbounds float* %tmp5178, i64 1
+ %tmp5180 = getelementptr inbounds float* %tmp5179, i64 1
+ %tmp5181 = getelementptr inbounds float* %tmp5180, i64 1
+ %tmp5182 = getelementptr inbounds float* %tmp5181, i64 1
+ %tmp5183 = getelementptr inbounds float* %tmp5182, i64 1
+ %tmp5184 = getelementptr inbounds float* %tmp5183, i64 1
+ %tmp5185 = getelementptr inbounds float* %tmp5184, i64 1
+ %tmp5186 = getelementptr inbounds float* %tmp5185, i64 1
+ %tmp5187 = getelementptr inbounds float* %tmp5186, i64 1
+ %tmp5188 = getelementptr inbounds float* %tmp5187, i64 1
+ %tmp5189 = getelementptr inbounds float* %tmp5188, i64 1
+ %tmp5190 = getelementptr inbounds float* %tmp5189, i64 1
+ %tmp5191 = getelementptr inbounds float* %tmp5190, i64 1
+ %tmp5192 = getelementptr inbounds float* %tmp5191, i64 1
+ %tmp5193 = getelementptr inbounds float* %tmp5192, i64 1
+ %tmp5194 = getelementptr inbounds float* %tmp5193, i64 1
+ %tmp5195 = getelementptr inbounds float* %tmp5194, i64 1
+ %tmp5196 = getelementptr inbounds float* %tmp5195, i64 1
+ %tmp5197 = getelementptr inbounds float* %tmp5196, i64 1
+ %tmp5198 = getelementptr inbounds float* %tmp5197, i64 1
+ %tmp5199 = getelementptr inbounds float* %tmp5198, i64 1
+ %tmp5200 = getelementptr inbounds float* %tmp5199, i64 1
+ %tmp5201 = getelementptr inbounds float* %tmp5200, i64 1
+ %tmp5202 = getelementptr inbounds float* %tmp5201, i64 1
+ %tmp5203 = getelementptr inbounds float* %tmp5202, i64 1
+ %tmp5204 = getelementptr inbounds float* %tmp5203, i64 1
+ %tmp5205 = getelementptr inbounds float* %tmp5204, i64 1
+ %tmp5206 = getelementptr inbounds float* %tmp5205, i64 1
+ %tmp5207 = getelementptr inbounds float* %tmp5206, i64 1
+ %tmp5208 = getelementptr inbounds float* %tmp5207, i64 1
+ %tmp5209 = getelementptr inbounds float* %tmp5208, i64 1
+ %tmp5210 = getelementptr inbounds float* %tmp5209, i64 1
+ %tmp5211 = getelementptr inbounds float* %tmp5210, i64 1
+ %tmp5212 = getelementptr inbounds float* %tmp5211, i64 1
+ %tmp5213 = getelementptr inbounds float* %tmp5212, i64 1
+ %tmp5214 = getelementptr inbounds float* %tmp5213, i64 1
+ %tmp5215 = getelementptr inbounds float* %tmp5214, i64 1
+ %tmp5216 = getelementptr inbounds float* %tmp5215, i64 1
+ %tmp5217 = getelementptr inbounds float* %tmp5216, i64 1
+ %tmp5218 = getelementptr inbounds float* %tmp5217, i64 1
+ %tmp5219 = getelementptr inbounds float* %tmp5218, i64 1
+ %tmp5220 = getelementptr inbounds float* %tmp5219, i64 1
+ %tmp5221 = getelementptr inbounds float* %tmp5220, i64 1
+ %tmp5222 = getelementptr inbounds float* %tmp5221, i64 1
+ %tmp5223 = getelementptr inbounds float* %tmp5222, i64 1
+ %tmp5224 = getelementptr inbounds float* %tmp5223, i64 1
+ %tmp5225 = getelementptr inbounds float* %tmp5224, i64 1
+ %tmp5226 = getelementptr inbounds float* %tmp5225, i64 1
+ %tmp5227 = getelementptr inbounds float* %tmp5226, i64 1
+ %tmp5228 = getelementptr inbounds float* %tmp5227, i64 1
+ %tmp5229 = getelementptr inbounds float* %tmp5228, i64 1
+ %tmp5230 = getelementptr inbounds float* %tmp5229, i64 1
+ %tmp5231 = getelementptr inbounds float* %tmp5230, i64 1
+ %tmp5232 = getelementptr inbounds float* %tmp5231, i64 1
+ %tmp5233 = getelementptr inbounds float* %tmp5232, i64 1
+ %tmp5234 = getelementptr inbounds float* %tmp5233, i64 1
+ %tmp5235 = getelementptr inbounds float* %tmp5234, i64 1
+ %tmp5236 = getelementptr inbounds float* %tmp5235, i64 1
+ %tmp5237 = getelementptr inbounds float* %tmp5236, i64 1
+ %tmp5238 = getelementptr inbounds float* %tmp5237, i64 1
+ %tmp5239 = getelementptr inbounds float* %tmp5238, i64 1
+ %tmp5240 = getelementptr inbounds float* %tmp5239, i64 1
+ %tmp5241 = getelementptr inbounds float* %tmp5240, i64 1
+ %tmp5242 = getelementptr inbounds float* %tmp5241, i64 1
+ %tmp5243 = getelementptr inbounds float* %tmp5242, i64 1
+ %tmp5244 = getelementptr inbounds float* %tmp5243, i64 1
+ %tmp5245 = getelementptr inbounds float* %tmp5244, i64 1
+ %tmp5246 = getelementptr inbounds float* %tmp5245, i64 1
+ %tmp5247 = getelementptr inbounds float* %tmp5246, i64 1
+ %tmp5248 = getelementptr inbounds float* %tmp5247, i64 1
+ %tmp5249 = getelementptr inbounds float* %tmp5248, i64 1
+ %tmp5250 = getelementptr inbounds float* %tmp5249, i64 1
+ %tmp5251 = getelementptr inbounds float* %tmp5250, i64 1
+ %tmp5252 = getelementptr inbounds float* %tmp5251, i64 1
+ %tmp5253 = getelementptr inbounds float* %tmp5252, i64 1
+ %tmp5254 = getelementptr inbounds float* %tmp5253, i64 1
+ %tmp5255 = getelementptr inbounds float* %tmp5254, i64 1
+ %tmp5256 = getelementptr inbounds float* %tmp5255, i64 1
+ %tmp5257 = getelementptr inbounds float* %tmp5256, i64 1
+ %tmp5258 = getelementptr inbounds float* %tmp5257, i64 1
+ %tmp5259 = getelementptr inbounds float* %tmp5258, i64 1
+ %tmp5260 = getelementptr inbounds float* %tmp5259, i64 1
+ %tmp5261 = getelementptr inbounds float* %tmp5260, i64 1
+ %tmp5262 = getelementptr inbounds float* %tmp5261, i64 1
+ %tmp5263 = getelementptr inbounds float* %tmp5262, i64 1
+ %tmp5264 = getelementptr inbounds float* %tmp5263, i64 1
+ %tmp5265 = getelementptr inbounds float* %tmp5264, i64 1
+ %tmp5266 = getelementptr inbounds float* %tmp5265, i64 1
+ %tmp5267 = getelementptr inbounds float* %tmp5266, i64 1
+ %tmp5268 = getelementptr inbounds float* %tmp5267, i64 1
+ %tmp5269 = getelementptr inbounds float* %tmp5268, i64 1
+ %tmp5270 = getelementptr inbounds float* %tmp5269, i64 1
+ %tmp5271 = getelementptr inbounds float* %tmp5270, i64 1
+ %tmp5272 = getelementptr inbounds float* %tmp5271, i64 1
+ %tmp5273 = getelementptr inbounds float* %tmp5272, i64 1
+ %tmp5274 = getelementptr inbounds float* %tmp5273, i64 1
+ %tmp5275 = getelementptr inbounds float* %tmp5274, i64 1
+ %tmp5276 = getelementptr inbounds float* %tmp5275, i64 1
+ %tmp5277 = getelementptr inbounds float* %tmp5276, i64 1
+ %tmp5278 = getelementptr inbounds float* %tmp5277, i64 1
+ %tmp5279 = getelementptr inbounds float* %tmp5278, i64 1
+ %tmp5280 = getelementptr inbounds float* %tmp5279, i64 1
+ %tmp5281 = getelementptr inbounds float* %tmp5280, i64 1
+ %tmp5282 = getelementptr inbounds float* %tmp5281, i64 1
+ %tmp5283 = getelementptr inbounds float* %tmp5282, i64 1
+ %tmp5284 = getelementptr inbounds float* %tmp5283, i64 1
+ %tmp5285 = getelementptr inbounds float* %tmp5284, i64 1
+ %tmp5286 = getelementptr inbounds float* %tmp5285, i64 1
+ %tmp5287 = getelementptr inbounds float* %tmp5286, i64 1
+ %tmp5288 = getelementptr inbounds float* %tmp5287, i64 1
+ %tmp5289 = getelementptr inbounds float* %tmp5288, i64 1
+ %tmp5290 = getelementptr inbounds float* %tmp5289, i64 1
+ %tmp5291 = getelementptr inbounds float* %tmp5290, i64 1
+ %tmp5292 = getelementptr inbounds float* %tmp5291, i64 1
+ %tmp5293 = getelementptr inbounds float* %tmp5292, i64 1
+ %tmp5294 = getelementptr inbounds float* %tmp5293, i64 1
+ %tmp5295 = getelementptr inbounds float* %tmp5294, i64 1
+ %tmp5296 = getelementptr inbounds float* %tmp5295, i64 1
+ %tmp5297 = getelementptr inbounds float* %tmp5296, i64 1
+ %tmp5298 = getelementptr inbounds float* %tmp5297, i64 1
+ %tmp5299 = getelementptr inbounds float* %tmp5298, i64 1
+ %tmp5300 = getelementptr inbounds float* %tmp5299, i64 1
+ %tmp5301 = getelementptr inbounds float* %tmp5300, i64 1
+ %tmp5302 = getelementptr inbounds float* %tmp5301, i64 1
+ %tmp5303 = getelementptr inbounds float* %tmp5302, i64 1
+ %tmp5304 = getelementptr inbounds float* %tmp5303, i64 1
+ %tmp5305 = getelementptr inbounds float* %tmp5304, i64 1
+ %tmp5306 = getelementptr inbounds float* %tmp5305, i64 1
+ %tmp5307 = getelementptr inbounds float* %tmp5306, i64 1
+ %tmp5308 = getelementptr inbounds float* %tmp5307, i64 1
+ %tmp5309 = getelementptr inbounds float* %tmp5308, i64 1
+ %tmp5310 = getelementptr inbounds float* %tmp5309, i64 1
+ %tmp5311 = getelementptr inbounds float* %tmp5310, i64 1
+ %tmp5312 = getelementptr inbounds float* %tmp5311, i64 1
+ %tmp5313 = getelementptr inbounds float* %tmp5312, i64 1
+ %tmp5314 = getelementptr inbounds float* %tmp5313, i64 1
+ %tmp5315 = getelementptr inbounds float* %tmp5314, i64 1
+ %tmp5316 = getelementptr inbounds float* %tmp5315, i64 1
+ %tmp5317 = getelementptr inbounds float* %tmp5316, i64 1
+ %tmp5318 = getelementptr inbounds float* %tmp5317, i64 1
+ %tmp5319 = getelementptr inbounds float* %tmp5318, i64 1
+ %tmp5320 = getelementptr inbounds float* %tmp5319, i64 1
+ %tmp5321 = getelementptr inbounds float* %tmp5320, i64 1
+ %tmp5322 = getelementptr inbounds float* %tmp5321, i64 1
+ %tmp5323 = getelementptr inbounds float* %tmp5322, i64 1
+ %tmp5324 = getelementptr inbounds float* %tmp5323, i64 1
+ %tmp5325 = getelementptr inbounds float* %tmp5324, i64 1
+ %tmp5326 = getelementptr inbounds float* %tmp5325, i64 1
+ %tmp5327 = getelementptr inbounds float* %tmp5326, i64 1
+ %tmp5328 = getelementptr inbounds float* %tmp5327, i64 1
+ %tmp5329 = getelementptr inbounds float* %tmp5328, i64 1
+ %tmp5330 = getelementptr inbounds float* %tmp5329, i64 1
+ %tmp5331 = getelementptr inbounds float* %tmp5330, i64 1
+ %tmp5332 = getelementptr inbounds float* %tmp5331, i64 1
+ %tmp5333 = getelementptr inbounds float* %tmp5332, i64 1
+ %tmp5334 = getelementptr inbounds float* %tmp5333, i64 1
+ %tmp5335 = getelementptr inbounds float* %tmp5334, i64 1
+ %tmp5336 = getelementptr inbounds float* %tmp5335, i64 1
+ %tmp5337 = getelementptr inbounds float* %tmp5336, i64 1
+ %tmp5338 = getelementptr inbounds float* %tmp5337, i64 1
+ %tmp5339 = getelementptr inbounds float* %tmp5338, i64 1
+ %tmp5340 = getelementptr inbounds float* %tmp5339, i64 1
+ %tmp5341 = getelementptr inbounds float* %tmp5340, i64 1
+ %tmp5342 = getelementptr inbounds float* %tmp5341, i64 1
+ %tmp5343 = getelementptr inbounds float* %tmp5342, i64 1
+ %tmp5344 = getelementptr inbounds float* %tmp5343, i64 1
+ %tmp5345 = getelementptr inbounds float* %tmp5344, i64 1
+ %tmp5346 = getelementptr inbounds float* %tmp5345, i64 1
+ %tmp5347 = getelementptr inbounds float* %tmp5346, i64 1
+ %tmp5348 = getelementptr inbounds float* %tmp5347, i64 1
+ %tmp5349 = getelementptr inbounds float* %tmp5348, i64 1
+ %tmp5350 = getelementptr inbounds float* %tmp5349, i64 1
+ %tmp5351 = getelementptr inbounds float* %tmp5350, i64 1
+ %tmp5352 = getelementptr inbounds float* %tmp5351, i64 1
+ %tmp5353 = getelementptr inbounds float* %tmp5352, i64 1
+ %tmp5354 = getelementptr inbounds float* %tmp5353, i64 1
+ %tmp5355 = getelementptr inbounds float* %tmp5354, i64 1
+ %tmp5356 = getelementptr inbounds float* %tmp5355, i64 1
+ %tmp5357 = getelementptr inbounds float* %tmp5356, i64 1
+ %tmp5358 = getelementptr inbounds float* %tmp5357, i64 1
+ %tmp5359 = getelementptr inbounds float* %tmp5358, i64 1
+ %tmp5360 = getelementptr inbounds float* %tmp5359, i64 1
+ %tmp5361 = getelementptr inbounds float* %tmp5360, i64 1
+ %tmp5362 = getelementptr inbounds float* %tmp5361, i64 1
+ %tmp5363 = getelementptr inbounds float* %tmp5362, i64 1
+ %tmp5364 = getelementptr inbounds float* %tmp5363, i64 1
+ %tmp5365 = getelementptr inbounds float* %tmp5364, i64 1
+ %tmp5366 = getelementptr inbounds float* %tmp5365, i64 1
+ %tmp5367 = getelementptr inbounds float* %tmp5366, i64 1
+ %tmp5368 = getelementptr inbounds float* %tmp5367, i64 1
+ %tmp5369 = getelementptr inbounds float* %tmp5368, i64 1
+ %tmp5370 = getelementptr inbounds float* %tmp5369, i64 1
+ %tmp5371 = getelementptr inbounds float* %tmp5370, i64 1
+ %tmp5372 = getelementptr inbounds float* %tmp5371, i64 1
+ %tmp5373 = getelementptr inbounds float* %tmp5372, i64 1
+ %tmp5374 = getelementptr inbounds float* %tmp5373, i64 1
+ %tmp5375 = getelementptr inbounds float* %tmp5374, i64 1
+ %tmp5376 = getelementptr inbounds float* %tmp5375, i64 1
+ %tmp5377 = getelementptr inbounds float* %tmp5376, i64 1
+ %tmp5378 = getelementptr inbounds float* %tmp5377, i64 1
+ %tmp5379 = getelementptr inbounds float* %tmp5378, i64 1
+ %tmp5380 = getelementptr inbounds float* %tmp5379, i64 1
+ %tmp5381 = getelementptr inbounds float* %tmp5380, i64 1
+ %tmp5382 = getelementptr inbounds float* %tmp5381, i64 1
+ %tmp5383 = getelementptr inbounds float* %tmp5382, i64 1
+ %tmp5384 = getelementptr inbounds float* %tmp5383, i64 1
+ %tmp5385 = getelementptr inbounds float* %tmp5384, i64 1
+ %tmp5386 = getelementptr inbounds float* %tmp5385, i64 1
+ %tmp5387 = getelementptr inbounds float* %tmp5386, i64 1
+ %tmp5388 = getelementptr inbounds float* %tmp5387, i64 1
+ %tmp5389 = getelementptr inbounds float* %tmp5388, i64 1
+ %tmp5390 = getelementptr inbounds float* %tmp5389, i64 1
+ %tmp5391 = getelementptr inbounds float* %tmp5390, i64 1
+ %tmp5392 = getelementptr inbounds float* %tmp5391, i64 1
+ %tmp5393 = getelementptr inbounds float* %tmp5392, i64 1
+ %tmp5394 = getelementptr inbounds float* %tmp5393, i64 1
+ %tmp5395 = getelementptr inbounds float* %tmp5394, i64 1
+ %tmp5396 = getelementptr inbounds float* %tmp5395, i64 1
+ %tmp5397 = getelementptr inbounds float* %tmp5396, i64 1
+ %tmp5398 = getelementptr inbounds float* %tmp5397, i64 1
+ %tmp5399 = getelementptr inbounds float* %tmp5398, i64 1
+ %tmp5400 = getelementptr inbounds float* %tmp5399, i64 1
+ %tmp5401 = getelementptr inbounds float* %tmp5400, i64 1
+ %tmp5402 = getelementptr inbounds float* %tmp5401, i64 1
+ %tmp5403 = getelementptr inbounds float* %tmp5402, i64 1
+ %tmp5404 = getelementptr inbounds float* %tmp5403, i64 1
+ %tmp5405 = getelementptr inbounds float* %tmp5404, i64 1
+ %tmp5406 = getelementptr inbounds float* %tmp5405, i64 1
+ %tmp5407 = getelementptr inbounds float* %tmp5406, i64 1
+ %tmp5408 = getelementptr inbounds float* %tmp5407, i64 1
+ %tmp5409 = getelementptr inbounds float* %tmp5408, i64 1
+ %tmp5410 = getelementptr inbounds float* %tmp5409, i64 1
+ %tmp5411 = getelementptr inbounds float* %tmp5410, i64 1
+ %tmp5412 = getelementptr inbounds float* %tmp5411, i64 1
+ %tmp5413 = getelementptr inbounds float* %tmp5412, i64 1
+ %tmp5414 = getelementptr inbounds float* %tmp5413, i64 1
+ %tmp5415 = getelementptr inbounds float* %tmp5414, i64 1
+ %tmp5416 = getelementptr inbounds float* %tmp5415, i64 1
+ %tmp5417 = getelementptr inbounds float* %tmp5416, i64 1
+ %tmp5418 = getelementptr inbounds float* %tmp5417, i64 1
+ %tmp5419 = getelementptr inbounds float* %tmp5418, i64 1
+ %tmp5420 = getelementptr inbounds float* %tmp5419, i64 1
+ %tmp5421 = getelementptr inbounds float* %tmp5420, i64 1
+ %tmp5422 = getelementptr inbounds float* %tmp5421, i64 1
+ %tmp5423 = getelementptr inbounds float* %tmp5422, i64 1
+ %tmp5424 = getelementptr inbounds float* %tmp5423, i64 1
+ %tmp5425 = getelementptr inbounds float* %tmp5424, i64 1
+ %tmp5426 = getelementptr inbounds float* %tmp5425, i64 1
+ %tmp5427 = getelementptr inbounds float* %tmp5426, i64 1
+ %tmp5428 = getelementptr inbounds float* %tmp5427, i64 1
+ %tmp5429 = getelementptr inbounds float* %tmp5428, i64 1
+ %tmp5430 = getelementptr inbounds float* %tmp5429, i64 1
+ %tmp5431 = getelementptr inbounds float* %tmp5430, i64 1
+ %tmp5432 = getelementptr inbounds float* %tmp5431, i64 1
+ %tmp5433 = getelementptr inbounds float* %tmp5432, i64 1
+ %tmp5434 = getelementptr inbounds float* %tmp5433, i64 1
+ %tmp5435 = getelementptr inbounds float* %tmp5434, i64 1
+ %tmp5436 = getelementptr inbounds float* %tmp5435, i64 1
+ %tmp5437 = getelementptr inbounds float* %tmp5436, i64 1
+ %tmp5438 = getelementptr inbounds float* %tmp5437, i64 1
+ %tmp5439 = getelementptr inbounds float* %tmp5438, i64 1
+ %tmp5440 = getelementptr inbounds float* %tmp5439, i64 1
+ %tmp5441 = getelementptr inbounds float* %tmp5440, i64 1
+ %tmp5442 = getelementptr inbounds float* %tmp5441, i64 1
+ %tmp5443 = getelementptr inbounds float* %tmp5442, i64 1
+ %tmp5444 = getelementptr inbounds float* %tmp5443, i64 1
+ %tmp5445 = getelementptr inbounds float* %tmp5444, i64 1
+ %tmp5446 = getelementptr inbounds float* %tmp5445, i64 1
+ %tmp5447 = getelementptr inbounds float* %tmp5446, i64 1
+ %tmp5448 = getelementptr inbounds float* %tmp5447, i64 1
+ %tmp5449 = getelementptr inbounds float* %tmp5448, i64 1
+ %tmp5450 = getelementptr inbounds float* %tmp5449, i64 1
+ %tmp5451 = getelementptr inbounds float* %tmp5450, i64 1
+ %tmp5452 = getelementptr inbounds float* %tmp5451, i64 1
+ %tmp5453 = getelementptr inbounds float* %tmp5452, i64 1
+ %tmp5454 = getelementptr inbounds float* %tmp5453, i64 1
+ %tmp5455 = getelementptr inbounds float* %tmp5454, i64 1
+ %tmp5456 = getelementptr inbounds float* %tmp5455, i64 1
+ %tmp5457 = getelementptr inbounds float* %tmp5456, i64 1
+ %tmp5458 = getelementptr inbounds float* %tmp5457, i64 1
+ %tmp5459 = getelementptr inbounds float* %tmp5458, i64 1
+ %tmp5460 = getelementptr inbounds float* %tmp5459, i64 1
+ %tmp5461 = getelementptr inbounds float* %tmp5460, i64 1
+ %tmp5462 = getelementptr inbounds float* %tmp5461, i64 1
+ %tmp5463 = getelementptr inbounds float* %tmp5462, i64 1
+ %tmp5464 = getelementptr inbounds float* %tmp5463, i64 1
+ %tmp5465 = getelementptr inbounds float* %tmp5464, i64 1
+ %tmp5466 = getelementptr inbounds float* %tmp5465, i64 1
+ %tmp5467 = getelementptr inbounds float* %tmp5466, i64 1
+ %tmp5468 = getelementptr inbounds float* %tmp5467, i64 1
+ %tmp5469 = getelementptr inbounds float* %tmp5468, i64 1
+ %tmp5470 = getelementptr inbounds float* %tmp5469, i64 1
+ %tmp5471 = getelementptr inbounds float* %tmp5470, i64 1
+ %tmp5472 = getelementptr inbounds float* %tmp5471, i64 1
+ %tmp5473 = getelementptr inbounds float* %tmp5472, i64 1
+ %tmp5474 = getelementptr inbounds float* %tmp5473, i64 1
+ %tmp5475 = getelementptr inbounds float* %tmp5474, i64 1
+ %tmp5476 = getelementptr inbounds float* %tmp5475, i64 1
+ %tmp5477 = getelementptr inbounds float* %tmp5476, i64 1
+ %tmp5478 = getelementptr inbounds float* %tmp5477, i64 1
+ %tmp5479 = getelementptr inbounds float* %tmp5478, i64 1
+ %tmp5480 = getelementptr inbounds float* %tmp5479, i64 1
+ %tmp5481 = getelementptr inbounds float* %tmp5480, i64 1
+ %tmp5482 = getelementptr inbounds float* %tmp5481, i64 1
+ %tmp5483 = getelementptr inbounds float* %tmp5482, i64 1
+ %tmp5484 = getelementptr inbounds float* %tmp5483, i64 1
+ %tmp5485 = getelementptr inbounds float* %tmp5484, i64 1
+ %tmp5486 = getelementptr inbounds float* %tmp5485, i64 1
+ %tmp5487 = getelementptr inbounds float* %tmp5486, i64 1
+ %tmp5488 = getelementptr inbounds float* %tmp5487, i64 1
+ %tmp5489 = getelementptr inbounds float* %tmp5488, i64 1
+ %tmp5490 = getelementptr inbounds float* %tmp5489, i64 1
+ %tmp5491 = getelementptr inbounds float* %tmp5490, i64 1
+ %tmp5492 = getelementptr inbounds float* %tmp5491, i64 1
+ %tmp5493 = getelementptr inbounds float* %tmp5492, i64 1
+ %tmp5494 = getelementptr inbounds float* %tmp5493, i64 1
+ %tmp5495 = getelementptr inbounds float* %tmp5494, i64 1
+ %tmp5496 = getelementptr inbounds float* %tmp5495, i64 1
+ %tmp5497 = getelementptr inbounds float* %tmp5496, i64 1
+ %tmp5498 = getelementptr inbounds float* %tmp5497, i64 1
+ %tmp5499 = getelementptr inbounds float* %tmp5498, i64 1
+ %tmp5500 = getelementptr inbounds float* %tmp5499, i64 1
+ %tmp5501 = getelementptr inbounds float* %tmp5500, i64 1
+ %tmp5502 = getelementptr inbounds float* %tmp5501, i64 1
+ %tmp5503 = getelementptr inbounds float* %tmp5502, i64 1
+ %tmp5504 = getelementptr inbounds float* %tmp5503, i64 1
+ %tmp5505 = getelementptr inbounds float* %tmp5504, i64 1
+ %tmp5506 = getelementptr inbounds float* %tmp5505, i64 1
+ %tmp5507 = getelementptr inbounds float* %tmp5506, i64 1
+ %tmp5508 = getelementptr inbounds float* %tmp5507, i64 1
+ %tmp5509 = getelementptr inbounds float* %tmp5508, i64 1
+ %tmp5510 = getelementptr inbounds float* %tmp5509, i64 1
+ %tmp5511 = getelementptr inbounds float* %tmp5510, i64 1
+ %tmp5512 = getelementptr inbounds float* %tmp5511, i64 1
+ %tmp5513 = getelementptr inbounds float* %tmp5512, i64 1
+ %tmp5514 = getelementptr inbounds float* %tmp5513, i64 1
+ %tmp5515 = getelementptr inbounds float* %tmp5514, i64 1
+ %tmp5516 = getelementptr inbounds float* %tmp5515, i64 1
+ %tmp5517 = getelementptr inbounds float* %tmp5516, i64 1
+ %tmp5518 = getelementptr inbounds float* %tmp5517, i64 1
+ %tmp5519 = getelementptr inbounds float* %tmp5518, i64 1
+ %tmp5520 = getelementptr inbounds float* %tmp5519, i64 1
+ %tmp5521 = getelementptr inbounds float* %tmp5520, i64 1
+ %tmp5522 = getelementptr inbounds float* %tmp5521, i64 1
+ %tmp5523 = getelementptr inbounds float* %tmp5522, i64 1
+ %tmp5524 = getelementptr inbounds float* %tmp5523, i64 1
+ %tmp5525 = getelementptr inbounds float* %tmp5524, i64 1
+ %tmp5526 = getelementptr inbounds float* %tmp5525, i64 1
+ %tmp5527 = getelementptr inbounds float* %tmp5526, i64 1
+ %tmp5528 = getelementptr inbounds float* %tmp5527, i64 1
+ %tmp5529 = getelementptr inbounds float* %tmp5528, i64 1
+ %tmp5530 = getelementptr inbounds float* %tmp5529, i64 1
+ %tmp5531 = getelementptr inbounds float* %tmp5530, i64 1
+ %tmp5532 = getelementptr inbounds float* %tmp5531, i64 1
+ %tmp5533 = getelementptr inbounds float* %tmp5532, i64 1
+ %tmp5534 = getelementptr inbounds float* %tmp5533, i64 1
+ %tmp5535 = getelementptr inbounds float* %tmp5534, i64 1
+ %tmp5536 = getelementptr inbounds float* %tmp5535, i64 1
+ %tmp5537 = getelementptr inbounds float* %tmp5536, i64 1
+ %tmp5538 = getelementptr inbounds float* %tmp5537, i64 1
+ %tmp5539 = getelementptr inbounds float* %tmp5538, i64 1
+ %tmp5540 = getelementptr inbounds float* %tmp5539, i64 1
+ %tmp5541 = getelementptr inbounds float* %tmp5540, i64 1
+ %tmp5542 = getelementptr inbounds float* %tmp5541, i64 1
+ %tmp5543 = getelementptr inbounds float* %tmp5542, i64 1
+ %tmp5544 = getelementptr inbounds float* %tmp5543, i64 1
+ %tmp5545 = getelementptr inbounds float* %tmp5544, i64 1
+ %tmp5546 = getelementptr inbounds float* %tmp5545, i64 1
+ %tmp5547 = getelementptr inbounds float* %tmp5546, i64 1
+ %tmp5548 = getelementptr inbounds float* %tmp5547, i64 1
+ %tmp5549 = getelementptr inbounds float* %tmp5548, i64 1
+ %tmp5550 = getelementptr inbounds float* %tmp5549, i64 1
+ %tmp5551 = getelementptr inbounds float* %tmp5550, i64 1
+ %tmp5552 = getelementptr inbounds float* %tmp5551, i64 1
+ %tmp5553 = getelementptr inbounds float* %tmp5552, i64 1
+ %tmp5554 = getelementptr inbounds float* %tmp5553, i64 1
+ %tmp5555 = getelementptr inbounds float* %tmp5554, i64 1
+ %tmp5556 = getelementptr inbounds float* %tmp5555, i64 1
+ %tmp5557 = getelementptr inbounds float* %tmp5556, i64 1
+ %tmp5558 = getelementptr inbounds float* %tmp5557, i64 1
+ %tmp5559 = getelementptr inbounds float* %tmp5558, i64 1
+ %tmp5560 = getelementptr inbounds float* %tmp5559, i64 1
+ %tmp5561 = getelementptr inbounds float* %tmp5560, i64 1
+ %tmp5562 = getelementptr inbounds float* %tmp5561, i64 1
+ %tmp5563 = getelementptr inbounds float* %tmp5562, i64 1
+ %tmp5564 = getelementptr inbounds float* %tmp5563, i64 1
+ %tmp5565 = getelementptr inbounds float* %tmp5564, i64 1
+ %tmp5566 = getelementptr inbounds float* %tmp5565, i64 1
+ %tmp5567 = getelementptr inbounds float* %tmp5566, i64 1
+ %tmp5568 = getelementptr inbounds float* %tmp5567, i64 1
+ %tmp5569 = getelementptr inbounds float* %tmp5568, i64 1
+ %tmp5570 = getelementptr inbounds float* %tmp5569, i64 1
+ %tmp5571 = getelementptr inbounds float* %tmp5570, i64 1
+ %tmp5572 = getelementptr inbounds float* %tmp5571, i64 1
+ %tmp5573 = getelementptr inbounds float* %tmp5572, i64 1
+ %tmp5574 = getelementptr inbounds float* %tmp5573, i64 1
+ %tmp5575 = getelementptr inbounds float* %tmp5574, i64 1
+ %tmp5576 = getelementptr inbounds float* %tmp5575, i64 1
+ %tmp5577 = getelementptr inbounds float* %tmp5576, i64 1
+ %tmp5578 = getelementptr inbounds float* %tmp5577, i64 1
+ %tmp5579 = getelementptr inbounds float* %tmp5578, i64 1
+ %tmp5580 = getelementptr inbounds float* %tmp5579, i64 1
+ %tmp5581 = getelementptr inbounds float* %tmp5580, i64 1
+ %tmp5582 = getelementptr inbounds float* %tmp5581, i64 1
+ %tmp5583 = getelementptr inbounds float* %tmp5582, i64 1
+ %tmp5584 = getelementptr inbounds float* %tmp5583, i64 1
+ %tmp5585 = getelementptr inbounds float* %tmp5584, i64 1
+ %tmp5586 = getelementptr inbounds float* %tmp5585, i64 1
+ %tmp5587 = getelementptr inbounds float* %tmp5586, i64 1
+ %tmp5588 = getelementptr inbounds float* %tmp5587, i64 1
+ %tmp5589 = getelementptr inbounds float* %tmp5588, i64 1
+ %tmp5590 = getelementptr inbounds float* %tmp5589, i64 1
+ %tmp5591 = getelementptr inbounds float* %tmp5590, i64 1
+ %tmp5592 = getelementptr inbounds float* %tmp5591, i64 1
+ %tmp5593 = getelementptr inbounds float* %tmp5592, i64 1
+ %tmp5594 = getelementptr inbounds float* %tmp5593, i64 1
+ %tmp5595 = getelementptr inbounds float* %tmp5594, i64 1
+ %tmp5596 = getelementptr inbounds float* %tmp5595, i64 1
+ %tmp5597 = getelementptr inbounds float* %tmp5596, i64 1
+ %tmp5598 = getelementptr inbounds float* %tmp5597, i64 1
+ %tmp5599 = getelementptr inbounds float* %tmp5598, i64 1
+ %tmp5600 = getelementptr inbounds float* %tmp5599, i64 1
+ %tmp5601 = getelementptr inbounds float* %tmp5600, i64 1
+ %tmp5602 = getelementptr inbounds float* %tmp5601, i64 1
+ %tmp5603 = getelementptr inbounds float* %tmp5602, i64 1
+ %tmp5604 = getelementptr inbounds float* %tmp5603, i64 1
+ %tmp5605 = getelementptr inbounds float* %tmp5604, i64 1
+ %tmp5606 = getelementptr inbounds float* %tmp5605, i64 1
+ %tmp5607 = getelementptr inbounds float* %tmp5606, i64 1
+ %tmp5608 = getelementptr inbounds float* %tmp5607, i64 1
+ %tmp5609 = getelementptr inbounds float* %tmp5608, i64 1
+ %tmp5610 = getelementptr inbounds float* %tmp5609, i64 1
+ %tmp5611 = getelementptr inbounds float* %tmp5610, i64 1
+ %tmp5612 = getelementptr inbounds float* %tmp5611, i64 1
+ %tmp5613 = getelementptr inbounds float* %tmp5612, i64 1
+ %tmp5614 = getelementptr inbounds float* %tmp5613, i64 1
+ %tmp5615 = getelementptr inbounds float* %tmp5614, i64 1
+ %tmp5616 = getelementptr inbounds float* %tmp5615, i64 1
+ %tmp5617 = getelementptr inbounds float* %tmp5616, i64 1
+ %tmp5618 = getelementptr inbounds float* %tmp5617, i64 1
+ %tmp5619 = getelementptr inbounds float* %tmp5618, i64 1
+ %tmp5620 = getelementptr inbounds float* %tmp5619, i64 1
+ %tmp5621 = getelementptr inbounds float* %tmp5620, i64 1
+ %tmp5622 = getelementptr inbounds float* %tmp5621, i64 1
+ %tmp5623 = getelementptr inbounds float* %tmp5622, i64 1
+ %tmp5624 = getelementptr inbounds float* %tmp5623, i64 1
+ %tmp5625 = getelementptr inbounds float* %tmp5624, i64 1
+ %tmp5626 = getelementptr inbounds float* %tmp5625, i64 1
+ %tmp5627 = getelementptr inbounds float* %tmp5626, i64 1
+ %tmp5628 = getelementptr inbounds float* %tmp5627, i64 1
+ %tmp5629 = getelementptr inbounds float* %tmp5628, i64 1
+ %tmp5630 = getelementptr inbounds float* %tmp5629, i64 1
+ %tmp5631 = getelementptr inbounds float* %tmp5630, i64 1
+ %tmp5632 = getelementptr inbounds float* %tmp5631, i64 1
+ %tmp5633 = getelementptr inbounds float* %tmp5632, i64 1
+ %tmp5634 = getelementptr inbounds float* %tmp5633, i64 1
+ %tmp5635 = getelementptr inbounds float* %tmp5634, i64 1
+ %tmp5636 = getelementptr inbounds float* %tmp5635, i64 1
+ %tmp5637 = getelementptr inbounds float* %tmp5636, i64 1
+ %tmp5638 = getelementptr inbounds float* %tmp5637, i64 1
+ %tmp5639 = getelementptr inbounds float* %tmp5638, i64 1
+ %tmp5640 = getelementptr inbounds float* %tmp5639, i64 1
+ %tmp5641 = getelementptr inbounds float* %tmp5640, i64 1
+ %tmp5642 = getelementptr inbounds float* %tmp5641, i64 1
+ %tmp5643 = getelementptr inbounds float* %tmp5642, i64 1
+ %tmp5644 = getelementptr inbounds float* %tmp5643, i64 1
+ %tmp5645 = getelementptr inbounds float* %tmp5644, i64 1
+ %tmp5646 = getelementptr inbounds float* %tmp5645, i64 1
+ %tmp5647 = getelementptr inbounds float* %tmp5646, i64 1
+ %tmp5648 = getelementptr inbounds float* %tmp5647, i64 1
+ %tmp5649 = getelementptr inbounds float* %tmp5648, i64 1
+ %tmp5650 = getelementptr inbounds float* %tmp5649, i64 1
+ %tmp5651 = getelementptr inbounds float* %tmp5650, i64 1
+ %tmp5652 = getelementptr inbounds float* %tmp5651, i64 1
+ %tmp5653 = getelementptr inbounds float* %tmp5652, i64 1
+ %tmp5654 = getelementptr inbounds float* %tmp5653, i64 1
+ %tmp5655 = getelementptr inbounds float* %tmp5654, i64 1
+ %tmp5656 = getelementptr inbounds float* %tmp5655, i64 1
+ %tmp5657 = getelementptr inbounds float* %tmp5656, i64 1
+ %tmp5658 = getelementptr inbounds float* %tmp5657, i64 1
+ %tmp5659 = getelementptr inbounds float* %tmp5658, i64 1
+ %tmp5660 = getelementptr inbounds float* %tmp5659, i64 1
+ %tmp5661 = getelementptr inbounds float* %tmp5660, i64 1
+ %tmp5662 = getelementptr inbounds float* %tmp5661, i64 1
+ %tmp5663 = getelementptr inbounds float* %tmp5662, i64 1
+ %tmp5664 = getelementptr inbounds float* %tmp5663, i64 1
+ %tmp5665 = getelementptr inbounds float* %tmp5664, i64 1
+ %tmp5666 = getelementptr inbounds float* %tmp5665, i64 1
+ %tmp5667 = getelementptr inbounds float* %tmp5666, i64 1
+ %tmp5668 = getelementptr inbounds float* %tmp5667, i64 1
+ %tmp5669 = getelementptr inbounds float* %tmp5668, i64 1
+ %tmp5670 = getelementptr inbounds float* %tmp5669, i64 1
+ %tmp5671 = getelementptr inbounds float* %tmp5670, i64 1
+ %tmp5672 = getelementptr inbounds float* %tmp5671, i64 1
+ %tmp5673 = getelementptr inbounds float* %tmp5672, i64 1
+ %tmp5674 = getelementptr inbounds float* %tmp5673, i64 1
+ %tmp5675 = getelementptr inbounds float* %tmp5674, i64 1
+ %tmp5676 = getelementptr inbounds float* %tmp5675, i64 1
+ %tmp5677 = getelementptr inbounds float* %tmp5676, i64 1
+ %tmp5678 = getelementptr inbounds float* %tmp5677, i64 1
+ %tmp5679 = getelementptr inbounds float* %tmp5678, i64 1
+ %tmp5680 = getelementptr inbounds float* %tmp5679, i64 1
+ %tmp5681 = getelementptr inbounds float* %tmp5680, i64 1
+ %tmp5682 = getelementptr inbounds float* %tmp5681, i64 1
+ %tmp5683 = getelementptr inbounds float* %tmp5682, i64 1
+ %tmp5684 = getelementptr inbounds float* %tmp5683, i64 1
+ %tmp5685 = getelementptr inbounds float* %tmp5684, i64 1
+ %tmp5686 = getelementptr inbounds float* %tmp5685, i64 1
+ %tmp5687 = getelementptr inbounds float* %tmp5686, i64 1
+ %tmp5688 = getelementptr inbounds float* %tmp5687, i64 1
+ %tmp5689 = getelementptr inbounds float* %tmp5688, i64 1
+ %tmp5690 = getelementptr inbounds float* %tmp5689, i64 1
+ %tmp5691 = getelementptr inbounds float* %tmp5690, i64 1
+ %tmp5692 = getelementptr inbounds float* %tmp5691, i64 1
+ %tmp5693 = getelementptr inbounds float* %tmp5692, i64 1
+ %tmp5694 = getelementptr inbounds float* %tmp5693, i64 1
+ %tmp5695 = getelementptr inbounds float* %tmp5694, i64 1
+ %tmp5696 = getelementptr inbounds float* %tmp5695, i64 1
+ %tmp5697 = getelementptr inbounds float* %tmp5696, i64 1
+ %tmp5698 = getelementptr inbounds float* %tmp5697, i64 1
+ %tmp5699 = getelementptr inbounds float* %tmp5698, i64 1
+ %tmp5700 = getelementptr inbounds float* %tmp5699, i64 1
+ %tmp5701 = getelementptr inbounds float* %tmp5700, i64 1
+ %tmp5702 = getelementptr inbounds float* %tmp5701, i64 1
+ %tmp5703 = getelementptr inbounds float* %tmp5702, i64 1
+ %tmp5704 = getelementptr inbounds float* %tmp5703, i64 1
+ %tmp5705 = getelementptr inbounds float* %tmp5704, i64 1
+ %tmp5706 = getelementptr inbounds float* %tmp5705, i64 1
+ %tmp5707 = getelementptr inbounds float* %tmp5706, i64 1
+ %tmp5708 = getelementptr inbounds float* %tmp5707, i64 1
+ %tmp5709 = getelementptr inbounds float* %tmp5708, i64 1
+ %tmp5710 = getelementptr inbounds float* %tmp5709, i64 1
+ %tmp5711 = getelementptr inbounds float* %tmp5710, i64 1
+ %tmp5712 = getelementptr inbounds float* %tmp5711, i64 1
+ %tmp5713 = getelementptr inbounds float* %tmp5712, i64 1
+ %tmp5714 = getelementptr inbounds float* %tmp5713, i64 1
+ %tmp5715 = getelementptr inbounds float* %tmp5714, i64 1
+ %tmp5716 = getelementptr inbounds float* %tmp5715, i64 1
+ %tmp5717 = getelementptr inbounds float* %tmp5716, i64 1
+ %tmp5718 = getelementptr inbounds float* %tmp5717, i64 1
+ %tmp5719 = getelementptr inbounds float* %tmp5718, i64 1
+ %tmp5720 = getelementptr inbounds float* %tmp5719, i64 1
+ %tmp5721 = getelementptr inbounds float* %tmp5720, i64 1
+ %tmp5722 = getelementptr inbounds float* %tmp5721, i64 1
+ %tmp5723 = getelementptr inbounds float* %tmp5722, i64 1
+ %tmp5724 = getelementptr inbounds float* %tmp5723, i64 1
+ %tmp5725 = getelementptr inbounds float* %tmp5724, i64 1
+ %tmp5726 = getelementptr inbounds float* %tmp5725, i64 1
+ %tmp5727 = getelementptr inbounds float* %tmp5726, i64 1
+ %tmp5728 = getelementptr inbounds float* %tmp5727, i64 1
+ %tmp5729 = getelementptr inbounds float* %tmp5728, i64 1
+ %tmp5730 = getelementptr inbounds float* %tmp5729, i64 1
+ %tmp5731 = getelementptr inbounds float* %tmp5730, i64 1
+ %tmp5732 = getelementptr inbounds float* %tmp5731, i64 1
+ %tmp5733 = getelementptr inbounds float* %tmp5732, i64 1
+ %tmp5734 = getelementptr inbounds float* %tmp5733, i64 1
+ %tmp5735 = getelementptr inbounds float* %tmp5734, i64 1
+ %tmp5736 = getelementptr inbounds float* %tmp5735, i64 1
+ %tmp5737 = getelementptr inbounds float* %tmp5736, i64 1
+ %tmp5738 = getelementptr inbounds float* %tmp5737, i64 1
+ %tmp5739 = getelementptr inbounds float* %tmp5738, i64 1
+ %tmp5740 = getelementptr inbounds float* %tmp5739, i64 1
+ %tmp5741 = getelementptr inbounds float* %tmp5740, i64 1
+ %tmp5742 = getelementptr inbounds float* %tmp5741, i64 1
+ %tmp5743 = getelementptr inbounds float* %tmp5742, i64 1
+ %tmp5744 = getelementptr inbounds float* %tmp5743, i64 1
+ %tmp5745 = getelementptr inbounds float* %tmp5744, i64 1
+ %tmp5746 = getelementptr inbounds float* %tmp5745, i64 1
+ %tmp5747 = getelementptr inbounds float* %tmp5746, i64 1
+ %tmp5748 = getelementptr inbounds float* %tmp5747, i64 1
+ %tmp5749 = getelementptr inbounds float* %tmp5748, i64 1
+ %tmp5750 = getelementptr inbounds float* %tmp5749, i64 1
+ %tmp5751 = getelementptr inbounds float* %tmp5750, i64 1
+ %tmp5752 = getelementptr inbounds float* %tmp5751, i64 1
+ %tmp5753 = getelementptr inbounds float* %tmp5752, i64 1
+ %tmp5754 = getelementptr inbounds float* %tmp5753, i64 1
+ %tmp5755 = getelementptr inbounds float* %tmp5754, i64 1
+ %tmp5756 = getelementptr inbounds float* %tmp5755, i64 1
+ %tmp5757 = getelementptr inbounds float* %tmp5756, i64 1
+ %tmp5758 = getelementptr inbounds float* %tmp5757, i64 1
+ %tmp5759 = getelementptr inbounds float* %tmp5758, i64 1
+ %tmp5760 = getelementptr inbounds float* %tmp5759, i64 1
+ %tmp5761 = getelementptr inbounds float* %tmp5760, i64 1
+ %tmp5762 = getelementptr inbounds float* %tmp5761, i64 1
+ %tmp5763 = getelementptr inbounds float* %tmp5762, i64 1
+ %tmp5764 = getelementptr inbounds float* %tmp5763, i64 1
+ %tmp5765 = getelementptr inbounds float* %tmp5764, i64 1
+ %tmp5766 = getelementptr inbounds float* %tmp5765, i64 1
+ %tmp5767 = getelementptr inbounds float* %tmp5766, i64 1
+ %tmp5768 = getelementptr inbounds float* %tmp5767, i64 1
+ %tmp5769 = getelementptr inbounds float* %tmp5768, i64 1
+ %tmp5770 = getelementptr inbounds float* %tmp5769, i64 1
+ %tmp5771 = getelementptr inbounds float* %tmp5770, i64 1
+ %tmp5772 = getelementptr inbounds float* %tmp5771, i64 1
+ %tmp5773 = getelementptr inbounds float* %tmp5772, i64 1
+ %tmp5774 = getelementptr inbounds float* %tmp5773, i64 1
+ %tmp5775 = getelementptr inbounds float* %tmp5774, i64 1
+ %tmp5776 = getelementptr inbounds float* %tmp5775, i64 1
+ %tmp5777 = getelementptr inbounds float* %tmp5776, i64 1
+ %tmp5778 = getelementptr inbounds float* %tmp5777, i64 1
+ %tmp5779 = getelementptr inbounds float* %tmp5778, i64 1
+ %tmp5780 = getelementptr inbounds float* %tmp5779, i64 1
+ %tmp5781 = getelementptr inbounds float* %tmp5780, i64 1
+ %tmp5782 = getelementptr inbounds float* %tmp5781, i64 1
+ %tmp5783 = getelementptr inbounds float* %tmp5782, i64 1
+ %tmp5784 = getelementptr inbounds float* %tmp5783, i64 1
+ %tmp5785 = getelementptr inbounds float* %tmp5784, i64 1
+ %tmp5786 = getelementptr inbounds float* %tmp5785, i64 1
+ %tmp5787 = getelementptr inbounds float* %tmp5786, i64 1
+ %tmp5788 = getelementptr inbounds float* %tmp5787, i64 1
+ %tmp5789 = getelementptr inbounds float* %tmp5788, i64 1
+ %tmp5790 = getelementptr inbounds float* %tmp5789, i64 1
+ %tmp5791 = getelementptr inbounds float* %tmp5790, i64 1
+ %tmp5792 = getelementptr inbounds float* %tmp5791, i64 1
+ %tmp5793 = getelementptr inbounds float* %tmp5792, i64 1
+ %tmp5794 = getelementptr inbounds float* %tmp5793, i64 1
+ %tmp5795 = getelementptr inbounds float* %tmp5794, i64 1
+ %tmp5796 = getelementptr inbounds float* %tmp5795, i64 1
+ %tmp5797 = getelementptr inbounds float* %tmp5796, i64 1
+ %tmp5798 = getelementptr inbounds float* %tmp5797, i64 1
+ %tmp5799 = getelementptr inbounds float* %tmp5798, i64 1
+ %tmp5800 = getelementptr inbounds float* %tmp5799, i64 1
+ %tmp5801 = getelementptr inbounds float* %tmp5800, i64 1
+ %tmp5802 = getelementptr inbounds float* %tmp5801, i64 1
+ %tmp5803 = getelementptr inbounds float* %tmp5802, i64 1
+ %tmp5804 = getelementptr inbounds float* %tmp5803, i64 1
+ %tmp5805 = getelementptr inbounds float* %tmp5804, i64 1
+ %tmp5806 = getelementptr inbounds float* %tmp5805, i64 1
+ %tmp5807 = getelementptr inbounds float* %tmp5806, i64 1
+ %tmp5808 = getelementptr inbounds float* %tmp5807, i64 1
+ %tmp5809 = getelementptr inbounds float* %tmp5808, i64 1
+ %tmp5810 = getelementptr inbounds float* %tmp5809, i64 1
+ %tmp5811 = getelementptr inbounds float* %tmp5810, i64 1
+ %tmp5812 = getelementptr inbounds float* %tmp5811, i64 1
+ %tmp5813 = getelementptr inbounds float* %tmp5812, i64 1
+ %tmp5814 = getelementptr inbounds float* %tmp5813, i64 1
+ %tmp5815 = getelementptr inbounds float* %tmp5814, i64 1
+ %tmp5816 = getelementptr inbounds float* %tmp5815, i64 1
+ %tmp5817 = getelementptr inbounds float* %tmp5816, i64 1
+ %tmp5818 = getelementptr inbounds float* %tmp5817, i64 1
+ %tmp5819 = getelementptr inbounds float* %tmp5818, i64 1
+ %tmp5820 = getelementptr inbounds float* %tmp5819, i64 1
+ %tmp5821 = getelementptr inbounds float* %tmp5820, i64 1
+ %tmp5822 = getelementptr inbounds float* %tmp5821, i64 1
+ %tmp5823 = getelementptr inbounds float* %tmp5822, i64 1
+ %tmp5824 = getelementptr inbounds float* %tmp5823, i64 1
+ %tmp5825 = getelementptr inbounds float* %tmp5824, i64 1
+ %tmp5826 = getelementptr inbounds float* %tmp5825, i64 1
+ %tmp5827 = getelementptr inbounds float* %tmp5826, i64 1
+ %tmp5828 = getelementptr inbounds float* %tmp5827, i64 1
+ %tmp5829 = getelementptr inbounds float* %tmp5828, i64 1
+ %tmp5830 = getelementptr inbounds float* %tmp5829, i64 1
+ %tmp5831 = getelementptr inbounds float* %tmp5830, i64 1
+ %tmp5832 = getelementptr inbounds float* %tmp5831, i64 1
+ %tmp5833 = getelementptr inbounds float* %tmp5832, i64 1
+ %tmp5834 = getelementptr inbounds float* %tmp5833, i64 1
+ %tmp5835 = getelementptr inbounds float* %tmp5834, i64 1
+ %tmp5836 = getelementptr inbounds float* %tmp5835, i64 1
+ %tmp5837 = getelementptr inbounds float* %tmp5836, i64 1
+ %tmp5838 = getelementptr inbounds float* %tmp5837, i64 1
+ %tmp5839 = getelementptr inbounds float* %tmp5838, i64 1
+ %tmp5840 = getelementptr inbounds float* %tmp5839, i64 1
+ %tmp5841 = getelementptr inbounds float* %tmp5840, i64 1
+ %tmp5842 = getelementptr inbounds float* %tmp5841, i64 1
+ %tmp5843 = getelementptr inbounds float* %tmp5842, i64 1
+ %tmp5844 = getelementptr inbounds float* %tmp5843, i64 1
+ %tmp5845 = getelementptr inbounds float* %tmp5844, i64 1
+ %tmp5846 = getelementptr inbounds float* %tmp5845, i64 1
+ %tmp5847 = getelementptr inbounds float* %tmp5846, i64 1
+ %tmp5848 = getelementptr inbounds float* %tmp5847, i64 1
+ %tmp5849 = getelementptr inbounds float* %tmp5848, i64 1
+ %tmp5850 = getelementptr inbounds float* %tmp5849, i64 1
+ %tmp5851 = getelementptr inbounds float* %tmp5850, i64 1
+ %tmp5852 = getelementptr inbounds float* %tmp5851, i64 1
+ %tmp5853 = getelementptr inbounds float* %tmp5852, i64 1
+ %tmp5854 = getelementptr inbounds float* %tmp5853, i64 1
+ %tmp5855 = getelementptr inbounds float* %tmp5854, i64 1
+ %tmp5856 = getelementptr inbounds float* %tmp5855, i64 1
+ %tmp5857 = getelementptr inbounds float* %tmp5856, i64 1
+ %tmp5858 = getelementptr inbounds float* %tmp5857, i64 1
+ %tmp5859 = getelementptr inbounds float* %tmp5858, i64 1
+ %tmp5860 = getelementptr inbounds float* %tmp5859, i64 1
+ %tmp5861 = getelementptr inbounds float* %tmp5860, i64 1
+ %tmp5862 = getelementptr inbounds float* %tmp5861, i64 1
+ %tmp5863 = getelementptr inbounds float* %tmp5862, i64 1
+ %tmp5864 = getelementptr inbounds float* %tmp5863, i64 1
+ %tmp5865 = getelementptr inbounds float* %tmp5864, i64 1
+ %tmp5866 = getelementptr inbounds float* %tmp5865, i64 1
+ %tmp5867 = getelementptr inbounds float* %tmp5866, i64 1
+ %tmp5868 = getelementptr inbounds float* %tmp5867, i64 1
+ %tmp5869 = getelementptr inbounds float* %tmp5868, i64 1
+ %tmp5870 = getelementptr inbounds float* %tmp5869, i64 1
+ %tmp5871 = getelementptr inbounds float* %tmp5870, i64 1
+ %tmp5872 = getelementptr inbounds float* %tmp5871, i64 1
+ %tmp5873 = getelementptr inbounds float* %tmp5872, i64 1
+ %tmp5874 = getelementptr inbounds float* %tmp5873, i64 1
+ %tmp5875 = getelementptr inbounds float* %tmp5874, i64 1
+ %tmp5876 = getelementptr inbounds float* %tmp5875, i64 1
+ %tmp5877 = getelementptr inbounds float* %tmp5876, i64 1
+ %tmp5878 = getelementptr inbounds float* %tmp5877, i64 1
+ %tmp5879 = getelementptr inbounds float* %tmp5878, i64 1
+ %tmp5880 = getelementptr inbounds float* %tmp5879, i64 1
+ %tmp5881 = getelementptr inbounds float* %tmp5880, i64 1
+ %tmp5882 = getelementptr inbounds float* %tmp5881, i64 1
+ %tmp5883 = getelementptr inbounds float* %tmp5882, i64 1
+ %tmp5884 = getelementptr inbounds float* %tmp5883, i64 1
+ %tmp5885 = getelementptr inbounds float* %tmp5884, i64 1
+ %tmp5886 = getelementptr inbounds float* %tmp5885, i64 1
+ %tmp5887 = getelementptr inbounds float* %tmp5886, i64 1
+ %tmp5888 = getelementptr inbounds float* %tmp5887, i64 1
+ %tmp5889 = getelementptr inbounds float* %tmp5888, i64 1
+ %tmp5890 = getelementptr inbounds float* %tmp5889, i64 1
+ %tmp5891 = getelementptr inbounds float* %tmp5890, i64 1
+ %tmp5892 = getelementptr inbounds float* %tmp5891, i64 1
+ %tmp5893 = getelementptr inbounds float* %tmp5892, i64 1
+ %tmp5894 = getelementptr inbounds float* %tmp5893, i64 1
+ %tmp5895 = getelementptr inbounds float* %tmp5894, i64 1
+ %tmp5896 = getelementptr inbounds float* %tmp5895, i64 1
+ %tmp5897 = getelementptr inbounds float* %tmp5896, i64 1
+ %tmp5898 = getelementptr inbounds float* %tmp5897, i64 1
+ %tmp5899 = getelementptr inbounds float* %tmp5898, i64 1
+ %tmp5900 = getelementptr inbounds float* %tmp5899, i64 1
+ %tmp5901 = getelementptr inbounds float* %tmp5900, i64 1
+ %tmp5902 = getelementptr inbounds float* %tmp5901, i64 1
+ %tmp5903 = getelementptr inbounds float* %tmp5902, i64 1
+ %tmp5904 = getelementptr inbounds float* %tmp5903, i64 1
+ %tmp5905 = getelementptr inbounds float* %tmp5904, i64 1
+ %tmp5906 = getelementptr inbounds float* %tmp5905, i64 1
+ %tmp5907 = getelementptr inbounds float* %tmp5906, i64 1
+ %tmp5908 = getelementptr inbounds float* %tmp5907, i64 1
+ %tmp5909 = getelementptr inbounds float* %tmp5908, i64 1
+ %tmp5910 = getelementptr inbounds float* %tmp5909, i64 1
+ %tmp5911 = getelementptr inbounds float* %tmp5910, i64 1
+ %tmp5912 = getelementptr inbounds float* %tmp5911, i64 1
+ %tmp5913 = getelementptr inbounds float* %tmp5912, i64 1
+ %tmp5914 = getelementptr inbounds float* %tmp5913, i64 1
+ %tmp5915 = getelementptr inbounds float* %tmp5914, i64 1
+ %tmp5916 = getelementptr inbounds float* %tmp5915, i64 1
+ %tmp5917 = getelementptr inbounds float* %tmp5916, i64 1
+ %tmp5918 = getelementptr inbounds float* %tmp5917, i64 1
+ %tmp5919 = getelementptr inbounds float* %tmp5918, i64 1
+ %tmp5920 = getelementptr inbounds float* %tmp5919, i64 1
+ %tmp5921 = getelementptr inbounds float* %tmp5920, i64 1
+ %tmp5922 = getelementptr inbounds float* %tmp5921, i64 1
+ %tmp5923 = getelementptr inbounds float* %tmp5922, i64 1
+ %tmp5924 = getelementptr inbounds float* %tmp5923, i64 1
+ %tmp5925 = getelementptr inbounds float* %tmp5924, i64 1
+ %tmp5926 = getelementptr inbounds float* %tmp5925, i64 1
+ %tmp5927 = getelementptr inbounds float* %tmp5926, i64 1
+ %tmp5928 = getelementptr inbounds float* %tmp5927, i64 1
+ %tmp5929 = getelementptr inbounds float* %tmp5928, i64 1
+ %tmp5930 = getelementptr inbounds float* %tmp5929, i64 1
+ %tmp5931 = getelementptr inbounds float* %tmp5930, i64 1
+ %tmp5932 = getelementptr inbounds float* %tmp5931, i64 1
+ %tmp5933 = getelementptr inbounds float* %tmp5932, i64 1
+ %tmp5934 = getelementptr inbounds float* %tmp5933, i64 1
+ %tmp5935 = getelementptr inbounds float* %tmp5934, i64 1
+ %tmp5936 = getelementptr inbounds float* %tmp5935, i64 1
+ %tmp5937 = getelementptr inbounds float* %tmp5936, i64 1
+ %tmp5938 = getelementptr inbounds float* %tmp5937, i64 1
+ %tmp5939 = getelementptr inbounds float* %tmp5938, i64 1
+ %tmp5940 = getelementptr inbounds float* %tmp5939, i64 1
+ %tmp5941 = getelementptr inbounds float* %tmp5940, i64 1
+ %tmp5942 = getelementptr inbounds float* %tmp5941, i64 1
+ %tmp5943 = getelementptr inbounds float* %tmp5942, i64 1
+ %tmp5944 = getelementptr inbounds float* %tmp5943, i64 1
+ %tmp5945 = getelementptr inbounds float* %tmp5944, i64 1
+ %tmp5946 = getelementptr inbounds float* %tmp5945, i64 1
+ %tmp5947 = getelementptr inbounds float* %tmp5946, i64 1
+ %tmp5948 = getelementptr inbounds float* %tmp5947, i64 1
+ %tmp5949 = getelementptr inbounds float* %tmp5948, i64 1
+ %tmp5950 = getelementptr inbounds float* %tmp5949, i64 1
+ %tmp5951 = getelementptr inbounds float* %tmp5950, i64 1
+ %tmp5952 = getelementptr inbounds float* %tmp5951, i64 1
+ %tmp5953 = getelementptr inbounds float* %tmp5952, i64 1
+ %tmp5954 = getelementptr inbounds float* %tmp5953, i64 1
+ %tmp5955 = getelementptr inbounds float* %tmp5954, i64 1
+ %tmp5956 = getelementptr inbounds float* %tmp5955, i64 1
+ %tmp5957 = getelementptr inbounds float* %tmp5956, i64 1
+ %tmp5958 = getelementptr inbounds float* %tmp5957, i64 1
+ %tmp5959 = getelementptr inbounds float* %tmp5958, i64 1
+ %tmp5960 = getelementptr inbounds float* %tmp5959, i64 1
+ %tmp5961 = getelementptr inbounds float* %tmp5960, i64 1
+ %tmp5962 = getelementptr inbounds float* %tmp5961, i64 1
+ %tmp5963 = getelementptr inbounds float* %tmp5962, i64 1
+ %tmp5964 = getelementptr inbounds float* %tmp5963, i64 1
+ %tmp5965 = getelementptr inbounds float* %tmp5964, i64 1
+ %tmp5966 = getelementptr inbounds float* %tmp5965, i64 1
+ %tmp5967 = getelementptr inbounds float* %tmp5966, i64 1
+ %tmp5968 = getelementptr inbounds float* %tmp5967, i64 1
+ %tmp5969 = getelementptr inbounds float* %tmp5968, i64 1
+ %tmp5970 = getelementptr inbounds float* %tmp5969, i64 1
+ %tmp5971 = getelementptr inbounds float* %tmp5970, i64 1
+ %tmp5972 = getelementptr inbounds float* %tmp5971, i64 1
+ %tmp5973 = getelementptr inbounds float* %tmp5972, i64 1
+ %tmp5974 = getelementptr inbounds float* %tmp5973, i64 1
+ %tmp5975 = getelementptr inbounds float* %tmp5974, i64 1
+ %tmp5976 = getelementptr inbounds float* %tmp5975, i64 1
+ %tmp5977 = getelementptr inbounds float* %tmp5976, i64 1
+ %tmp5978 = getelementptr inbounds float* %tmp5977, i64 1
+ %tmp5979 = getelementptr inbounds float* %tmp5978, i64 1
+ %tmp5980 = getelementptr inbounds float* %tmp5979, i64 1
+ %tmp5981 = getelementptr inbounds float* %tmp5980, i64 1
+ %tmp5982 = getelementptr inbounds float* %tmp5981, i64 1
+ %tmp5983 = getelementptr inbounds float* %tmp5982, i64 1
+ %tmp5984 = getelementptr inbounds float* %tmp5983, i64 1
+ %tmp5985 = getelementptr inbounds float* %tmp5984, i64 1
+ %tmp5986 = getelementptr inbounds float* %tmp5985, i64 1
+ %tmp5987 = getelementptr inbounds float* %tmp5986, i64 1
+ %tmp5988 = getelementptr inbounds float* %tmp5987, i64 1
+ %tmp5989 = getelementptr inbounds float* %tmp5988, i64 1
+ %tmp5990 = getelementptr inbounds float* %tmp5989, i64 1
+ %tmp5991 = getelementptr inbounds float* %tmp5990, i64 1
+ %tmp5992 = getelementptr inbounds float* %tmp5991, i64 1
+ %tmp5993 = getelementptr inbounds float* %tmp5992, i64 1
+ %tmp5994 = getelementptr inbounds float* %tmp5993, i64 1
+ %tmp5995 = getelementptr inbounds float* %tmp5994, i64 1
+ %tmp5996 = getelementptr inbounds float* %tmp5995, i64 1
+ %tmp5997 = getelementptr inbounds float* %tmp5996, i64 1
+ %tmp5998 = getelementptr inbounds float* %tmp5997, i64 1
+ %tmp5999 = getelementptr inbounds float* %tmp5998, i64 1
+ %tmp6000 = getelementptr inbounds float* %tmp5999, i64 1
+ %tmp6001 = getelementptr inbounds float* %tmp6000, i64 1
+ %tmp6002 = getelementptr inbounds float* %tmp6001, i64 1
+ %tmp6003 = getelementptr inbounds float* %tmp6002, i64 1
+ %tmp6004 = getelementptr inbounds float* %tmp6003, i64 1
+ %tmp6005 = getelementptr inbounds float* %tmp6004, i64 1
+ %tmp6006 = getelementptr inbounds float* %tmp6005, i64 1
+ %tmp6007 = getelementptr inbounds float* %tmp6006, i64 1
+ %tmp6008 = getelementptr inbounds float* %tmp6007, i64 1
+ %tmp6009 = getelementptr inbounds float* %tmp6008, i64 1
+ %tmp6010 = getelementptr inbounds float* %tmp6009, i64 1
+ %tmp6011 = getelementptr inbounds float* %tmp6010, i64 1
+ %tmp6012 = getelementptr inbounds float* %tmp6011, i64 1
+ %tmp6013 = getelementptr inbounds float* %tmp6012, i64 1
+ %tmp6014 = getelementptr inbounds float* %tmp6013, i64 1
+ %tmp6015 = getelementptr inbounds float* %tmp6014, i64 1
+ %tmp6016 = getelementptr inbounds float* %tmp6015, i64 1
+ %tmp6017 = getelementptr inbounds float* %tmp6016, i64 1
+ %tmp6018 = getelementptr inbounds float* %tmp6017, i64 1
+ %tmp6019 = getelementptr inbounds float* %tmp6018, i64 1
+ %tmp6020 = getelementptr inbounds float* %tmp6019, i64 1
+ %tmp6021 = getelementptr inbounds float* %tmp6020, i64 1
+ %tmp6022 = getelementptr inbounds float* %tmp6021, i64 1
+ %tmp6023 = getelementptr inbounds float* %tmp6022, i64 1
+ %tmp6024 = getelementptr inbounds float* %tmp6023, i64 1
+ %tmp6025 = getelementptr inbounds float* %tmp6024, i64 1
+ %tmp6026 = getelementptr inbounds float* %tmp6025, i64 1
+ %tmp6027 = getelementptr inbounds float* %tmp6026, i64 1
+ %tmp6028 = getelementptr inbounds float* %tmp6027, i64 1
+ %tmp6029 = getelementptr inbounds float* %tmp6028, i64 1
+ %tmp6030 = getelementptr inbounds float* %tmp6029, i64 1
+ %tmp6031 = getelementptr inbounds float* %tmp6030, i64 1
+ %tmp6032 = getelementptr inbounds float* %tmp6031, i64 1
+ %tmp6033 = getelementptr inbounds float* %tmp6032, i64 1
+ %tmp6034 = getelementptr inbounds float* %tmp6033, i64 1
+ %tmp6035 = getelementptr inbounds float* %tmp6034, i64 1
+ %tmp6036 = getelementptr inbounds float* %tmp6035, i64 1
+ %tmp6037 = getelementptr inbounds float* %tmp6036, i64 1
+ %tmp6038 = getelementptr inbounds float* %tmp6037, i64 1
+ %tmp6039 = getelementptr inbounds float* %tmp6038, i64 1
+ %tmp6040 = getelementptr inbounds float* %tmp6039, i64 1
+ %tmp6041 = getelementptr inbounds float* %tmp6040, i64 1
+ %tmp6042 = getelementptr inbounds float* %tmp6041, i64 1
+ %tmp6043 = getelementptr inbounds float* %tmp6042, i64 1
+ %tmp6044 = getelementptr inbounds float* %tmp6043, i64 1
+ %tmp6045 = getelementptr inbounds float* %tmp6044, i64 1
+ %tmp6046 = getelementptr inbounds float* %tmp6045, i64 1
+ %tmp6047 = getelementptr inbounds float* %tmp6046, i64 1
+ %tmp6048 = getelementptr inbounds float* %tmp6047, i64 1
+ %tmp6049 = getelementptr inbounds float* %tmp6048, i64 1
+ %tmp6050 = getelementptr inbounds float* %tmp6049, i64 1
+ %tmp6051 = getelementptr inbounds float* %tmp6050, i64 1
+ %tmp6052 = getelementptr inbounds float* %tmp6051, i64 1
+ %tmp6053 = getelementptr inbounds float* %tmp6052, i64 1
+ %tmp6054 = getelementptr inbounds float* %tmp6053, i64 1
+ %tmp6055 = getelementptr inbounds float* %tmp6054, i64 1
+ %tmp6056 = getelementptr inbounds float* %tmp6055, i64 1
+ %tmp6057 = getelementptr inbounds float* %tmp6056, i64 1
+ %tmp6058 = getelementptr inbounds float* %tmp6057, i64 1
+ %tmp6059 = getelementptr inbounds float* %tmp6058, i64 1
+ %tmp6060 = getelementptr inbounds float* %tmp6059, i64 1
+ %tmp6061 = getelementptr inbounds float* %tmp6060, i64 1
+ %tmp6062 = getelementptr inbounds float* %tmp6061, i64 1
+ %tmp6063 = getelementptr inbounds float* %tmp6062, i64 1
+ %tmp6064 = getelementptr inbounds float* %tmp6063, i64 1
+ %tmp6065 = getelementptr inbounds float* %tmp6064, i64 1
+ %tmp6066 = getelementptr inbounds float* %tmp6065, i64 1
+ %tmp6067 = getelementptr inbounds float* %tmp6066, i64 1
+ %tmp6068 = getelementptr inbounds float* %tmp6067, i64 1
+ %tmp6069 = getelementptr inbounds float* %tmp6068, i64 1
+ %tmp6070 = getelementptr inbounds float* %tmp6069, i64 1
+ %tmp6071 = getelementptr inbounds float* %tmp6070, i64 1
+ %tmp6072 = getelementptr inbounds float* %tmp6071, i64 1
+ %tmp6073 = getelementptr inbounds float* %tmp6072, i64 1
+ %tmp6074 = getelementptr inbounds float* %tmp6073, i64 1
+ %tmp6075 = getelementptr inbounds float* %tmp6074, i64 1
+ %tmp6076 = getelementptr inbounds float* %tmp6075, i64 1
+ %tmp6077 = getelementptr inbounds float* %tmp6076, i64 1
+ %tmp6078 = getelementptr inbounds float* %tmp6077, i64 1
+ %tmp6079 = getelementptr inbounds float* %tmp6078, i64 1
+ %tmp6080 = getelementptr inbounds float* %tmp6079, i64 1
+ %tmp6081 = getelementptr inbounds float* %tmp6080, i64 1
+ %tmp6082 = getelementptr inbounds float* %tmp6081, i64 1
+ %tmp6083 = getelementptr inbounds float* %tmp6082, i64 1
+ %tmp6084 = getelementptr inbounds float* %tmp6083, i64 1
+ %tmp6085 = getelementptr inbounds float* %tmp6084, i64 1
+ %tmp6086 = getelementptr inbounds float* %tmp6085, i64 1
+ %tmp6087 = getelementptr inbounds float* %tmp6086, i64 1
+ %tmp6088 = getelementptr inbounds float* %tmp6087, i64 1
+ %tmp6089 = getelementptr inbounds float* %tmp6088, i64 1
+ %tmp6090 = getelementptr inbounds float* %tmp6089, i64 1
+ %tmp6091 = getelementptr inbounds float* %tmp6090, i64 1
+ %tmp6092 = getelementptr inbounds float* %tmp6091, i64 1
+ %tmp6093 = getelementptr inbounds float* %tmp6092, i64 1
+ %tmp6094 = getelementptr inbounds float* %tmp6093, i64 1
+ %tmp6095 = getelementptr inbounds float* %tmp6094, i64 1
+ %tmp6096 = getelementptr inbounds float* %tmp6095, i64 1
+ %tmp6097 = getelementptr inbounds float* %tmp6096, i64 1
+ %tmp6098 = getelementptr inbounds float* %tmp6097, i64 1
+ %tmp6099 = getelementptr inbounds float* %tmp6098, i64 1
+ %tmp6100 = getelementptr inbounds float* %tmp6099, i64 1
+ %tmp6101 = getelementptr inbounds float* %tmp6100, i64 1
+ %tmp6102 = getelementptr inbounds float* %tmp6101, i64 1
+ %tmp6103 = getelementptr inbounds float* %tmp6102, i64 1
+ %tmp6104 = getelementptr inbounds float* %tmp6103, i64 1
+ %tmp6105 = getelementptr inbounds float* %tmp6104, i64 1
+ %tmp6106 = getelementptr inbounds float* %tmp6105, i64 1
+ %tmp6107 = getelementptr inbounds float* %tmp6106, i64 1
+ %tmp6108 = getelementptr inbounds float* %tmp6107, i64 1
+ %tmp6109 = getelementptr inbounds float* %tmp6108, i64 1
+ %tmp6110 = getelementptr inbounds float* %tmp6109, i64 1
+ %tmp6111 = getelementptr inbounds float* %tmp6110, i64 1
+ %tmp6112 = getelementptr inbounds float* %tmp6111, i64 1
+ %tmp6113 = getelementptr inbounds float* %tmp6112, i64 1
+ %tmp6114 = getelementptr inbounds float* %tmp6113, i64 1
+ %tmp6115 = getelementptr inbounds float* %tmp6114, i64 1
+ %tmp6116 = getelementptr inbounds float* %tmp6115, i64 1
+ %tmp6117 = getelementptr inbounds float* %tmp6116, i64 1
+ %tmp6118 = getelementptr inbounds float* %tmp6117, i64 1
+ %tmp6119 = getelementptr inbounds float* %tmp6118, i64 1
+ %tmp6120 = getelementptr inbounds float* %tmp6119, i64 1
+ %tmp6121 = getelementptr inbounds float* %tmp6120, i64 1
+ %tmp6122 = getelementptr inbounds float* %tmp6121, i64 1
+ %tmp6123 = getelementptr inbounds float* %tmp6122, i64 1
+ %tmp6124 = getelementptr inbounds float* %tmp6123, i64 1
+ %tmp6125 = getelementptr inbounds float* %tmp6124, i64 1
+ %tmp6126 = getelementptr inbounds float* %tmp6125, i64 1
+ %tmp6127 = getelementptr inbounds float* %tmp6126, i64 1
+ %tmp6128 = getelementptr inbounds float* %tmp6127, i64 1
+ %tmp6129 = getelementptr inbounds float* %tmp6128, i64 1
+ %tmp6130 = getelementptr inbounds float* %tmp6129, i64 1
+ %tmp6131 = getelementptr inbounds float* %tmp6130, i64 1
+ %tmp6132 = getelementptr inbounds float* %tmp6131, i64 1
+ %tmp6133 = getelementptr inbounds float* %tmp6132, i64 1
+ %tmp6134 = getelementptr inbounds float* %tmp6133, i64 1
+ %tmp6135 = getelementptr inbounds float* %tmp6134, i64 1
+ %tmp6136 = getelementptr inbounds float* %tmp6135, i64 1
+ %tmp6137 = getelementptr inbounds float* %tmp6136, i64 1
+ %tmp6138 = getelementptr inbounds float* %tmp6137, i64 1
+ %tmp6139 = getelementptr inbounds float* %tmp6138, i64 1
+ %tmp6140 = getelementptr inbounds float* %tmp6139, i64 1
+ %tmp6141 = getelementptr inbounds float* %tmp6140, i64 1
+ %tmp6142 = getelementptr inbounds float* %tmp6141, i64 1
+ %tmp6143 = getelementptr inbounds float* %tmp6142, i64 1
+ %tmp6144 = getelementptr inbounds float* %tmp6143, i64 1
+ %tmp6145 = getelementptr inbounds float* %tmp6144, i64 1
+ %tmp6146 = getelementptr inbounds float* %tmp6145, i64 1
+ %tmp6147 = getelementptr inbounds float* %tmp6146, i64 1
+ %tmp6148 = getelementptr inbounds float* %tmp6147, i64 1
+ %tmp6149 = getelementptr inbounds float* %tmp6148, i64 1
+ %tmp6150 = getelementptr inbounds float* %tmp6149, i64 1
+ %tmp6151 = getelementptr inbounds float* %tmp6150, i64 1
+ %tmp6152 = getelementptr inbounds float* %tmp6151, i64 1
+ %tmp6153 = getelementptr inbounds float* %tmp6152, i64 1
+ %tmp6154 = getelementptr inbounds float* %tmp6153, i64 1
+ %tmp6155 = getelementptr inbounds float* %tmp6154, i64 1
+ %tmp6156 = getelementptr inbounds float* %tmp6155, i64 1
+ %tmp6157 = getelementptr inbounds float* %tmp6156, i64 1
+ %tmp6158 = getelementptr inbounds float* %tmp6157, i64 1
+ %tmp6159 = getelementptr inbounds float* %tmp6158, i64 1
+ %tmp6160 = getelementptr inbounds float* %tmp6159, i64 1
+ %tmp6161 = getelementptr inbounds float* %tmp6160, i64 1
+ %tmp6162 = getelementptr inbounds float* %tmp6161, i64 1
+ %tmp6163 = getelementptr inbounds float* %tmp6162, i64 1
+ %tmp6164 = getelementptr inbounds float* %tmp6163, i64 1
+ %tmp6165 = getelementptr inbounds float* %tmp6164, i64 1
+ %tmp6166 = getelementptr inbounds float* %tmp6165, i64 1
+ %tmp6167 = getelementptr inbounds float* %tmp6166, i64 1
+ %tmp6168 = getelementptr inbounds float* %tmp6167, i64 1
+ %tmp6169 = getelementptr inbounds float* %tmp6168, i64 1
+ %tmp6170 = getelementptr inbounds float* %tmp6169, i64 1
+ %tmp6171 = getelementptr inbounds float* %tmp6170, i64 1
+ %tmp6172 = getelementptr inbounds float* %tmp6171, i64 1
+ %tmp6173 = getelementptr inbounds float* %tmp6172, i64 1
+ %tmp6174 = getelementptr inbounds float* %tmp6173, i64 1
+ %tmp6175 = getelementptr inbounds float* %tmp6174, i64 1
+ %tmp6176 = getelementptr inbounds float* %tmp6175, i64 1
+ %tmp6177 = getelementptr inbounds float* %tmp6176, i64 1
+ %tmp6178 = getelementptr inbounds float* %tmp6177, i64 1
+ %tmp6179 = getelementptr inbounds float* %tmp6178, i64 1
+ %tmp6180 = getelementptr inbounds float* %tmp6179, i64 1
+ %tmp6181 = getelementptr inbounds float* %tmp6180, i64 1
+ %tmp6182 = getelementptr inbounds float* %tmp6181, i64 1
+ %tmp6183 = getelementptr inbounds float* %tmp6182, i64 1
+ %tmp6184 = getelementptr inbounds float* %tmp6183, i64 1
+ %tmp6185 = getelementptr inbounds float* %tmp6184, i64 1
+ %tmp6186 = getelementptr inbounds float* %tmp6185, i64 1
+ %tmp6187 = getelementptr inbounds float* %tmp6186, i64 1
+ %tmp6188 = getelementptr inbounds float* %tmp6187, i64 1
+ %tmp6189 = getelementptr inbounds float* %tmp6188, i64 1
+ %tmp6190 = getelementptr inbounds float* %tmp6189, i64 1
+ %tmp6191 = getelementptr inbounds float* %tmp6190, i64 1
+ %tmp6192 = getelementptr inbounds float* %tmp6191, i64 1
+ %tmp6193 = getelementptr inbounds float* %tmp6192, i64 1
+ %tmp6194 = getelementptr inbounds float* %tmp6193, i64 1
+ %tmp6195 = getelementptr inbounds float* %tmp6194, i64 1
+ %tmp6196 = getelementptr inbounds float* %tmp6195, i64 1
+ %tmp6197 = getelementptr inbounds float* %tmp6196, i64 1
+ %tmp6198 = getelementptr inbounds float* %tmp6197, i64 1
+ %tmp6199 = getelementptr inbounds float* %tmp6198, i64 1
+ %tmp6200 = getelementptr inbounds float* %tmp6199, i64 1
+ %tmp6201 = getelementptr inbounds float* %tmp6200, i64 1
+ %tmp6202 = getelementptr inbounds float* %tmp6201, i64 1
+ %tmp6203 = getelementptr inbounds float* %tmp6202, i64 1
+ %tmp6204 = getelementptr inbounds float* %tmp6203, i64 1
+ %tmp6205 = getelementptr inbounds float* %tmp6204, i64 1
+ %tmp6206 = getelementptr inbounds float* %tmp6205, i64 1
+ %tmp6207 = getelementptr inbounds float* %tmp6206, i64 1
+ %tmp6208 = getelementptr inbounds float* %tmp6207, i64 1
+ %tmp6209 = getelementptr inbounds float* %tmp6208, i64 1
+ %tmp6210 = getelementptr inbounds float* %tmp6209, i64 1
+ %tmp6211 = getelementptr inbounds float* %tmp6210, i64 1
+ %tmp6212 = getelementptr inbounds float* %tmp6211, i64 1
+ %tmp6213 = getelementptr inbounds float* %tmp6212, i64 1
+ %tmp6214 = getelementptr inbounds float* %tmp6213, i64 1
+ %tmp6215 = getelementptr inbounds float* %tmp6214, i64 1
+ %tmp6216 = getelementptr inbounds float* %tmp6215, i64 1
+ %tmp6217 = getelementptr inbounds float* %tmp6216, i64 1
+ %tmp6218 = getelementptr inbounds float* %tmp6217, i64 1
+ %tmp6219 = getelementptr inbounds float* %tmp6218, i64 1
+ %tmp6220 = getelementptr inbounds float* %tmp6219, i64 1
+ %tmp6221 = getelementptr inbounds float* %tmp6220, i64 1
+ %tmp6222 = getelementptr inbounds float* %tmp6221, i64 1
+ %tmp6223 = getelementptr inbounds float* %tmp6222, i64 1
+ %tmp6224 = getelementptr inbounds float* %tmp6223, i64 1
+ %tmp6225 = getelementptr inbounds float* %tmp6224, i64 1
+ %tmp6226 = getelementptr inbounds float* %tmp6225, i64 1
+ %tmp6227 = getelementptr inbounds float* %tmp6226, i64 1
+ %tmp6228 = getelementptr inbounds float* %tmp6227, i64 1
+ %tmp6229 = getelementptr inbounds float* %tmp6228, i64 1
+ %tmp6230 = getelementptr inbounds float* %tmp6229, i64 1
+ %tmp6231 = getelementptr inbounds float* %tmp6230, i64 1
+ %tmp6232 = getelementptr inbounds float* %tmp6231, i64 1
+ %tmp6233 = getelementptr inbounds float* %tmp6232, i64 1
+ %tmp6234 = getelementptr inbounds float* %tmp6233, i64 1
+ %tmp6235 = getelementptr inbounds float* %tmp6234, i64 1
+ %tmp6236 = getelementptr inbounds float* %tmp6235, i64 1
+ %tmp6237 = getelementptr inbounds float* %tmp6236, i64 1
+ %tmp6238 = getelementptr inbounds float* %tmp6237, i64 1
+ %tmp6239 = getelementptr inbounds float* %tmp6238, i64 1
+ %tmp6240 = getelementptr inbounds float* %tmp6239, i64 1
+ %tmp6241 = getelementptr inbounds float* %tmp6240, i64 1
+ %tmp6242 = getelementptr inbounds float* %tmp6241, i64 1
+ %tmp6243 = getelementptr inbounds float* %tmp6242, i64 1
+ %tmp6244 = getelementptr inbounds float* %tmp6243, i64 1
+ %tmp6245 = getelementptr inbounds float* %tmp6244, i64 1
+ %tmp6246 = getelementptr inbounds float* %tmp6245, i64 1
+ %tmp6247 = getelementptr inbounds float* %tmp6246, i64 1
+ %tmp6248 = getelementptr inbounds float* %tmp6247, i64 1
+ %tmp6249 = getelementptr inbounds float* %tmp6248, i64 1
+ %tmp6250 = getelementptr inbounds float* %tmp6249, i64 1
+ %tmp6251 = getelementptr inbounds float* %tmp6250, i64 1
+ %tmp6252 = getelementptr inbounds float* %tmp6251, i64 1
+ %tmp6253 = getelementptr inbounds float* %tmp6252, i64 1
+ %tmp6254 = getelementptr inbounds float* %tmp6253, i64 1
+ %tmp6255 = getelementptr inbounds float* %tmp6254, i64 1
+ %tmp6256 = getelementptr inbounds float* %tmp6255, i64 1
+ %tmp6257 = getelementptr inbounds float* %tmp6256, i64 1
+ %tmp6258 = getelementptr inbounds float* %tmp6257, i64 1
+ %tmp6259 = getelementptr inbounds float* %tmp6258, i64 1
+ %tmp6260 = getelementptr inbounds float* %tmp6259, i64 1
+ %tmp6261 = getelementptr inbounds float* %tmp6260, i64 1
+ %tmp6262 = getelementptr inbounds float* %tmp6261, i64 1
+ %tmp6263 = getelementptr inbounds float* %tmp6262, i64 1
+ %tmp6264 = getelementptr inbounds float* %tmp6263, i64 1
+ %tmp6265 = getelementptr inbounds float* %tmp6264, i64 1
+ %tmp6266 = getelementptr inbounds float* %tmp6265, i64 1
+ %tmp6267 = getelementptr inbounds float* %tmp6266, i64 1
+ %tmp6268 = getelementptr inbounds float* %tmp6267, i64 1
+ %tmp6269 = getelementptr inbounds float* %tmp6268, i64 1
+ %tmp6270 = getelementptr inbounds float* %tmp6269, i64 1
+ %tmp6271 = getelementptr inbounds float* %tmp6270, i64 1
+ %tmp6272 = getelementptr inbounds float* %tmp6271, i64 1
+ %tmp6273 = getelementptr inbounds float* %tmp6272, i64 1
+ %tmp6274 = getelementptr inbounds float* %tmp6273, i64 1
+ %tmp6275 = getelementptr inbounds float* %tmp6274, i64 1
+ %tmp6276 = getelementptr inbounds float* %tmp6275, i64 1
+ %tmp6277 = getelementptr inbounds float* %tmp6276, i64 1
+ %tmp6278 = getelementptr inbounds float* %tmp6277, i64 1
+ %tmp6279 = getelementptr inbounds float* %tmp6278, i64 1
+ %tmp6280 = getelementptr inbounds float* %tmp6279, i64 1
+ %tmp6281 = getelementptr inbounds float* %tmp6280, i64 1
+ %tmp6282 = getelementptr inbounds float* %tmp6281, i64 1
+ %tmp6283 = getelementptr inbounds float* %tmp6282, i64 1
+ %tmp6284 = getelementptr inbounds float* %tmp6283, i64 1
+ %tmp6285 = getelementptr inbounds float* %tmp6284, i64 1
+ %tmp6286 = getelementptr inbounds float* %tmp6285, i64 1
+ %tmp6287 = getelementptr inbounds float* %tmp6286, i64 1
+ %tmp6288 = getelementptr inbounds float* %tmp6287, i64 1
+ %tmp6289 = getelementptr inbounds float* %tmp6288, i64 1
+ %tmp6290 = getelementptr inbounds float* %tmp6289, i64 1
+ %tmp6291 = getelementptr inbounds float* %tmp6290, i64 1
+ %tmp6292 = getelementptr inbounds float* %tmp6291, i64 1
+ %tmp6293 = getelementptr inbounds float* %tmp6292, i64 1
+ %tmp6294 = getelementptr inbounds float* %tmp6293, i64 1
+ %tmp6295 = getelementptr inbounds float* %tmp6294, i64 1
+ %tmp6296 = getelementptr inbounds float* %tmp6295, i64 1
+ %tmp6297 = getelementptr inbounds float* %tmp6296, i64 1
+ %tmp6298 = getelementptr inbounds float* %tmp6297, i64 1
+ %tmp6299 = getelementptr inbounds float* %tmp6298, i64 1
+ %tmp6300 = getelementptr inbounds float* %tmp6299, i64 1
+ %tmp6301 = getelementptr inbounds float* %tmp6300, i64 1
+ %tmp6302 = getelementptr inbounds float* %tmp6301, i64 1
+ %tmp6303 = getelementptr inbounds float* %tmp6302, i64 1
+ %tmp6304 = getelementptr inbounds float* %tmp6303, i64 1
+ %tmp6305 = getelementptr inbounds float* %tmp6304, i64 1
+ %tmp6306 = getelementptr inbounds float* %tmp6305, i64 1
+ %tmp6307 = getelementptr inbounds float* %tmp6306, i64 1
+ %tmp6308 = getelementptr inbounds float* %tmp6307, i64 1
+ %tmp6309 = getelementptr inbounds float* %tmp6308, i64 1
+ %tmp6310 = getelementptr inbounds float* %tmp6309, i64 1
+ %tmp6311 = getelementptr inbounds float* %tmp6310, i64 1
+ %tmp6312 = getelementptr inbounds float* %tmp6311, i64 1
+ %tmp6313 = getelementptr inbounds float* %tmp6312, i64 1
+ %tmp6314 = getelementptr inbounds float* %tmp6313, i64 1
+ %tmp6315 = getelementptr inbounds float* %tmp6314, i64 1
+ %tmp6316 = getelementptr inbounds float* %tmp6315, i64 1
+ %tmp6317 = getelementptr inbounds float* %tmp6316, i64 1
+ %tmp6318 = getelementptr inbounds float* %tmp6317, i64 1
+ %tmp6319 = getelementptr inbounds float* %tmp6318, i64 1
+ %tmp6320 = getelementptr inbounds float* %tmp6319, i64 1
+ %tmp6321 = getelementptr inbounds float* %tmp6320, i64 1
+ %tmp6322 = getelementptr inbounds float* %tmp6321, i64 1
+ %tmp6323 = getelementptr inbounds float* %tmp6322, i64 1
+ %tmp6324 = getelementptr inbounds float* %tmp6323, i64 1
+ %tmp6325 = getelementptr inbounds float* %tmp6324, i64 1
+ %tmp6326 = getelementptr inbounds float* %tmp6325, i64 1
+ %tmp6327 = getelementptr inbounds float* %tmp6326, i64 1
+ %tmp6328 = getelementptr inbounds float* %tmp6327, i64 1
+ %tmp6329 = getelementptr inbounds float* %tmp6328, i64 1
+ %tmp6330 = getelementptr inbounds float* %tmp6329, i64 1
+ %tmp6331 = getelementptr inbounds float* %tmp6330, i64 1
+ %tmp6332 = getelementptr inbounds float* %tmp6331, i64 1
+ %tmp6333 = getelementptr inbounds float* %tmp6332, i64 1
+ %tmp6334 = getelementptr inbounds float* %tmp6333, i64 1
+ %tmp6335 = getelementptr inbounds float* %tmp6334, i64 1
+ %tmp6336 = getelementptr inbounds float* %tmp6335, i64 1
+ %tmp6337 = getelementptr inbounds float* %tmp6336, i64 1
+ %tmp6338 = getelementptr inbounds float* %tmp6337, i64 1
+ %tmp6339 = getelementptr inbounds float* %tmp6338, i64 1
+ %tmp6340 = getelementptr inbounds float* %tmp6339, i64 1
+ %tmp6341 = getelementptr inbounds float* %tmp6340, i64 1
+ %tmp6342 = getelementptr inbounds float* %tmp6341, i64 1
+ %tmp6343 = getelementptr inbounds float* %tmp6342, i64 1
+ %tmp6344 = getelementptr inbounds float* %tmp6343, i64 1
+ %tmp6345 = getelementptr inbounds float* %tmp6344, i64 1
+ %tmp6346 = getelementptr inbounds float* %tmp6345, i64 1
+ %tmp6347 = getelementptr inbounds float* %tmp6346, i64 1
+ %tmp6348 = getelementptr inbounds float* %tmp6347, i64 1
+ %tmp6349 = getelementptr inbounds float* %tmp6348, i64 1
+ %tmp6350 = getelementptr inbounds float* %tmp6349, i64 1
+ %tmp6351 = getelementptr inbounds float* %tmp6350, i64 1
+ %tmp6352 = getelementptr inbounds float* %tmp6351, i64 1
+ %tmp6353 = getelementptr inbounds float* %tmp6352, i64 1
+ %tmp6354 = getelementptr inbounds float* %tmp6353, i64 1
+ %tmp6355 = getelementptr inbounds float* %tmp6354, i64 1
+ %tmp6356 = getelementptr inbounds float* %tmp6355, i64 1
+ %tmp6357 = getelementptr inbounds float* %tmp6356, i64 1
+ %tmp6358 = getelementptr inbounds float* %tmp6357, i64 1
+ %tmp6359 = getelementptr inbounds float* %tmp6358, i64 1
+ %tmp6360 = getelementptr inbounds float* %tmp6359, i64 1
+ %tmp6361 = getelementptr inbounds float* %tmp6360, i64 1
+ %tmp6362 = getelementptr inbounds float* %tmp6361, i64 1
+ %tmp6363 = getelementptr inbounds float* %tmp6362, i64 1
+ %tmp6364 = getelementptr inbounds float* %tmp6363, i64 1
+ %tmp6365 = getelementptr inbounds float* %tmp6364, i64 1
+ %tmp6366 = getelementptr inbounds float* %tmp6365, i64 1
+ %tmp6367 = getelementptr inbounds float* %tmp6366, i64 1
+ %tmp6368 = getelementptr inbounds float* %tmp6367, i64 1
+ %tmp6369 = getelementptr inbounds float* %tmp6368, i64 1
+ %tmp6370 = getelementptr inbounds float* %tmp6369, i64 1
+ %tmp6371 = getelementptr inbounds float* %tmp6370, i64 1
+ %tmp6372 = getelementptr inbounds float* %tmp6371, i64 1
+ %tmp6373 = getelementptr inbounds float* %tmp6372, i64 1
+ %tmp6374 = getelementptr inbounds float* %tmp6373, i64 1
+ %tmp6375 = getelementptr inbounds float* %tmp6374, i64 1
+ %tmp6376 = getelementptr inbounds float* %tmp6375, i64 1
+ %tmp6377 = getelementptr inbounds float* %tmp6376, i64 1
+ %tmp6378 = getelementptr inbounds float* %tmp6377, i64 1
+ %tmp6379 = getelementptr inbounds float* %tmp6378, i64 1
+ %tmp6380 = getelementptr inbounds float* %tmp6379, i64 1
+ %tmp6381 = getelementptr inbounds float* %tmp6380, i64 1
+ %tmp6382 = getelementptr inbounds float* %tmp6381, i64 1
+ %tmp6383 = getelementptr inbounds float* %tmp6382, i64 1
+ %tmp6384 = getelementptr inbounds float* %tmp6383, i64 1
+ %tmp6385 = getelementptr inbounds float* %tmp6384, i64 1
+ %tmp6386 = getelementptr inbounds float* %tmp6385, i64 1
+ %tmp6387 = getelementptr inbounds float* %tmp6386, i64 1
+ %tmp6388 = getelementptr inbounds float* %tmp6387, i64 1
+ %tmp6389 = getelementptr inbounds float* %tmp6388, i64 1
+ %tmp6390 = getelementptr inbounds float* %tmp6389, i64 1
+ %tmp6391 = getelementptr inbounds float* %tmp6390, i64 1
+ %tmp6392 = getelementptr inbounds float* %tmp6391, i64 1
+ %tmp6393 = getelementptr inbounds float* %tmp6392, i64 1
+ %tmp6394 = getelementptr inbounds float* %tmp6393, i64 1
+ %tmp6395 = getelementptr inbounds float* %tmp6394, i64 1
+ %tmp6396 = getelementptr inbounds float* %tmp6395, i64 1
+ %tmp6397 = getelementptr inbounds float* %tmp6396, i64 1
+ %tmp6398 = getelementptr inbounds float* %tmp6397, i64 1
+ %tmp6399 = getelementptr inbounds float* %tmp6398, i64 1
+ %tmp6400 = getelementptr inbounds float* %tmp6399, i64 1
+ %tmp6401 = getelementptr inbounds float* %tmp6400, i64 1
+ %tmp6402 = getelementptr inbounds float* %tmp6401, i64 1
+ %tmp6403 = getelementptr inbounds float* %tmp6402, i64 1
+ %tmp6404 = getelementptr inbounds float* %tmp6403, i64 1
+ %tmp6405 = getelementptr inbounds float* %tmp6404, i64 1
+ %tmp6406 = getelementptr inbounds float* %tmp6405, i64 1
+ %tmp6407 = getelementptr inbounds float* %tmp6406, i64 1
+ %tmp6408 = getelementptr inbounds float* %tmp6407, i64 1
+ %tmp6409 = getelementptr inbounds float* %tmp6408, i64 1
+ %tmp6410 = getelementptr inbounds float* %tmp6409, i64 1
+ %tmp6411 = getelementptr inbounds float* %tmp6410, i64 1
+ %tmp6412 = getelementptr inbounds float* %tmp6411, i64 1
+ %tmp6413 = getelementptr inbounds float* %tmp6412, i64 1
+ %tmp6414 = getelementptr inbounds float* %tmp6413, i64 1
+ %tmp6415 = getelementptr inbounds float* %tmp6414, i64 1
+ %tmp6416 = getelementptr inbounds float* %tmp6415, i64 1
+ %tmp6417 = getelementptr inbounds float* %tmp6416, i64 1
+ %tmp6418 = getelementptr inbounds float* %tmp6417, i64 1
+ %tmp6419 = getelementptr inbounds float* %tmp6418, i64 1
+ %tmp6420 = getelementptr inbounds float* %tmp6419, i64 1
+ %tmp6421 = getelementptr inbounds float* %tmp6420, i64 1
+ %tmp6422 = getelementptr inbounds float* %tmp6421, i64 1
+ %tmp6423 = getelementptr inbounds float* %tmp6422, i64 1
+ %tmp6424 = getelementptr inbounds float* %tmp6423, i64 1
+ %tmp6425 = getelementptr inbounds float* %tmp6424, i64 1
+ %tmp6426 = getelementptr inbounds float* %tmp6425, i64 1
+ %tmp6427 = getelementptr inbounds float* %tmp6426, i64 1
+ %tmp6428 = getelementptr inbounds float* %tmp6427, i64 1
+ %tmp6429 = getelementptr inbounds float* %tmp6428, i64 1
+ %tmp6430 = getelementptr inbounds float* %tmp6429, i64 1
+ %tmp6431 = getelementptr inbounds float* %tmp6430, i64 1
+ %tmp6432 = getelementptr inbounds float* %tmp6431, i64 1
+ %tmp6433 = getelementptr inbounds float* %tmp6432, i64 1
+ %tmp6434 = getelementptr inbounds float* %tmp6433, i64 1
+ %tmp6435 = getelementptr inbounds float* %tmp6434, i64 1
+ %tmp6436 = getelementptr inbounds float* %tmp6435, i64 1
+ %tmp6437 = getelementptr inbounds float* %tmp6436, i64 1
+ %tmp6438 = getelementptr inbounds float* %tmp6437, i64 1
+ %tmp6439 = getelementptr inbounds float* %tmp6438, i64 1
+ %tmp6440 = getelementptr inbounds float* %tmp6439, i64 1
+ %tmp6441 = getelementptr inbounds float* %tmp6440, i64 1
+ %tmp6442 = getelementptr inbounds float* %tmp6441, i64 1
+ %tmp6443 = getelementptr inbounds float* %tmp6442, i64 1
+ %tmp6444 = getelementptr inbounds float* %tmp6443, i64 1
+ %tmp6445 = getelementptr inbounds float* %tmp6444, i64 1
+ %tmp6446 = getelementptr inbounds float* %tmp6445, i64 1
+ %tmp6447 = getelementptr inbounds float* %tmp6446, i64 1
+ %tmp6448 = getelementptr inbounds float* %tmp6447, i64 1
+ %tmp6449 = getelementptr inbounds float* %tmp6448, i64 1
+ %tmp6450 = getelementptr inbounds float* %tmp6449, i64 1
+ %tmp6451 = getelementptr inbounds float* %tmp6450, i64 1
+ %tmp6452 = getelementptr inbounds float* %tmp6451, i64 1
+ %tmp6453 = getelementptr inbounds float* %tmp6452, i64 1
+ %tmp6454 = getelementptr inbounds float* %tmp6453, i64 1
+ %tmp6455 = getelementptr inbounds float* %tmp6454, i64 1
+ %tmp6456 = getelementptr inbounds float* %tmp6455, i64 1
+ %tmp6457 = getelementptr inbounds float* %tmp6456, i64 1
+ %tmp6458 = getelementptr inbounds float* %tmp6457, i64 1
+ %tmp6459 = getelementptr inbounds float* %tmp6458, i64 1
+ %tmp6460 = getelementptr inbounds float* %tmp6459, i64 1
+ %tmp6461 = getelementptr inbounds float* %tmp6460, i64 1
+ %tmp6462 = getelementptr inbounds float* %tmp6461, i64 1
+ %tmp6463 = getelementptr inbounds float* %tmp6462, i64 1
+ %tmp6464 = getelementptr inbounds float* %tmp6463, i64 1
+ %tmp6465 = getelementptr inbounds float* %tmp6464, i64 1
+ %tmp6466 = getelementptr inbounds float* %tmp6465, i64 1
+ %tmp6467 = getelementptr inbounds float* %tmp6466, i64 1
+ %tmp6468 = getelementptr inbounds float* %tmp6467, i64 1
+ %tmp6469 = getelementptr inbounds float* %tmp6468, i64 1
+ %tmp6470 = getelementptr inbounds float* %tmp6469, i64 1
+ %tmp6471 = getelementptr inbounds float* %tmp6470, i64 1
+ %tmp6472 = getelementptr inbounds float* %tmp6471, i64 1
+ %tmp6473 = getelementptr inbounds float* %tmp6472, i64 1
+ %tmp6474 = getelementptr inbounds float* %tmp6473, i64 1
+ %tmp6475 = getelementptr inbounds float* %tmp6474, i64 1
+ %tmp6476 = getelementptr inbounds float* %tmp6475, i64 1
+ %tmp6477 = getelementptr inbounds float* %tmp6476, i64 1
+ %tmp6478 = getelementptr inbounds float* %tmp6477, i64 1
+ %tmp6479 = getelementptr inbounds float* %tmp6478, i64 1
+ %tmp6480 = getelementptr inbounds float* %tmp6479, i64 1
+ %tmp6481 = getelementptr inbounds float* %tmp6480, i64 1
+ %tmp6482 = getelementptr inbounds float* %tmp6481, i64 1
+ %tmp6483 = getelementptr inbounds float* %tmp6482, i64 1
+ %tmp6484 = getelementptr inbounds float* %tmp6483, i64 1
+ %tmp6485 = getelementptr inbounds float* %tmp6484, i64 1
+ %tmp6486 = getelementptr inbounds float* %tmp6485, i64 1
+ %tmp6487 = getelementptr inbounds float* %tmp6486, i64 1
+ %tmp6488 = getelementptr inbounds float* %tmp6487, i64 1
+ %tmp6489 = getelementptr inbounds float* %tmp6488, i64 1
+ %tmp6490 = getelementptr inbounds float* %tmp6489, i64 1
+ %tmp6491 = getelementptr inbounds float* %tmp6490, i64 1
+ %tmp6492 = getelementptr inbounds float* %tmp6491, i64 1
+ %tmp6493 = getelementptr inbounds float* %tmp6492, i64 1
+ %tmp6494 = getelementptr inbounds float* %tmp6493, i64 1
+ %tmp6495 = getelementptr inbounds float* %tmp6494, i64 1
+ %tmp6496 = getelementptr inbounds float* %tmp6495, i64 1
+ %tmp6497 = getelementptr inbounds float* %tmp6496, i64 1
+ %tmp6498 = getelementptr inbounds float* %tmp6497, i64 1
+ %tmp6499 = getelementptr inbounds float* %tmp6498, i64 1
+ %tmp6500 = getelementptr inbounds float* %tmp6499, i64 1
+ %tmp6501 = getelementptr inbounds float* %tmp6500, i64 1
+ %tmp6502 = getelementptr inbounds float* %tmp6501, i64 1
+ %tmp6503 = getelementptr inbounds float* %tmp6502, i64 1
+ %tmp6504 = getelementptr inbounds float* %tmp6503, i64 1
+ %tmp6505 = getelementptr inbounds float* %tmp6504, i64 1
+ %tmp6506 = getelementptr inbounds float* %tmp6505, i64 1
+ %tmp6507 = getelementptr inbounds float* %tmp6506, i64 1
+ %tmp6508 = getelementptr inbounds float* %tmp6507, i64 1
+ %tmp6509 = getelementptr inbounds float* %tmp6508, i64 1
+ %tmp6510 = getelementptr inbounds float* %tmp6509, i64 1
+ %tmp6511 = getelementptr inbounds float* %tmp6510, i64 1
+ %tmp6512 = getelementptr inbounds float* %tmp6511, i64 1
+ %tmp6513 = getelementptr inbounds float* %tmp6512, i64 1
+ %tmp6514 = getelementptr inbounds float* %tmp6513, i64 1
+ %tmp6515 = getelementptr inbounds float* %tmp6514, i64 1
+ %tmp6516 = getelementptr inbounds float* %tmp6515, i64 1
+ %tmp6517 = getelementptr inbounds float* %tmp6516, i64 1
+ %tmp6518 = getelementptr inbounds float* %tmp6517, i64 1
+ %tmp6519 = getelementptr inbounds float* %tmp6518, i64 1
+ %tmp6520 = getelementptr inbounds float* %tmp6519, i64 1
+ %tmp6521 = getelementptr inbounds float* %tmp6520, i64 1
+ %tmp6522 = getelementptr inbounds float* %tmp6521, i64 1
+ %tmp6523 = getelementptr inbounds float* %tmp6522, i64 1
+ %tmp6524 = getelementptr inbounds float* %tmp6523, i64 1
+ %tmp6525 = getelementptr inbounds float* %tmp6524, i64 1
+ %tmp6526 = getelementptr inbounds float* %tmp6525, i64 1
+ %tmp6527 = getelementptr inbounds float* %tmp6526, i64 1
+ %tmp6528 = getelementptr inbounds float* %tmp6527, i64 1
+ %tmp6529 = getelementptr inbounds float* %tmp6528, i64 1
+ %tmp6530 = getelementptr inbounds float* %tmp6529, i64 1
+ %tmp6531 = getelementptr inbounds float* %tmp6530, i64 1
+ %tmp6532 = getelementptr inbounds float* %tmp6531, i64 1
+ %tmp6533 = getelementptr inbounds float* %tmp6532, i64 1
+ %tmp6534 = getelementptr inbounds float* %tmp6533, i64 1
+ %tmp6535 = getelementptr inbounds float* %tmp6534, i64 1
+ %tmp6536 = getelementptr inbounds float* %tmp6535, i64 1
+ %tmp6537 = getelementptr inbounds float* %tmp6536, i64 1
+ %tmp6538 = getelementptr inbounds float* %tmp6537, i64 1
+ %tmp6539 = getelementptr inbounds float* %tmp6538, i64 1
+ %tmp6540 = getelementptr inbounds float* %tmp6539, i64 1
+ %tmp6541 = getelementptr inbounds float* %tmp6540, i64 1
+ %tmp6542 = getelementptr inbounds float* %tmp6541, i64 1
+ %tmp6543 = getelementptr inbounds float* %tmp6542, i64 1
+ %tmp6544 = getelementptr inbounds float* %tmp6543, i64 1
+ %tmp6545 = getelementptr inbounds float* %tmp6544, i64 1
+ %tmp6546 = getelementptr inbounds float* %tmp6545, i64 1
+ %tmp6547 = getelementptr inbounds float* %tmp6546, i64 1
+ %tmp6548 = getelementptr inbounds float* %tmp6547, i64 1
+ %tmp6549 = getelementptr inbounds float* %tmp6548, i64 1
+ %tmp6550 = getelementptr inbounds float* %tmp6549, i64 1
+ %tmp6551 = getelementptr inbounds float* %tmp6550, i64 1
+ %tmp6552 = getelementptr inbounds float* %tmp6551, i64 1
+ %tmp6553 = getelementptr inbounds float* %tmp6552, i64 1
+ %tmp6554 = getelementptr inbounds float* %tmp6553, i64 1
+ %tmp6555 = getelementptr inbounds float* %tmp6554, i64 1
+ %tmp6556 = getelementptr inbounds float* %tmp6555, i64 1
+ %tmp6557 = getelementptr inbounds float* %tmp6556, i64 1
+ %tmp6558 = getelementptr inbounds float* %tmp6557, i64 1
+ %tmp6559 = getelementptr inbounds float* %tmp6558, i64 1
+ %tmp6560 = getelementptr inbounds float* %tmp6559, i64 1
+ %tmp6561 = getelementptr inbounds float* %tmp6560, i64 1
+ %tmp6562 = getelementptr inbounds float* %tmp6561, i64 1
+ %tmp6563 = getelementptr inbounds float* %tmp6562, i64 1
+ %tmp6564 = getelementptr inbounds float* %tmp6563, i64 1
+ %tmp6565 = getelementptr inbounds float* %tmp6564, i64 1
+ %tmp6566 = getelementptr inbounds float* %tmp6565, i64 1
+ %tmp6567 = getelementptr inbounds float* %tmp6566, i64 1
+ %tmp6568 = getelementptr inbounds float* %tmp6567, i64 1
+ %tmp6569 = getelementptr inbounds float* %tmp6568, i64 1
+ %tmp6570 = getelementptr inbounds float* %tmp6569, i64 1
+ %tmp6571 = getelementptr inbounds float* %tmp6570, i64 1
+ %tmp6572 = getelementptr inbounds float* %tmp6571, i64 1
+ %tmp6573 = getelementptr inbounds float* %tmp6572, i64 1
+ %tmp6574 = getelementptr inbounds float* %tmp6573, i64 1
+ %tmp6575 = getelementptr inbounds float* %tmp6574, i64 1
+ %tmp6576 = getelementptr inbounds float* %tmp6575, i64 1
+ %tmp6577 = getelementptr inbounds float* %tmp6576, i64 1
+ %tmp6578 = getelementptr inbounds float* %tmp6577, i64 1
+ %tmp6579 = getelementptr inbounds float* %tmp6578, i64 1
+ %tmp6580 = getelementptr inbounds float* %tmp6579, i64 1
+ %tmp6581 = getelementptr inbounds float* %tmp6580, i64 1
+ %tmp6582 = getelementptr inbounds float* %tmp6581, i64 1
+ %tmp6583 = getelementptr inbounds float* %tmp6582, i64 1
+ %tmp6584 = getelementptr inbounds float* %tmp6583, i64 1
+ %tmp6585 = getelementptr inbounds float* %tmp6584, i64 1
+ %tmp6586 = getelementptr inbounds float* %tmp6585, i64 1
+ %tmp6587 = getelementptr inbounds float* %tmp6586, i64 1
+ %tmp6588 = getelementptr inbounds float* %tmp6587, i64 1
+ %tmp6589 = getelementptr inbounds float* %tmp6588, i64 1
+ %tmp6590 = getelementptr inbounds float* %tmp6589, i64 1
+ %tmp6591 = getelementptr inbounds float* %tmp6590, i64 1
+ %tmp6592 = getelementptr inbounds float* %tmp6591, i64 1
+ %tmp6593 = getelementptr inbounds float* %tmp6592, i64 1
+ %tmp6594 = getelementptr inbounds float* %tmp6593, i64 1
+ %tmp6595 = getelementptr inbounds float* %tmp6594, i64 1
+ %tmp6596 = getelementptr inbounds float* %tmp6595, i64 1
+ %tmp6597 = getelementptr inbounds float* %tmp6596, i64 1
+ %tmp6598 = getelementptr inbounds float* %tmp6597, i64 1
+ %tmp6599 = getelementptr inbounds float* %tmp6598, i64 1
+ %tmp6600 = getelementptr inbounds float* %tmp6599, i64 1
+ %tmp6601 = getelementptr inbounds float* %tmp6600, i64 1
+ %tmp6602 = getelementptr inbounds float* %tmp6601, i64 1
+ %tmp6603 = getelementptr inbounds float* %tmp6602, i64 1
+ %tmp6604 = getelementptr inbounds float* %tmp6603, i64 1
+ %tmp6605 = getelementptr inbounds float* %tmp6604, i64 1
+ %tmp6606 = getelementptr inbounds float* %tmp6605, i64 1
+ %tmp6607 = getelementptr inbounds float* %tmp6606, i64 1
+ %tmp6608 = getelementptr inbounds float* %tmp6607, i64 1
+ %tmp6609 = getelementptr inbounds float* %tmp6608, i64 1
+ %tmp6610 = getelementptr inbounds float* %tmp6609, i64 1
+ %tmp6611 = getelementptr inbounds float* %tmp6610, i64 1
+ %tmp6612 = getelementptr inbounds float* %tmp6611, i64 1
+ %tmp6613 = getelementptr inbounds float* %tmp6612, i64 1
+ %tmp6614 = getelementptr inbounds float* %tmp6613, i64 1
+ %tmp6615 = getelementptr inbounds float* %tmp6614, i64 1
+ %tmp6616 = getelementptr inbounds float* %tmp6615, i64 1
+ %tmp6617 = getelementptr inbounds float* %tmp6616, i64 1
+ %tmp6618 = getelementptr inbounds float* %tmp6617, i64 1
+ %tmp6619 = getelementptr inbounds float* %tmp6618, i64 1
+ %tmp6620 = getelementptr inbounds float* %tmp6619, i64 1
+ %tmp6621 = getelementptr inbounds float* %tmp6620, i64 1
+ %tmp6622 = getelementptr inbounds float* %tmp6621, i64 1
+ %tmp6623 = getelementptr inbounds float* %tmp6622, i64 1
+ %tmp6624 = getelementptr inbounds float* %tmp6623, i64 1
+ %tmp6625 = getelementptr inbounds float* %tmp6624, i64 1
+ %tmp6626 = getelementptr inbounds float* %tmp6625, i64 1
+ %tmp6627 = getelementptr inbounds float* %tmp6626, i64 1
+ %tmp6628 = getelementptr inbounds float* %tmp6627, i64 1
+ %tmp6629 = getelementptr inbounds float* %tmp6628, i64 1
+ %tmp6630 = getelementptr inbounds float* %tmp6629, i64 1
+ %tmp6631 = getelementptr inbounds float* %tmp6630, i64 1
+ %tmp6632 = getelementptr inbounds float* %tmp6631, i64 1
+ %tmp6633 = getelementptr inbounds float* %tmp6632, i64 1
+ %tmp6634 = getelementptr inbounds float* %tmp6633, i64 1
+ %tmp6635 = getelementptr inbounds float* %tmp6634, i64 1
+ %tmp6636 = getelementptr inbounds float* %tmp6635, i64 1
+ %tmp6637 = getelementptr inbounds float* %tmp6636, i64 1
+ %tmp6638 = getelementptr inbounds float* %tmp6637, i64 1
+ %tmp6639 = getelementptr inbounds float* %tmp6638, i64 1
+ %tmp6640 = getelementptr inbounds float* %tmp6639, i64 1
+ %tmp6641 = getelementptr inbounds float* %tmp6640, i64 1
+ %tmp6642 = getelementptr inbounds float* %tmp6641, i64 1
+ %tmp6643 = getelementptr inbounds float* %tmp6642, i64 1
+ %tmp6644 = getelementptr inbounds float* %tmp6643, i64 1
+ %tmp6645 = getelementptr inbounds float* %tmp6644, i64 1
+ %tmp6646 = getelementptr inbounds float* %tmp6645, i64 1
+ %tmp6647 = getelementptr inbounds float* %tmp6646, i64 1
+ %tmp6648 = getelementptr inbounds float* %tmp6647, i64 1
+ %tmp6649 = getelementptr inbounds float* %tmp6648, i64 1
+ %tmp6650 = getelementptr inbounds float* %tmp6649, i64 1
+ %tmp6651 = getelementptr inbounds float* %tmp6650, i64 1
+ %tmp6652 = getelementptr inbounds float* %tmp6651, i64 1
+ %tmp6653 = getelementptr inbounds float* %tmp6652, i64 1
+ %tmp6654 = getelementptr inbounds float* %tmp6653, i64 1
+ %tmp6655 = getelementptr inbounds float* %tmp6654, i64 1
+ %tmp6656 = getelementptr inbounds float* %tmp6655, i64 1
+ %tmp6657 = getelementptr inbounds float* %tmp6656, i64 1
+ %tmp6658 = getelementptr inbounds float* %tmp6657, i64 1
+ %tmp6659 = getelementptr inbounds float* %tmp6658, i64 1
+ %tmp6660 = getelementptr inbounds float* %tmp6659, i64 1
+ %tmp6661 = getelementptr inbounds float* %tmp6660, i64 1
+ %tmp6662 = getelementptr inbounds float* %tmp6661, i64 1
+ %tmp6663 = getelementptr inbounds float* %tmp6662, i64 1
+ %tmp6664 = getelementptr inbounds float* %tmp6663, i64 1
+ %tmp6665 = getelementptr inbounds float* %tmp6664, i64 1
+ %tmp6666 = getelementptr inbounds float* %tmp6665, i64 1
+ %tmp6667 = getelementptr inbounds float* %tmp6666, i64 1
+ %tmp6668 = getelementptr inbounds float* %tmp6667, i64 1
+ %tmp6669 = getelementptr inbounds float* %tmp6668, i64 1
+ %tmp6670 = getelementptr inbounds float* %tmp6669, i64 1
+ %tmp6671 = getelementptr inbounds float* %tmp6670, i64 1
+ %tmp6672 = getelementptr inbounds float* %tmp6671, i64 1
+ %tmp6673 = getelementptr inbounds float* %tmp6672, i64 1
+ %tmp6674 = getelementptr inbounds float* %tmp6673, i64 1
+ %tmp6675 = getelementptr inbounds float* %tmp6674, i64 1
+ %tmp6676 = getelementptr inbounds float* %tmp6675, i64 1
+ %tmp6677 = getelementptr inbounds float* %tmp6676, i64 1
+ %tmp6678 = getelementptr inbounds float* %tmp6677, i64 1
+ %tmp6679 = getelementptr inbounds float* %tmp6678, i64 1
+ %tmp6680 = getelementptr inbounds float* %tmp6679, i64 1
+ %tmp6681 = getelementptr inbounds float* %tmp6680, i64 1
+ %tmp6682 = getelementptr inbounds float* %tmp6681, i64 1
+ %tmp6683 = getelementptr inbounds float* %tmp6682, i64 1
+ %tmp6684 = getelementptr inbounds float* %tmp6683, i64 1
+ %tmp6685 = getelementptr inbounds float* %tmp6684, i64 1
+ %tmp6686 = getelementptr inbounds float* %tmp6685, i64 1
+ %tmp6687 = getelementptr inbounds float* %tmp6686, i64 1
+ %tmp6688 = getelementptr inbounds float* %tmp6687, i64 1
+ %tmp6689 = getelementptr inbounds float* %tmp6688, i64 1
+ %tmp6690 = getelementptr inbounds float* %tmp6689, i64 1
+ %tmp6691 = getelementptr inbounds float* %tmp6690, i64 1
+ %tmp6692 = getelementptr inbounds float* %tmp6691, i64 1
+ %tmp6693 = getelementptr inbounds float* %tmp6692, i64 1
+ %tmp6694 = getelementptr inbounds float* %tmp6693, i64 1
+ %tmp6695 = getelementptr inbounds float* %tmp6694, i64 1
+ %tmp6696 = getelementptr inbounds float* %tmp6695, i64 1
+ %tmp6697 = getelementptr inbounds float* %tmp6696, i64 1
+ %tmp6698 = getelementptr inbounds float* %tmp6697, i64 1
+ %tmp6699 = getelementptr inbounds float* %tmp6698, i64 1
+ %tmp6700 = getelementptr inbounds float* %tmp6699, i64 1
+ %tmp6701 = getelementptr inbounds float* %tmp6700, i64 1
+ %tmp6702 = getelementptr inbounds float* %tmp6701, i64 1
+ %tmp6703 = getelementptr inbounds float* %tmp6702, i64 1
+ %tmp6704 = getelementptr inbounds float* %tmp6703, i64 1
+ %tmp6705 = getelementptr inbounds float* %tmp6704, i64 1
+ %tmp6706 = getelementptr inbounds float* %tmp6705, i64 1
+ %tmp6707 = getelementptr inbounds float* %tmp6706, i64 1
+ %tmp6708 = getelementptr inbounds float* %tmp6707, i64 1
+ %tmp6709 = getelementptr inbounds float* %tmp6708, i64 1
+ %tmp6710 = getelementptr inbounds float* %tmp6709, i64 1
+ %tmp6711 = getelementptr inbounds float* %tmp6710, i64 1
+ %tmp6712 = getelementptr inbounds float* %tmp6711, i64 1
+ %tmp6713 = getelementptr inbounds float* %tmp6712, i64 1
+ %tmp6714 = getelementptr inbounds float* %tmp6713, i64 1
+ %tmp6715 = getelementptr inbounds float* %tmp6714, i64 1
+ %tmp6716 = getelementptr inbounds float* %tmp6715, i64 1
+ %tmp6717 = getelementptr inbounds float* %tmp6716, i64 1
+ %tmp6718 = getelementptr inbounds float* %tmp6717, i64 1
+ %tmp6719 = getelementptr inbounds float* %tmp6718, i64 1
+ %tmp6720 = getelementptr inbounds float* %tmp6719, i64 1
+ %tmp6721 = getelementptr inbounds float* %tmp6720, i64 1
+ %tmp6722 = getelementptr inbounds float* %tmp6721, i64 1
+ %tmp6723 = getelementptr inbounds float* %tmp6722, i64 1
+ %tmp6724 = getelementptr inbounds float* %tmp6723, i64 1
+ %tmp6725 = getelementptr inbounds float* %tmp6724, i64 1
+ %tmp6726 = getelementptr inbounds float* %tmp6725, i64 1
+ %tmp6727 = getelementptr inbounds float* %tmp6726, i64 1
+ %tmp6728 = getelementptr inbounds float* %tmp6727, i64 1
+ %tmp6729 = getelementptr inbounds float* %tmp6728, i64 1
+ %tmp6730 = getelementptr inbounds float* %tmp6729, i64 1
+ %tmp6731 = getelementptr inbounds float* %tmp6730, i64 1
+ %tmp6732 = getelementptr inbounds float* %tmp6731, i64 1
+ %tmp6733 = getelementptr inbounds float* %tmp6732, i64 1
+ %tmp6734 = getelementptr inbounds float* %tmp6733, i64 1
+ %tmp6735 = getelementptr inbounds float* %tmp6734, i64 1
+ %tmp6736 = getelementptr inbounds float* %tmp6735, i64 1
+ %tmp6737 = getelementptr inbounds float* %tmp6736, i64 1
+ %tmp6738 = getelementptr inbounds float* %tmp6737, i64 1
+ %tmp6739 = getelementptr inbounds float* %tmp6738, i64 1
+ %tmp6740 = getelementptr inbounds float* %tmp6739, i64 1
+ %tmp6741 = getelementptr inbounds float* %tmp6740, i64 1
+ %tmp6742 = getelementptr inbounds float* %tmp6741, i64 1
+ %tmp6743 = getelementptr inbounds float* %tmp6742, i64 1
+ %tmp6744 = getelementptr inbounds float* %tmp6743, i64 1
+ %tmp6745 = getelementptr inbounds float* %tmp6744, i64 1
+ %tmp6746 = getelementptr inbounds float* %tmp6745, i64 1
+ %tmp6747 = getelementptr inbounds float* %tmp6746, i64 1
+ %tmp6748 = getelementptr inbounds float* %tmp6747, i64 1
+ %tmp6749 = getelementptr inbounds float* %tmp6748, i64 1
+ %tmp6750 = getelementptr inbounds float* %tmp6749, i64 1
+ %tmp6751 = getelementptr inbounds float* %tmp6750, i64 1
+ %tmp6752 = getelementptr inbounds float* %tmp6751, i64 1
+ %tmp6753 = getelementptr inbounds float* %tmp6752, i64 1
+ %tmp6754 = getelementptr inbounds float* %tmp6753, i64 1
+ %tmp6755 = getelementptr inbounds float* %tmp6754, i64 1
+ %tmp6756 = getelementptr inbounds float* %tmp6755, i64 1
+ %tmp6757 = getelementptr inbounds float* %tmp6756, i64 1
+ %tmp6758 = getelementptr inbounds float* %tmp6757, i64 1
+ %tmp6759 = getelementptr inbounds float* %tmp6758, i64 1
+ %tmp6760 = getelementptr inbounds float* %tmp6759, i64 1
+ %tmp6761 = getelementptr inbounds float* %tmp6760, i64 1
+ %tmp6762 = getelementptr inbounds float* %tmp6761, i64 1
+ %tmp6763 = getelementptr inbounds float* %tmp6762, i64 1
+ %tmp6764 = getelementptr inbounds float* %tmp6763, i64 1
+ %tmp6765 = getelementptr inbounds float* %tmp6764, i64 1
+ %tmp6766 = getelementptr inbounds float* %tmp6765, i64 1
+ %tmp6767 = getelementptr inbounds float* %tmp6766, i64 1
+ %tmp6768 = getelementptr inbounds float* %tmp6767, i64 1
+ %tmp6769 = getelementptr inbounds float* %tmp6768, i64 1
+ %tmp6770 = getelementptr inbounds float* %tmp6769, i64 1
+ %tmp6771 = getelementptr inbounds float* %tmp6770, i64 1
+ %tmp6772 = getelementptr inbounds float* %tmp6771, i64 1
+ %tmp6773 = getelementptr inbounds float* %tmp6772, i64 1
+ %tmp6774 = getelementptr inbounds float* %tmp6773, i64 1
+ %tmp6775 = getelementptr inbounds float* %tmp6774, i64 1
+ %tmp6776 = getelementptr inbounds float* %tmp6775, i64 1
+ %tmp6777 = getelementptr inbounds float* %tmp6776, i64 1
+ %tmp6778 = getelementptr inbounds float* %tmp6777, i64 1
+ %tmp6779 = getelementptr inbounds float* %tmp6778, i64 1
+ %tmp6780 = getelementptr inbounds float* %tmp6779, i64 1
+ %tmp6781 = getelementptr inbounds float* %tmp6780, i64 1
+ %tmp6782 = getelementptr inbounds float* %tmp6781, i64 1
+ %tmp6783 = getelementptr inbounds float* %tmp6782, i64 1
+ %tmp6784 = getelementptr inbounds float* %tmp6783, i64 1
+ %tmp6785 = getelementptr inbounds float* %tmp6784, i64 1
+ %tmp6786 = getelementptr inbounds float* %tmp6785, i64 1
+ %tmp6787 = getelementptr inbounds float* %tmp6786, i64 1
+ %tmp6788 = getelementptr inbounds float* %tmp6787, i64 1
+ %tmp6789 = getelementptr inbounds float* %tmp6788, i64 1
+ %tmp6790 = getelementptr inbounds float* %tmp6789, i64 1
+ %tmp6791 = getelementptr inbounds float* %tmp6790, i64 1
+ %tmp6792 = getelementptr inbounds float* %tmp6791, i64 1
+ %tmp6793 = getelementptr inbounds float* %tmp6792, i64 1
+ %tmp6794 = getelementptr inbounds float* %tmp6793, i64 1
+ %tmp6795 = getelementptr inbounds float* %tmp6794, i64 1
+ %tmp6796 = getelementptr inbounds float* %tmp6795, i64 1
+ %tmp6797 = getelementptr inbounds float* %tmp6796, i64 1
+ %tmp6798 = getelementptr inbounds float* %tmp6797, i64 1
+ %tmp6799 = getelementptr inbounds float* %tmp6798, i64 1
+ %tmp6800 = getelementptr inbounds float* %tmp6799, i64 1
+ %tmp6801 = getelementptr inbounds float* %tmp6800, i64 1
+ %tmp6802 = getelementptr inbounds float* %tmp6801, i64 1
+ %tmp6803 = getelementptr inbounds float* %tmp6802, i64 1
+ %tmp6804 = getelementptr inbounds float* %tmp6803, i64 1
+ %tmp6805 = getelementptr inbounds float* %tmp6804, i64 1
+ %tmp6806 = getelementptr inbounds float* %tmp6805, i64 1
+ %tmp6807 = getelementptr inbounds float* %tmp6806, i64 1
+ %tmp6808 = getelementptr inbounds float* %tmp6807, i64 1
+ %tmp6809 = getelementptr inbounds float* %tmp6808, i64 1
+ %tmp6810 = getelementptr inbounds float* %tmp6809, i64 1
+ %tmp6811 = getelementptr inbounds float* %tmp6810, i64 1
+ %tmp6812 = getelementptr inbounds float* %tmp6811, i64 1
+ %tmp6813 = getelementptr inbounds float* %tmp6812, i64 1
+ %tmp6814 = getelementptr inbounds float* %tmp6813, i64 1
+ %tmp6815 = getelementptr inbounds float* %tmp6814, i64 1
+ %tmp6816 = getelementptr inbounds float* %tmp6815, i64 1
+ %tmp6817 = getelementptr inbounds float* %tmp6816, i64 1
+ %tmp6818 = getelementptr inbounds float* %tmp6817, i64 1
+ %tmp6819 = getelementptr inbounds float* %tmp6818, i64 1
+ %tmp6820 = getelementptr inbounds float* %tmp6819, i64 1
+ %tmp6821 = getelementptr inbounds float* %tmp6820, i64 1
+ %tmp6822 = getelementptr inbounds float* %tmp6821, i64 1
+ %tmp6823 = getelementptr inbounds float* %tmp6822, i64 1
+ %tmp6824 = getelementptr inbounds float* %tmp6823, i64 1
+ %tmp6825 = getelementptr inbounds float* %tmp6824, i64 1
+ %tmp6826 = getelementptr inbounds float* %tmp6825, i64 1
+ %tmp6827 = getelementptr inbounds float* %tmp6826, i64 1
+ %tmp6828 = getelementptr inbounds float* %tmp6827, i64 1
+ %tmp6829 = getelementptr inbounds float* %tmp6828, i64 1
+ %tmp6830 = getelementptr inbounds float* %tmp6829, i64 1
+ %tmp6831 = getelementptr inbounds float* %tmp6830, i64 1
+ %tmp6832 = getelementptr inbounds float* %tmp6831, i64 1
+ %tmp6833 = getelementptr inbounds float* %tmp6832, i64 1
+ %tmp6834 = getelementptr inbounds float* %tmp6833, i64 1
+ %tmp6835 = getelementptr inbounds float* %tmp6834, i64 1
+ %tmp6836 = getelementptr inbounds float* %tmp6835, i64 1
+ %tmp6837 = getelementptr inbounds float* %tmp6836, i64 1
+ %tmp6838 = getelementptr inbounds float* %tmp6837, i64 1
+ %tmp6839 = getelementptr inbounds float* %tmp6838, i64 1
+ %tmp6840 = getelementptr inbounds float* %tmp6839, i64 1
+ %tmp6841 = getelementptr inbounds float* %tmp6840, i64 1
+ %tmp6842 = getelementptr inbounds float* %tmp6841, i64 1
+ %tmp6843 = getelementptr inbounds float* %tmp6842, i64 1
+ %tmp6844 = getelementptr inbounds float* %tmp6843, i64 1
+ %tmp6845 = getelementptr inbounds float* %tmp6844, i64 1
+ %tmp6846 = getelementptr inbounds float* %tmp6845, i64 1
+ %tmp6847 = getelementptr inbounds float* %tmp6846, i64 1
+ %tmp6848 = getelementptr inbounds float* %tmp6847, i64 1
+ %tmp6849 = getelementptr inbounds float* %tmp6848, i64 1
+ %tmp6850 = getelementptr inbounds float* %tmp6849, i64 1
+ %tmp6851 = getelementptr inbounds float* %tmp6850, i64 1
+ %tmp6852 = getelementptr inbounds float* %tmp6851, i64 1
+ %tmp6853 = getelementptr inbounds float* %tmp6852, i64 1
+ %tmp6854 = getelementptr inbounds float* %tmp6853, i64 1
+ %tmp6855 = getelementptr inbounds float* %tmp6854, i64 1
+ %tmp6856 = getelementptr inbounds float* %tmp6855, i64 1
+ %tmp6857 = getelementptr inbounds float* %tmp6856, i64 1
+ %tmp6858 = getelementptr inbounds float* %tmp6857, i64 1
+ %tmp6859 = getelementptr inbounds float* %tmp6858, i64 1
+ %tmp6860 = getelementptr inbounds float* %tmp6859, i64 1
+ %tmp6861 = getelementptr inbounds float* %tmp6860, i64 1
+ %tmp6862 = getelementptr inbounds float* %tmp6861, i64 1
+ %tmp6863 = getelementptr inbounds float* %tmp6862, i64 1
+ %tmp6864 = getelementptr inbounds float* %tmp6863, i64 1
+ %tmp6865 = getelementptr inbounds float* %tmp6864, i64 1
+ %tmp6866 = getelementptr inbounds float* %tmp6865, i64 1
+ %tmp6867 = getelementptr inbounds float* %tmp6866, i64 1
+ %tmp6868 = getelementptr inbounds float* %tmp6867, i64 1
+ %tmp6869 = getelementptr inbounds float* %tmp6868, i64 1
+ %tmp6870 = getelementptr inbounds float* %tmp6869, i64 1
+ %tmp6871 = getelementptr inbounds float* %tmp6870, i64 1
+ %tmp6872 = getelementptr inbounds float* %tmp6871, i64 1
+ %tmp6873 = getelementptr inbounds float* %tmp6872, i64 1
+ %tmp6874 = getelementptr inbounds float* %tmp6873, i64 1
+ %tmp6875 = getelementptr inbounds float* %tmp6874, i64 1
+ %tmp6876 = getelementptr inbounds float* %tmp6875, i64 1
+ %tmp6877 = getelementptr inbounds float* %tmp6876, i64 1
+ %tmp6878 = getelementptr inbounds float* %tmp6877, i64 1
+ %tmp6879 = getelementptr inbounds float* %tmp6878, i64 1
+ %tmp6880 = getelementptr inbounds float* %tmp6879, i64 1
+ %tmp6881 = getelementptr inbounds float* %tmp6880, i64 1
+ %tmp6882 = getelementptr inbounds float* %tmp6881, i64 1
+ %tmp6883 = getelementptr inbounds float* %tmp6882, i64 1
+ %tmp6884 = getelementptr inbounds float* %tmp6883, i64 1
+ %tmp6885 = getelementptr inbounds float* %tmp6884, i64 1
+ %tmp6886 = getelementptr inbounds float* %tmp6885, i64 1
+ %tmp6887 = getelementptr inbounds float* %tmp6886, i64 1
+ %tmp6888 = getelementptr inbounds float* %tmp6887, i64 1
+ %tmp6889 = getelementptr inbounds float* %tmp6888, i64 1
+ %tmp6890 = getelementptr inbounds float* %tmp6889, i64 1
+ %tmp6891 = getelementptr inbounds float* %tmp6890, i64 1
+ %tmp6892 = getelementptr inbounds float* %tmp6891, i64 1
+ %tmp6893 = getelementptr inbounds float* %tmp6892, i64 1
+ %tmp6894 = getelementptr inbounds float* %tmp6893, i64 1
+ %tmp6895 = getelementptr inbounds float* %tmp6894, i64 1
+ %tmp6896 = getelementptr inbounds float* %tmp6895, i64 1
+ %tmp6897 = getelementptr inbounds float* %tmp6896, i64 1
+ %tmp6898 = getelementptr inbounds float* %tmp6897, i64 1
+ %tmp6899 = getelementptr inbounds float* %tmp6898, i64 1
+ %tmp6900 = getelementptr inbounds float* %tmp6899, i64 1
+ %tmp6901 = getelementptr inbounds float* %tmp6900, i64 1
+ %tmp6902 = getelementptr inbounds float* %tmp6901, i64 1
+ %tmp6903 = getelementptr inbounds float* %tmp6902, i64 1
+ %tmp6904 = getelementptr inbounds float* %tmp6903, i64 1
+ %tmp6905 = getelementptr inbounds float* %tmp6904, i64 1
+ %tmp6906 = getelementptr inbounds float* %tmp6905, i64 1
+ %tmp6907 = getelementptr inbounds float* %tmp6906, i64 1
+ %tmp6908 = getelementptr inbounds float* %tmp6907, i64 1
+ %tmp6909 = getelementptr inbounds float* %tmp6908, i64 1
+ %tmp6910 = getelementptr inbounds float* %tmp6909, i64 1
+ %tmp6911 = getelementptr inbounds float* %tmp6910, i64 1
+ %tmp6912 = getelementptr inbounds float* %tmp6911, i64 1
+ %tmp6913 = getelementptr inbounds float* %tmp6912, i64 1
+ %tmp6914 = getelementptr inbounds float* %tmp6913, i64 1
+ %tmp6915 = getelementptr inbounds float* %tmp6914, i64 1
+ %tmp6916 = getelementptr inbounds float* %tmp6915, i64 1
+ %tmp6917 = getelementptr inbounds float* %tmp6916, i64 1
+ %tmp6918 = getelementptr inbounds float* %tmp6917, i64 1
+ %tmp6919 = getelementptr inbounds float* %tmp6918, i64 1
+ %tmp6920 = getelementptr inbounds float* %tmp6919, i64 1
+ %tmp6921 = getelementptr inbounds float* %tmp6920, i64 1
+ %tmp6922 = getelementptr inbounds float* %tmp6921, i64 1
+ %tmp6923 = getelementptr inbounds float* %tmp6922, i64 1
+ %tmp6924 = getelementptr inbounds float* %tmp6923, i64 1
+ %tmp6925 = getelementptr inbounds float* %tmp6924, i64 1
+ %tmp6926 = getelementptr inbounds float* %tmp6925, i64 1
+ %tmp6927 = getelementptr inbounds float* %tmp6926, i64 1
+ %tmp6928 = getelementptr inbounds float* %tmp6927, i64 1
+ %tmp6929 = getelementptr inbounds float* %tmp6928, i64 1
+ %tmp6930 = getelementptr inbounds float* %tmp6929, i64 1
+ %tmp6931 = getelementptr inbounds float* %tmp6930, i64 1
+ %tmp6932 = getelementptr inbounds float* %tmp6931, i64 1
+ %tmp6933 = getelementptr inbounds float* %tmp6932, i64 1
+ %tmp6934 = getelementptr inbounds float* %tmp6933, i64 1
+ %tmp6935 = getelementptr inbounds float* %tmp6934, i64 1
+ %tmp6936 = getelementptr inbounds float* %tmp6935, i64 1
+ %tmp6937 = getelementptr inbounds float* %tmp6936, i64 1
+ %tmp6938 = getelementptr inbounds float* %tmp6937, i64 1
+ %tmp6939 = getelementptr inbounds float* %tmp6938, i64 1
+ %tmp6940 = getelementptr inbounds float* %tmp6939, i64 1
+ %tmp6941 = getelementptr inbounds float* %tmp6940, i64 1
+ %tmp6942 = getelementptr inbounds float* %tmp6941, i64 1
+ %tmp6943 = getelementptr inbounds float* %tmp6942, i64 1
+ %tmp6944 = getelementptr inbounds float* %tmp6943, i64 1
+ %tmp6945 = getelementptr inbounds float* %tmp6944, i64 1
+ %tmp6946 = getelementptr inbounds float* %tmp6945, i64 1
+ %tmp6947 = getelementptr inbounds float* %tmp6946, i64 1
+ %tmp6948 = getelementptr inbounds float* %tmp6947, i64 1
+ %tmp6949 = getelementptr inbounds float* %tmp6948, i64 1
+ %tmp6950 = getelementptr inbounds float* %tmp6949, i64 1
+ %tmp6951 = getelementptr inbounds float* %tmp6950, i64 1
+ %tmp6952 = getelementptr inbounds float* %tmp6951, i64 1
+ %tmp6953 = getelementptr inbounds float* %tmp6952, i64 1
+ %tmp6954 = getelementptr inbounds float* %tmp6953, i64 1
+ %tmp6955 = getelementptr inbounds float* %tmp6954, i64 1
+ %tmp6956 = getelementptr inbounds float* %tmp6955, i64 1
+ %tmp6957 = getelementptr inbounds float* %tmp6956, i64 1
+ %tmp6958 = getelementptr inbounds float* %tmp6957, i64 1
+ %tmp6959 = getelementptr inbounds float* %tmp6958, i64 1
+ %tmp6960 = getelementptr inbounds float* %tmp6959, i64 1
+ %tmp6961 = getelementptr inbounds float* %tmp6960, i64 1
+ %tmp6962 = getelementptr inbounds float* %tmp6961, i64 1
+ %tmp6963 = getelementptr inbounds float* %tmp6962, i64 1
+ %tmp6964 = getelementptr inbounds float* %tmp6963, i64 1
+ %tmp6965 = getelementptr inbounds float* %tmp6964, i64 1
+ %tmp6966 = getelementptr inbounds float* %tmp6965, i64 1
+ %tmp6967 = getelementptr inbounds float* %tmp6966, i64 1
+ %tmp6968 = getelementptr inbounds float* %tmp6967, i64 1
+ %tmp6969 = getelementptr inbounds float* %tmp6968, i64 1
+ %tmp6970 = getelementptr inbounds float* %tmp6969, i64 1
+ %tmp6971 = getelementptr inbounds float* %tmp6970, i64 1
+ %tmp6972 = getelementptr inbounds float* %tmp6971, i64 1
+ %tmp6973 = getelementptr inbounds float* %tmp6972, i64 1
+ %tmp6974 = getelementptr inbounds float* %tmp6973, i64 1
+ %tmp6975 = getelementptr inbounds float* %tmp6974, i64 1
+ %tmp6976 = getelementptr inbounds float* %tmp6975, i64 1
+ %tmp6977 = getelementptr inbounds float* %tmp6976, i64 1
+ %tmp6978 = getelementptr inbounds float* %tmp6977, i64 1
+ %tmp6979 = getelementptr inbounds float* %tmp6978, i64 1
+ %tmp6980 = getelementptr inbounds float* %tmp6979, i64 1
+ %tmp6981 = getelementptr inbounds float* %tmp6980, i64 1
+ %tmp6982 = getelementptr inbounds float* %tmp6981, i64 1
+ %tmp6983 = getelementptr inbounds float* %tmp6982, i64 1
+ %tmp6984 = getelementptr inbounds float* %tmp6983, i64 1
+ %tmp6985 = getelementptr inbounds float* %tmp6984, i64 1
+ %tmp6986 = getelementptr inbounds float* %tmp6985, i64 1
+ %tmp6987 = getelementptr inbounds float* %tmp6986, i64 1
+ %tmp6988 = getelementptr inbounds float* %tmp6987, i64 1
+ %tmp6989 = getelementptr inbounds float* %tmp6988, i64 1
+ %tmp6990 = getelementptr inbounds float* %tmp6989, i64 1
+ %tmp6991 = getelementptr inbounds float* %tmp6990, i64 1
+ %tmp6992 = getelementptr inbounds float* %tmp6991, i64 1
+ %tmp6993 = getelementptr inbounds float* %tmp6992, i64 1
+ %tmp6994 = getelementptr inbounds float* %tmp6993, i64 1
+ %tmp6995 = getelementptr inbounds float* %tmp6994, i64 1
+ %tmp6996 = getelementptr inbounds float* %tmp6995, i64 1
+ %tmp6997 = getelementptr inbounds float* %tmp6996, i64 1
+ %tmp6998 = getelementptr inbounds float* %tmp6997, i64 1
+ %tmp6999 = getelementptr inbounds float* %tmp6998, i64 1
+ %tmp7000 = getelementptr inbounds float* %tmp6999, i64 1
+ %tmp7001 = getelementptr inbounds float* %tmp7000, i64 1
+ %tmp7002 = getelementptr inbounds float* %tmp7001, i64 1
+ %tmp7003 = getelementptr inbounds float* %tmp7002, i64 1
+ %tmp7004 = getelementptr inbounds float* %tmp7003, i64 1
+ %tmp7005 = getelementptr inbounds float* %tmp7004, i64 1
+ %tmp7006 = getelementptr inbounds float* %tmp7005, i64 1
+ %tmp7007 = getelementptr inbounds float* %tmp7006, i64 1
+ %tmp7008 = getelementptr inbounds float* %tmp7007, i64 1
+ %tmp7009 = getelementptr inbounds float* %tmp7008, i64 1
+ %tmp7010 = getelementptr inbounds float* %tmp7009, i64 1
+ %tmp7011 = getelementptr inbounds float* %tmp7010, i64 1
+ %tmp7012 = getelementptr inbounds float* %tmp7011, i64 1
+ %tmp7013 = getelementptr inbounds float* %tmp7012, i64 1
+ %tmp7014 = getelementptr inbounds float* %tmp7013, i64 1
+ %tmp7015 = getelementptr inbounds float* %tmp7014, i64 1
+ %tmp7016 = getelementptr inbounds float* %tmp7015, i64 1
+ %tmp7017 = getelementptr inbounds float* %tmp7016, i64 1
+ %tmp7018 = getelementptr inbounds float* %tmp7017, i64 1
+ %tmp7019 = getelementptr inbounds float* %tmp7018, i64 1
+ %tmp7020 = getelementptr inbounds float* %tmp7019, i64 1
+ %tmp7021 = getelementptr inbounds float* %tmp7020, i64 1
+ %tmp7022 = getelementptr inbounds float* %tmp7021, i64 1
+ %tmp7023 = getelementptr inbounds float* %tmp7022, i64 1
+ %tmp7024 = getelementptr inbounds float* %tmp7023, i64 1
+ %tmp7025 = getelementptr inbounds float* %tmp7024, i64 1
+ %tmp7026 = getelementptr inbounds float* %tmp7025, i64 1
+ %tmp7027 = getelementptr inbounds float* %tmp7026, i64 1
+ %tmp7028 = getelementptr inbounds float* %tmp7027, i64 1
+ %tmp7029 = getelementptr inbounds float* %tmp7028, i64 1
+ %tmp7030 = getelementptr inbounds float* %tmp7029, i64 1
+ %tmp7031 = getelementptr inbounds float* %tmp7030, i64 1
+ %tmp7032 = getelementptr inbounds float* %tmp7031, i64 1
+ %tmp7033 = getelementptr inbounds float* %tmp7032, i64 1
+ %tmp7034 = getelementptr inbounds float* %tmp7033, i64 1
+ %tmp7035 = getelementptr inbounds float* %tmp7034, i64 1
+ %tmp7036 = getelementptr inbounds float* %tmp7035, i64 1
+ %tmp7037 = getelementptr inbounds float* %tmp7036, i64 1
+ %tmp7038 = getelementptr inbounds float* %tmp7037, i64 1
+ %tmp7039 = getelementptr inbounds float* %tmp7038, i64 1
+ %tmp7040 = getelementptr inbounds float* %tmp7039, i64 1
+ %tmp7041 = getelementptr inbounds float* %tmp7040, i64 1
+ %tmp7042 = getelementptr inbounds float* %tmp7041, i64 1
+ %tmp7043 = getelementptr inbounds float* %tmp7042, i64 1
+ %tmp7044 = getelementptr inbounds float* %tmp7043, i64 1
+ %tmp7045 = getelementptr inbounds float* %tmp7044, i64 1
+ %tmp7046 = getelementptr inbounds float* %tmp7045, i64 1
+ %tmp7047 = getelementptr inbounds float* %tmp7046, i64 1
+ %tmp7048 = getelementptr inbounds float* %tmp7047, i64 1
+ %tmp7049 = getelementptr inbounds float* %tmp7048, i64 1
+ %tmp7050 = getelementptr inbounds float* %tmp7049, i64 1
+ %tmp7051 = getelementptr inbounds float* %tmp7050, i64 1
+ %tmp7052 = getelementptr inbounds float* %tmp7051, i64 1
+ %tmp7053 = getelementptr inbounds float* %tmp7052, i64 1
+ %tmp7054 = getelementptr inbounds float* %tmp7053, i64 1
+ %tmp7055 = getelementptr inbounds float* %tmp7054, i64 1
+ %tmp7056 = getelementptr inbounds float* %tmp7055, i64 1
+ %tmp7057 = getelementptr inbounds float* %tmp7056, i64 1
+ %tmp7058 = getelementptr inbounds float* %tmp7057, i64 1
+ %tmp7059 = getelementptr inbounds float* %tmp7058, i64 1
+ %tmp7060 = getelementptr inbounds float* %tmp7059, i64 1
+ %tmp7061 = getelementptr inbounds float* %tmp7060, i64 1
+ %tmp7062 = getelementptr inbounds float* %tmp7061, i64 1
+ %tmp7063 = getelementptr inbounds float* %tmp7062, i64 1
+ %tmp7064 = getelementptr inbounds float* %tmp7063, i64 1
+ %tmp7065 = getelementptr inbounds float* %tmp7064, i64 1
+ %tmp7066 = getelementptr inbounds float* %tmp7065, i64 1
+ %tmp7067 = getelementptr inbounds float* %tmp7066, i64 1
+ %tmp7068 = getelementptr inbounds float* %tmp7067, i64 1
+ %tmp7069 = getelementptr inbounds float* %tmp7068, i64 1
+ %tmp7070 = getelementptr inbounds float* %tmp7069, i64 1
+ %tmp7071 = getelementptr inbounds float* %tmp7070, i64 1
+ %tmp7072 = getelementptr inbounds float* %tmp7071, i64 1
+ %tmp7073 = getelementptr inbounds float* %tmp7072, i64 1
+ %tmp7074 = getelementptr inbounds float* %tmp7073, i64 1
+ %tmp7075 = getelementptr inbounds float* %tmp7074, i64 1
+ %tmp7076 = getelementptr inbounds float* %tmp7075, i64 1
+ %tmp7077 = getelementptr inbounds float* %tmp7076, i64 1
+ %tmp7078 = getelementptr inbounds float* %tmp7077, i64 1
+ %tmp7079 = getelementptr inbounds float* %tmp7078, i64 1
+ %tmp7080 = getelementptr inbounds float* %tmp7079, i64 1
+ %tmp7081 = getelementptr inbounds float* %tmp7080, i64 1
+ %tmp7082 = getelementptr inbounds float* %tmp7081, i64 1
+ %tmp7083 = getelementptr inbounds float* %tmp7082, i64 1
+ %tmp7084 = getelementptr inbounds float* %tmp7083, i64 1
+ %tmp7085 = getelementptr inbounds float* %tmp7084, i64 1
+ %tmp7086 = getelementptr inbounds float* %tmp7085, i64 1
+ %tmp7087 = getelementptr inbounds float* %tmp7086, i64 1
+ %tmp7088 = getelementptr inbounds float* %tmp7087, i64 1
+ %tmp7089 = getelementptr inbounds float* %tmp7088, i64 1
+ %tmp7090 = getelementptr inbounds float* %tmp7089, i64 1
+ %tmp7091 = getelementptr inbounds float* %tmp7090, i64 1
+ %tmp7092 = getelementptr inbounds float* %tmp7091, i64 1
+ %tmp7093 = getelementptr inbounds float* %tmp7092, i64 1
+ %tmp7094 = getelementptr inbounds float* %tmp7093, i64 1
+ %tmp7095 = getelementptr inbounds float* %tmp7094, i64 1
+ %tmp7096 = getelementptr inbounds float* %tmp7095, i64 1
+ %tmp7097 = getelementptr inbounds float* %tmp7096, i64 1
+ %tmp7098 = getelementptr inbounds float* %tmp7097, i64 1
+ %tmp7099 = getelementptr inbounds float* %tmp7098, i64 1
+ %tmp7100 = getelementptr inbounds float* %tmp7099, i64 1
+ %tmp7101 = getelementptr inbounds float* %tmp7100, i64 1
+ %tmp7102 = getelementptr inbounds float* %tmp7101, i64 1
+ %tmp7103 = getelementptr inbounds float* %tmp7102, i64 1
+ %tmp7104 = getelementptr inbounds float* %tmp7103, i64 1
+ %tmp7105 = getelementptr inbounds float* %tmp7104, i64 1
+ %tmp7106 = getelementptr inbounds float* %tmp7105, i64 1
+ %tmp7107 = getelementptr inbounds float* %tmp7106, i64 1
+ %tmp7108 = getelementptr inbounds float* %tmp7107, i64 1
+ %tmp7109 = getelementptr inbounds float* %tmp7108, i64 1
+ %tmp7110 = getelementptr inbounds float* %tmp7109, i64 1
+ %tmp7111 = getelementptr inbounds float* %tmp7110, i64 1
+ %tmp7112 = getelementptr inbounds float* %tmp7111, i64 1
+ %tmp7113 = getelementptr inbounds float* %tmp7112, i64 1
+ %tmp7114 = getelementptr inbounds float* %tmp7113, i64 1
+ %tmp7115 = getelementptr inbounds float* %tmp7114, i64 1
+ %tmp7116 = getelementptr inbounds float* %tmp7115, i64 1
+ %tmp7117 = getelementptr inbounds float* %tmp7116, i64 1
+ %tmp7118 = getelementptr inbounds float* %tmp7117, i64 1
+ %tmp7119 = getelementptr inbounds float* %tmp7118, i64 1
+ %tmp7120 = getelementptr inbounds float* %tmp7119, i64 1
+ %tmp7121 = getelementptr inbounds float* %tmp7120, i64 1
+ %tmp7122 = getelementptr inbounds float* %tmp7121, i64 1
+ %tmp7123 = getelementptr inbounds float* %tmp7122, i64 1
+ %tmp7124 = getelementptr inbounds float* %tmp7123, i64 1
+ %tmp7125 = getelementptr inbounds float* %tmp7124, i64 1
+ %tmp7126 = getelementptr inbounds float* %tmp7125, i64 1
+ %tmp7127 = getelementptr inbounds float* %tmp7126, i64 1
+ %tmp7128 = getelementptr inbounds float* %tmp7127, i64 1
+ %tmp7129 = getelementptr inbounds float* %tmp7128, i64 1
+ %tmp7130 = getelementptr inbounds float* %tmp7129, i64 1
+ %tmp7131 = getelementptr inbounds float* %tmp7130, i64 1
+ %tmp7132 = getelementptr inbounds float* %tmp7131, i64 1
+ %tmp7133 = getelementptr inbounds float* %tmp7132, i64 1
+ %tmp7134 = getelementptr inbounds float* %tmp7133, i64 1
+ %tmp7135 = getelementptr inbounds float* %tmp7134, i64 1
+ %tmp7136 = getelementptr inbounds float* %tmp7135, i64 1
+ %tmp7137 = getelementptr inbounds float* %tmp7136, i64 1
+ %tmp7138 = getelementptr inbounds float* %tmp7137, i64 1
+ %tmp7139 = getelementptr inbounds float* %tmp7138, i64 1
+ %tmp7140 = getelementptr inbounds float* %tmp7139, i64 1
+ %tmp7141 = getelementptr inbounds float* %tmp7140, i64 1
+ %tmp7142 = getelementptr inbounds float* %tmp7141, i64 1
+ %tmp7143 = getelementptr inbounds float* %tmp7142, i64 1
+ %tmp7144 = getelementptr inbounds float* %tmp7143, i64 1
+ %tmp7145 = getelementptr inbounds float* %tmp7144, i64 1
+ %tmp7146 = getelementptr inbounds float* %tmp7145, i64 1
+ %tmp7147 = getelementptr inbounds float* %tmp7146, i64 1
+ %tmp7148 = getelementptr inbounds float* %tmp7147, i64 1
+ %tmp7149 = getelementptr inbounds float* %tmp7148, i64 1
+ %tmp7150 = getelementptr inbounds float* %tmp7149, i64 1
+ %tmp7151 = getelementptr inbounds float* %tmp7150, i64 1
+ %tmp7152 = getelementptr inbounds float* %tmp7151, i64 1
+ %tmp7153 = getelementptr inbounds float* %tmp7152, i64 1
+ %tmp7154 = getelementptr inbounds float* %tmp7153, i64 1
+ %tmp7155 = getelementptr inbounds float* %tmp7154, i64 1
+ %tmp7156 = getelementptr inbounds float* %tmp7155, i64 1
+ %tmp7157 = getelementptr inbounds float* %tmp7156, i64 1
+ %tmp7158 = getelementptr inbounds float* %tmp7157, i64 1
+ %tmp7159 = getelementptr inbounds float* %tmp7158, i64 1
+ %tmp7160 = getelementptr inbounds float* %tmp7159, i64 1
+ %tmp7161 = getelementptr inbounds float* %tmp7160, i64 1
+ %tmp7162 = getelementptr inbounds float* %tmp7161, i64 1
+ %tmp7163 = getelementptr inbounds float* %tmp7162, i64 1
+ %tmp7164 = getelementptr inbounds float* %tmp7163, i64 1
+ %tmp7165 = getelementptr inbounds float* %tmp7164, i64 1
+ %tmp7166 = getelementptr inbounds float* %tmp7165, i64 1
+ %tmp7167 = getelementptr inbounds float* %tmp7166, i64 1
+ %tmp7168 = getelementptr inbounds float* %tmp7167, i64 1
+ %tmp7169 = getelementptr inbounds float* %tmp7168, i64 1
+ %tmp7170 = getelementptr inbounds float* %tmp7169, i64 1
+ %tmp7171 = getelementptr inbounds float* %tmp7170, i64 1
+ %tmp7172 = getelementptr inbounds float* %tmp7171, i64 1
+ %tmp7173 = getelementptr inbounds float* %tmp7172, i64 1
+ %tmp7174 = getelementptr inbounds float* %tmp7173, i64 1
+ %tmp7175 = getelementptr inbounds float* %tmp7174, i64 1
+ %tmp7176 = getelementptr inbounds float* %tmp7175, i64 1
+ %tmp7177 = getelementptr inbounds float* %tmp7176, i64 1
+ %tmp7178 = getelementptr inbounds float* %tmp7177, i64 1
+ %tmp7179 = getelementptr inbounds float* %tmp7178, i64 1
+ %tmp7180 = getelementptr inbounds float* %tmp7179, i64 1
+ %tmp7181 = getelementptr inbounds float* %tmp7180, i64 1
+ %tmp7182 = getelementptr inbounds float* %tmp7181, i64 1
+ %tmp7183 = getelementptr inbounds float* %tmp7182, i64 1
+ %tmp7184 = getelementptr inbounds float* %tmp7183, i64 1
+ %tmp7185 = getelementptr inbounds float* %tmp7184, i64 1
+ %tmp7186 = getelementptr inbounds float* %tmp7185, i64 1
+ %tmp7187 = getelementptr inbounds float* %tmp7186, i64 1
+ %tmp7188 = getelementptr inbounds float* %tmp7187, i64 1
+ %tmp7189 = getelementptr inbounds float* %tmp7188, i64 1
+ %tmp7190 = getelementptr inbounds float* %tmp7189, i64 1
+ %tmp7191 = getelementptr inbounds float* %tmp7190, i64 1
+ %tmp7192 = getelementptr inbounds float* %tmp7191, i64 1
+ %tmp7193 = getelementptr inbounds float* %tmp7192, i64 1
+ %tmp7194 = getelementptr inbounds float* %tmp7193, i64 1
+ %tmp7195 = getelementptr inbounds float* %tmp7194, i64 1
+ %tmp7196 = getelementptr inbounds float* %tmp7195, i64 1
+ %tmp7197 = getelementptr inbounds float* %tmp7196, i64 1
+ %tmp7198 = getelementptr inbounds float* %tmp7197, i64 1
+ %tmp7199 = getelementptr inbounds float* %tmp7198, i64 1
+ %tmp7200 = getelementptr inbounds float* %tmp7199, i64 1
+ %tmp7201 = getelementptr inbounds float* %tmp7200, i64 1
+ %tmp7202 = getelementptr inbounds float* %tmp7201, i64 1
+ %tmp7203 = getelementptr inbounds float* %tmp7202, i64 1
+ %tmp7204 = getelementptr inbounds float* %tmp7203, i64 1
+ %tmp7205 = getelementptr inbounds float* %tmp7204, i64 1
+ %tmp7206 = getelementptr inbounds float* %tmp7205, i64 1
+ %tmp7207 = getelementptr inbounds float* %tmp7206, i64 1
+ %tmp7208 = getelementptr inbounds float* %tmp7207, i64 1
+ %tmp7209 = getelementptr inbounds float* %tmp7208, i64 1
+ %tmp7210 = getelementptr inbounds float* %tmp7209, i64 1
+ %tmp7211 = getelementptr inbounds float* %tmp7210, i64 1
+ %tmp7212 = getelementptr inbounds float* %tmp7211, i64 1
+ %tmp7213 = getelementptr inbounds float* %tmp7212, i64 1
+ %tmp7214 = getelementptr inbounds float* %tmp7213, i64 1
+ %tmp7215 = getelementptr inbounds float* %tmp7214, i64 1
+ %tmp7216 = getelementptr inbounds float* %tmp7215, i64 1
+ %tmp7217 = getelementptr inbounds float* %tmp7216, i64 1
+ %tmp7218 = getelementptr inbounds float* %tmp7217, i64 1
+ %tmp7219 = getelementptr inbounds float* %tmp7218, i64 1
+ %tmp7220 = getelementptr inbounds float* %tmp7219, i64 1
+ %tmp7221 = getelementptr inbounds float* %tmp7220, i64 1
+ %tmp7222 = getelementptr inbounds float* %tmp7221, i64 1
+ %tmp7223 = getelementptr inbounds float* %tmp7222, i64 1
+ %tmp7224 = getelementptr inbounds float* %tmp7223, i64 1
+ %tmp7225 = getelementptr inbounds float* %tmp7224, i64 1
+ %tmp7226 = getelementptr inbounds float* %tmp7225, i64 1
+ %tmp7227 = getelementptr inbounds float* %tmp7226, i64 1
+ %tmp7228 = getelementptr inbounds float* %tmp7227, i64 1
+ %tmp7229 = getelementptr inbounds float* %tmp7228, i64 1
+ %tmp7230 = getelementptr inbounds float* %tmp7229, i64 1
+ %tmp7231 = getelementptr inbounds float* %tmp7230, i64 1
+ %tmp7232 = getelementptr inbounds float* %tmp7231, i64 1
+ %tmp7233 = getelementptr inbounds float* %tmp7232, i64 1
+ %tmp7234 = getelementptr inbounds float* %tmp7233, i64 1
+ %tmp7235 = getelementptr inbounds float* %tmp7234, i64 1
+ %tmp7236 = getelementptr inbounds float* %tmp7235, i64 1
+ %tmp7237 = getelementptr inbounds float* %tmp7236, i64 1
+ %tmp7238 = getelementptr inbounds float* %tmp7237, i64 1
+ %tmp7239 = getelementptr inbounds float* %tmp7238, i64 1
+ %tmp7240 = getelementptr inbounds float* %tmp7239, i64 1
+ %tmp7241 = getelementptr inbounds float* %tmp7240, i64 1
+ %tmp7242 = getelementptr inbounds float* %tmp7241, i64 1
+ %tmp7243 = getelementptr inbounds float* %tmp7242, i64 1
+ %tmp7244 = getelementptr inbounds float* %tmp7243, i64 1
+ %tmp7245 = getelementptr inbounds float* %tmp7244, i64 1
+ %tmp7246 = getelementptr inbounds float* %tmp7245, i64 1
+ %tmp7247 = getelementptr inbounds float* %tmp7246, i64 1
+ %tmp7248 = getelementptr inbounds float* %tmp7247, i64 1
+ %tmp7249 = getelementptr inbounds float* %tmp7248, i64 1
+ %tmp7250 = getelementptr inbounds float* %tmp7249, i64 1
+ %tmp7251 = getelementptr inbounds float* %tmp7250, i64 1
+ %tmp7252 = getelementptr inbounds float* %tmp7251, i64 1
+ %tmp7253 = getelementptr inbounds float* %tmp7252, i64 1
+ %tmp7254 = getelementptr inbounds float* %tmp7253, i64 1
+ %tmp7255 = getelementptr inbounds float* %tmp7254, i64 1
+ %tmp7256 = getelementptr inbounds float* %tmp7255, i64 1
+ %tmp7257 = getelementptr inbounds float* %tmp7256, i64 1
+ %tmp7258 = getelementptr inbounds float* %tmp7257, i64 1
+ %tmp7259 = getelementptr inbounds float* %tmp7258, i64 1
+ %tmp7260 = getelementptr inbounds float* %tmp7259, i64 1
+ %tmp7261 = getelementptr inbounds float* %tmp7260, i64 1
+ %tmp7262 = getelementptr inbounds float* %tmp7261, i64 1
+ %tmp7263 = getelementptr inbounds float* %tmp7262, i64 1
+ %tmp7264 = getelementptr inbounds float* %tmp7263, i64 1
+ %tmp7265 = getelementptr inbounds float* %tmp7264, i64 1
+ %tmp7266 = getelementptr inbounds float* %tmp7265, i64 1
+ %tmp7267 = getelementptr inbounds float* %tmp7266, i64 1
+ %tmp7268 = getelementptr inbounds float* %tmp7267, i64 1
+ %tmp7269 = getelementptr inbounds float* %tmp7268, i64 1
+ %tmp7270 = getelementptr inbounds float* %tmp7269, i64 1
+ %tmp7271 = getelementptr inbounds float* %tmp7270, i64 1
+ %tmp7272 = getelementptr inbounds float* %tmp7271, i64 1
+ %tmp7273 = getelementptr inbounds float* %tmp7272, i64 1
+ %tmp7274 = getelementptr inbounds float* %tmp7273, i64 1
+ %tmp7275 = getelementptr inbounds float* %tmp7274, i64 1
+ %tmp7276 = getelementptr inbounds float* %tmp7275, i64 1
+ %tmp7277 = getelementptr inbounds float* %tmp7276, i64 1
+ %tmp7278 = getelementptr inbounds float* %tmp7277, i64 1
+ %tmp7279 = getelementptr inbounds float* %tmp7278, i64 1
+ %tmp7280 = getelementptr inbounds float* %tmp7279, i64 1
+ %tmp7281 = getelementptr inbounds float* %tmp7280, i64 1
+ %tmp7282 = getelementptr inbounds float* %tmp7281, i64 1
+ %tmp7283 = getelementptr inbounds float* %tmp7282, i64 1
+ %tmp7284 = getelementptr inbounds float* %tmp7283, i64 1
+ %tmp7285 = getelementptr inbounds float* %tmp7284, i64 1
+ %tmp7286 = getelementptr inbounds float* %tmp7285, i64 1
+ %tmp7287 = getelementptr inbounds float* %tmp7286, i64 1
+ %tmp7288 = getelementptr inbounds float* %tmp7287, i64 1
+ %tmp7289 = getelementptr inbounds float* %tmp7288, i64 1
+ %tmp7290 = getelementptr inbounds float* %tmp7289, i64 1
+ %tmp7291 = getelementptr inbounds float* %tmp7290, i64 1
+ %tmp7292 = getelementptr inbounds float* %tmp7291, i64 1
+ %tmp7293 = getelementptr inbounds float* %tmp7292, i64 1
+ %tmp7294 = getelementptr inbounds float* %tmp7293, i64 1
+ %tmp7295 = getelementptr inbounds float* %tmp7294, i64 1
+ %tmp7296 = getelementptr inbounds float* %tmp7295, i64 1
+ %tmp7297 = getelementptr inbounds float* %tmp7296, i64 1
+ %tmp7298 = getelementptr inbounds float* %tmp7297, i64 1
+ %tmp7299 = getelementptr inbounds float* %tmp7298, i64 1
+ %tmp7300 = getelementptr inbounds float* %tmp7299, i64 1
+ %tmp7301 = getelementptr inbounds float* %tmp7300, i64 1
+ %tmp7302 = getelementptr inbounds float* %tmp7301, i64 1
+ %tmp7303 = getelementptr inbounds float* %tmp7302, i64 1
+ %tmp7304 = getelementptr inbounds float* %tmp7303, i64 1
+ %tmp7305 = getelementptr inbounds float* %tmp7304, i64 1
+ %tmp7306 = getelementptr inbounds float* %tmp7305, i64 1
+ %tmp7307 = getelementptr inbounds float* %tmp7306, i64 1
+ %tmp7308 = getelementptr inbounds float* %tmp7307, i64 1
+ %tmp7309 = getelementptr inbounds float* %tmp7308, i64 1
+ %tmp7310 = getelementptr inbounds float* %tmp7309, i64 1
+ %tmp7311 = getelementptr inbounds float* %tmp7310, i64 1
+ %tmp7312 = getelementptr inbounds float* %tmp7311, i64 1
+ %tmp7313 = getelementptr inbounds float* %tmp7312, i64 1
+ %tmp7314 = getelementptr inbounds float* %tmp7313, i64 1
+ %tmp7315 = getelementptr inbounds float* %tmp7314, i64 1
+ %tmp7316 = getelementptr inbounds float* %tmp7315, i64 1
+ %tmp7317 = getelementptr inbounds float* %tmp7316, i64 1
+ %tmp7318 = getelementptr inbounds float* %tmp7317, i64 1
+ %tmp7319 = getelementptr inbounds float* %tmp7318, i64 1
+ %tmp7320 = getelementptr inbounds float* %tmp7319, i64 1
+ %tmp7321 = getelementptr inbounds float* %tmp7320, i64 1
+ %tmp7322 = getelementptr inbounds float* %tmp7321, i64 1
+ %tmp7323 = getelementptr inbounds float* %tmp7322, i64 1
+ %tmp7324 = getelementptr inbounds float* %tmp7323, i64 1
+ %tmp7325 = getelementptr inbounds float* %tmp7324, i64 1
+ %tmp7326 = getelementptr inbounds float* %tmp7325, i64 1
+ %tmp7327 = getelementptr inbounds float* %tmp7326, i64 1
+ %tmp7328 = getelementptr inbounds float* %tmp7327, i64 1
+ %tmp7329 = getelementptr inbounds float* %tmp7328, i64 1
+ %tmp7330 = getelementptr inbounds float* %tmp7329, i64 1
+ %tmp7331 = getelementptr inbounds float* %tmp7330, i64 1
+ %tmp7332 = getelementptr inbounds float* %tmp7331, i64 1
+ %tmp7333 = getelementptr inbounds float* %tmp7332, i64 1
+ %tmp7334 = getelementptr inbounds float* %tmp7333, i64 1
+ %tmp7335 = getelementptr inbounds float* %tmp7334, i64 1
+ %tmp7336 = getelementptr inbounds float* %tmp7335, i64 1
+ %tmp7337 = getelementptr inbounds float* %tmp7336, i64 1
+ %tmp7338 = getelementptr inbounds float* %tmp7337, i64 1
+ %tmp7339 = getelementptr inbounds float* %tmp7338, i64 1
+ %tmp7340 = getelementptr inbounds float* %tmp7339, i64 1
+ %tmp7341 = getelementptr inbounds float* %tmp7340, i64 1
+ %tmp7342 = getelementptr inbounds float* %tmp7341, i64 1
+ %tmp7343 = getelementptr inbounds float* %tmp7342, i64 1
+ %tmp7344 = getelementptr inbounds float* %tmp7343, i64 1
+ %tmp7345 = getelementptr inbounds float* %tmp7344, i64 1
+ %tmp7346 = getelementptr inbounds float* %tmp7345, i64 1
+ %tmp7347 = getelementptr inbounds float* %tmp7346, i64 1
+ %tmp7348 = getelementptr inbounds float* %tmp7347, i64 1
+ %tmp7349 = getelementptr inbounds float* %tmp7348, i64 1
+ %tmp7350 = getelementptr inbounds float* %tmp7349, i64 1
+ %tmp7351 = getelementptr inbounds float* %tmp7350, i64 1
+ %tmp7352 = getelementptr inbounds float* %tmp7351, i64 1
+ %tmp7353 = getelementptr inbounds float* %tmp7352, i64 1
+ %tmp7354 = getelementptr inbounds float* %tmp7353, i64 1
+ %tmp7355 = getelementptr inbounds float* %tmp7354, i64 1
+ %tmp7356 = getelementptr inbounds float* %tmp7355, i64 1
+ %tmp7357 = getelementptr inbounds float* %tmp7356, i64 1
+ %tmp7358 = getelementptr inbounds float* %tmp7357, i64 1
+ %tmp7359 = getelementptr inbounds float* %tmp7358, i64 1
+ %tmp7360 = getelementptr inbounds float* %tmp7359, i64 1
+ %tmp7361 = getelementptr inbounds float* %tmp7360, i64 1
+ %tmp7362 = getelementptr inbounds float* %tmp7361, i64 1
+ %tmp7363 = getelementptr inbounds float* %tmp7362, i64 1
+ %tmp7364 = getelementptr inbounds float* %tmp7363, i64 1
+ %tmp7365 = getelementptr inbounds float* %tmp7364, i64 1
+ %tmp7366 = getelementptr inbounds float* %tmp7365, i64 1
+ %tmp7367 = getelementptr inbounds float* %tmp7366, i64 1
+ %tmp7368 = getelementptr inbounds float* %tmp7367, i64 1
+ %tmp7369 = getelementptr inbounds float* %tmp7368, i64 1
+ %tmp7370 = getelementptr inbounds float* %tmp7369, i64 1
+ %tmp7371 = getelementptr inbounds float* %tmp7370, i64 1
+ %tmp7372 = getelementptr inbounds float* %tmp7371, i64 1
+ %tmp7373 = getelementptr inbounds float* %tmp7372, i64 1
+ %tmp7374 = getelementptr inbounds float* %tmp7373, i64 1
+ %tmp7375 = getelementptr inbounds float* %tmp7374, i64 1
+ %tmp7376 = getelementptr inbounds float* %tmp7375, i64 1
+ %tmp7377 = getelementptr inbounds float* %tmp7376, i64 1
+ %tmp7378 = getelementptr inbounds float* %tmp7377, i64 1
+ %tmp7379 = getelementptr inbounds float* %tmp7378, i64 1
+ %tmp7380 = getelementptr inbounds float* %tmp7379, i64 1
+ %tmp7381 = getelementptr inbounds float* %tmp7380, i64 1
+ %tmp7382 = getelementptr inbounds float* %tmp7381, i64 1
+ %tmp7383 = getelementptr inbounds float* %tmp7382, i64 1
+ %tmp7384 = getelementptr inbounds float* %tmp7383, i64 1
+ %tmp7385 = getelementptr inbounds float* %tmp7384, i64 1
+ %tmp7386 = getelementptr inbounds float* %tmp7385, i64 1
+ %tmp7387 = getelementptr inbounds float* %tmp7386, i64 1
+ %tmp7388 = getelementptr inbounds float* %tmp7387, i64 1
+ %tmp7389 = getelementptr inbounds float* %tmp7388, i64 1
+ %tmp7390 = getelementptr inbounds float* %tmp7389, i64 1
+ %tmp7391 = getelementptr inbounds float* %tmp7390, i64 1
+ %tmp7392 = getelementptr inbounds float* %tmp7391, i64 1
+ %tmp7393 = getelementptr inbounds float* %tmp7392, i64 1
+ %tmp7394 = getelementptr inbounds float* %tmp7393, i64 1
+ %tmp7395 = getelementptr inbounds float* %tmp7394, i64 1
+ %tmp7396 = getelementptr inbounds float* %tmp7395, i64 1
+ %tmp7397 = getelementptr inbounds float* %tmp7396, i64 1
+ %tmp7398 = getelementptr inbounds float* %tmp7397, i64 1
+ %tmp7399 = getelementptr inbounds float* %tmp7398, i64 1
+ %tmp7400 = getelementptr inbounds float* %tmp7399, i64 1
+ %tmp7401 = getelementptr inbounds float* %tmp7400, i64 1
+ %tmp7402 = getelementptr inbounds float* %tmp7401, i64 1
+ %tmp7403 = getelementptr inbounds float* %tmp7402, i64 1
+ %tmp7404 = getelementptr inbounds float* %tmp7403, i64 1
+ %tmp7405 = getelementptr inbounds float* %tmp7404, i64 1
+ %tmp7406 = getelementptr inbounds float* %tmp7405, i64 1
+ %tmp7407 = getelementptr inbounds float* %tmp7406, i64 1
+ %tmp7408 = getelementptr inbounds float* %tmp7407, i64 1
+ %tmp7409 = getelementptr inbounds float* %tmp7408, i64 1
+ %tmp7410 = getelementptr inbounds float* %tmp7409, i64 1
+ %tmp7411 = getelementptr inbounds float* %tmp7410, i64 1
+ %tmp7412 = getelementptr inbounds float* %tmp7411, i64 1
+ %tmp7413 = getelementptr inbounds float* %tmp7412, i64 1
+ %tmp7414 = getelementptr inbounds float* %tmp7413, i64 1
+ %tmp7415 = getelementptr inbounds float* %tmp7414, i64 1
+ %tmp7416 = getelementptr inbounds float* %tmp7415, i64 1
+ %tmp7417 = getelementptr inbounds float* %tmp7416, i64 1
+ %tmp7418 = getelementptr inbounds float* %tmp7417, i64 1
+ %tmp7419 = getelementptr inbounds float* %tmp7418, i64 1
+ %tmp7420 = getelementptr inbounds float* %tmp7419, i64 1
+ %tmp7421 = getelementptr inbounds float* %tmp7420, i64 1
+ %tmp7422 = getelementptr inbounds float* %tmp7421, i64 1
+ %tmp7423 = getelementptr inbounds float* %tmp7422, i64 1
+ %tmp7424 = getelementptr inbounds float* %tmp7423, i64 1
+ %tmp7425 = getelementptr inbounds float* %tmp7424, i64 1
+ %tmp7426 = getelementptr inbounds float* %tmp7425, i64 1
+ %tmp7427 = getelementptr inbounds float* %tmp7426, i64 1
+ %tmp7428 = getelementptr inbounds float* %tmp7427, i64 1
+ %tmp7429 = getelementptr inbounds float* %tmp7428, i64 1
+ %tmp7430 = getelementptr inbounds float* %tmp7429, i64 1
+ %tmp7431 = getelementptr inbounds float* %tmp7430, i64 1
+ %tmp7432 = getelementptr inbounds float* %tmp7431, i64 1
+ %tmp7433 = getelementptr inbounds float* %tmp7432, i64 1
+ %tmp7434 = getelementptr inbounds float* %tmp7433, i64 1
+ %tmp7435 = getelementptr inbounds float* %tmp7434, i64 1
+ %tmp7436 = getelementptr inbounds float* %tmp7435, i64 1
+ %tmp7437 = getelementptr inbounds float* %tmp7436, i64 1
+ %tmp7438 = getelementptr inbounds float* %tmp7437, i64 1
+ %tmp7439 = getelementptr inbounds float* %tmp7438, i64 1
+ %tmp7440 = getelementptr inbounds float* %tmp7439, i64 1
+ %tmp7441 = getelementptr inbounds float* %tmp7440, i64 1
+ %tmp7442 = getelementptr inbounds float* %tmp7441, i64 1
+ %tmp7443 = getelementptr inbounds float* %tmp7442, i64 1
+ %tmp7444 = getelementptr inbounds float* %tmp7443, i64 1
+ %tmp7445 = getelementptr inbounds float* %tmp7444, i64 1
+ %tmp7446 = getelementptr inbounds float* %tmp7445, i64 1
+ %tmp7447 = getelementptr inbounds float* %tmp7446, i64 1
+ %tmp7448 = getelementptr inbounds float* %tmp7447, i64 1
+ %tmp7449 = getelementptr inbounds float* %tmp7448, i64 1
+ %tmp7450 = getelementptr inbounds float* %tmp7449, i64 1
+ %tmp7451 = getelementptr inbounds float* %tmp7450, i64 1
+ %tmp7452 = getelementptr inbounds float* %tmp7451, i64 1
+ %tmp7453 = getelementptr inbounds float* %tmp7452, i64 1
+ %tmp7454 = getelementptr inbounds float* %tmp7453, i64 1
+ %tmp7455 = getelementptr inbounds float* %tmp7454, i64 1
+ %tmp7456 = getelementptr inbounds float* %tmp7455, i64 1
+ %tmp7457 = getelementptr inbounds float* %tmp7456, i64 1
+ %tmp7458 = getelementptr inbounds float* %tmp7457, i64 1
+ %tmp7459 = getelementptr inbounds float* %tmp7458, i64 1
+ %tmp7460 = getelementptr inbounds float* %tmp7459, i64 1
+ %tmp7461 = getelementptr inbounds float* %tmp7460, i64 1
+ %tmp7462 = getelementptr inbounds float* %tmp7461, i64 1
+ %tmp7463 = getelementptr inbounds float* %tmp7462, i64 1
+ %tmp7464 = getelementptr inbounds float* %tmp7463, i64 1
+ %tmp7465 = getelementptr inbounds float* %tmp7464, i64 1
+ %tmp7466 = getelementptr inbounds float* %tmp7465, i64 1
+ %tmp7467 = getelementptr inbounds float* %tmp7466, i64 1
+ %tmp7468 = getelementptr inbounds float* %tmp7467, i64 1
+ %tmp7469 = getelementptr inbounds float* %tmp7468, i64 1
+ %tmp7470 = getelementptr inbounds float* %tmp7469, i64 1
+ %tmp7471 = getelementptr inbounds float* %tmp7470, i64 1
+ %tmp7472 = getelementptr inbounds float* %tmp7471, i64 1
+ %tmp7473 = getelementptr inbounds float* %tmp7472, i64 1
+ %tmp7474 = getelementptr inbounds float* %tmp7473, i64 1
+ %tmp7475 = getelementptr inbounds float* %tmp7474, i64 1
+ %tmp7476 = getelementptr inbounds float* %tmp7475, i64 1
+ %tmp7477 = getelementptr inbounds float* %tmp7476, i64 1
+ %tmp7478 = getelementptr inbounds float* %tmp7477, i64 1
+ %tmp7479 = getelementptr inbounds float* %tmp7478, i64 1
+ %tmp7480 = getelementptr inbounds float* %tmp7479, i64 1
+ %tmp7481 = getelementptr inbounds float* %tmp7480, i64 1
+ %tmp7482 = getelementptr inbounds float* %tmp7481, i64 1
+ %tmp7483 = getelementptr inbounds float* %tmp7482, i64 1
+ %tmp7484 = getelementptr inbounds float* %tmp7483, i64 1
+ %tmp7485 = getelementptr inbounds float* %tmp7484, i64 1
+ %tmp7486 = getelementptr inbounds float* %tmp7485, i64 1
+ %tmp7487 = getelementptr inbounds float* %tmp7486, i64 1
+ %tmp7488 = getelementptr inbounds float* %tmp7487, i64 1
+ %tmp7489 = getelementptr inbounds float* %tmp7488, i64 1
+ %tmp7490 = getelementptr inbounds float* %tmp7489, i64 1
+ %tmp7491 = getelementptr inbounds float* %tmp7490, i64 1
+ %tmp7492 = getelementptr inbounds float* %tmp7491, i64 1
+ %tmp7493 = getelementptr inbounds float* %tmp7492, i64 1
+ %tmp7494 = getelementptr inbounds float* %tmp7493, i64 1
+ %tmp7495 = getelementptr inbounds float* %tmp7494, i64 1
+ %tmp7496 = getelementptr inbounds float* %tmp7495, i64 1
+ %tmp7497 = getelementptr inbounds float* %tmp7496, i64 1
+ %tmp7498 = getelementptr inbounds float* %tmp7497, i64 1
+ %tmp7499 = getelementptr inbounds float* %tmp7498, i64 1
+ %tmp7500 = getelementptr inbounds float* %tmp7499, i64 1
+ %tmp7501 = getelementptr inbounds float* %tmp7500, i64 1
+ %tmp7502 = getelementptr inbounds float* %tmp7501, i64 1
+ %tmp7503 = getelementptr inbounds float* %tmp7502, i64 1
+ %tmp7504 = getelementptr inbounds float* %tmp7503, i64 1
+ %tmp7505 = getelementptr inbounds float* %tmp7504, i64 1
+ %tmp7506 = getelementptr inbounds float* %tmp7505, i64 1
+ %tmp7507 = getelementptr inbounds float* %tmp7506, i64 1
+ %tmp7508 = getelementptr inbounds float* %tmp7507, i64 1
+ %tmp7509 = getelementptr inbounds float* %tmp7508, i64 1
+ %tmp7510 = getelementptr inbounds float* %tmp7509, i64 1
+ %tmp7511 = getelementptr inbounds float* %tmp7510, i64 1
+ %tmp7512 = getelementptr inbounds float* %tmp7511, i64 1
+ %tmp7513 = getelementptr inbounds float* %tmp7512, i64 1
+ %tmp7514 = getelementptr inbounds float* %tmp7513, i64 1
+ %tmp7515 = getelementptr inbounds float* %tmp7514, i64 1
+ %tmp7516 = getelementptr inbounds float* %tmp7515, i64 1
+ %tmp7517 = getelementptr inbounds float* %tmp7516, i64 1
+ %tmp7518 = getelementptr inbounds float* %tmp7517, i64 1
+ %tmp7519 = getelementptr inbounds float* %tmp7518, i64 1
+ %tmp7520 = getelementptr inbounds float* %tmp7519, i64 1
+ %tmp7521 = getelementptr inbounds float* %tmp7520, i64 1
+ %tmp7522 = getelementptr inbounds float* %tmp7521, i64 1
+ %tmp7523 = getelementptr inbounds float* %tmp7522, i64 1
+ %tmp7524 = getelementptr inbounds float* %tmp7523, i64 1
+ %tmp7525 = getelementptr inbounds float* %tmp7524, i64 1
+ %tmp7526 = getelementptr inbounds float* %tmp7525, i64 1
+ %tmp7527 = getelementptr inbounds float* %tmp7526, i64 1
+ %tmp7528 = getelementptr inbounds float* %tmp7527, i64 1
+ %tmp7529 = getelementptr inbounds float* %tmp7528, i64 1
+ %tmp7530 = getelementptr inbounds float* %tmp7529, i64 1
+ %tmp7531 = getelementptr inbounds float* %tmp7530, i64 1
+ %tmp7532 = getelementptr inbounds float* %tmp7531, i64 1
+ %tmp7533 = getelementptr inbounds float* %tmp7532, i64 1
+ %tmp7534 = getelementptr inbounds float* %tmp7533, i64 1
+ %tmp7535 = getelementptr inbounds float* %tmp7534, i64 1
+ %tmp7536 = getelementptr inbounds float* %tmp7535, i64 1
+ %tmp7537 = getelementptr inbounds float* %tmp7536, i64 1
+ %tmp7538 = getelementptr inbounds float* %tmp7537, i64 1
+ %tmp7539 = getelementptr inbounds float* %tmp7538, i64 1
+ %tmp7540 = getelementptr inbounds float* %tmp7539, i64 1
+ %tmp7541 = getelementptr inbounds float* %tmp7540, i64 1
+ %tmp7542 = getelementptr inbounds float* %tmp7541, i64 1
+ %tmp7543 = getelementptr inbounds float* %tmp7542, i64 1
+ %tmp7544 = getelementptr inbounds float* %tmp7543, i64 1
+ %tmp7545 = getelementptr inbounds float* %tmp7544, i64 1
+ %tmp7546 = getelementptr inbounds float* %tmp7545, i64 1
+ %tmp7547 = getelementptr inbounds float* %tmp7546, i64 1
+ %tmp7548 = getelementptr inbounds float* %tmp7547, i64 1
+ %tmp7549 = getelementptr inbounds float* %tmp7548, i64 1
+ %tmp7550 = getelementptr inbounds float* %tmp7549, i64 1
+ %tmp7551 = getelementptr inbounds float* %tmp7550, i64 1
+ %tmp7552 = getelementptr inbounds float* %tmp7551, i64 1
+ %tmp7553 = getelementptr inbounds float* %tmp7552, i64 1
+ %tmp7554 = getelementptr inbounds float* %tmp7553, i64 1
+ %tmp7555 = getelementptr inbounds float* %tmp7554, i64 1
+ %tmp7556 = getelementptr inbounds float* %tmp7555, i64 1
+ %tmp7557 = getelementptr inbounds float* %tmp7556, i64 1
+ %tmp7558 = getelementptr inbounds float* %tmp7557, i64 1
+ %tmp7559 = getelementptr inbounds float* %tmp7558, i64 1
+ %tmp7560 = getelementptr inbounds float* %tmp7559, i64 1
+ %tmp7561 = getelementptr inbounds float* %tmp7560, i64 1
+ %tmp7562 = getelementptr inbounds float* %tmp7561, i64 1
+ %tmp7563 = getelementptr inbounds float* %tmp7562, i64 1
+ %tmp7564 = getelementptr inbounds float* %tmp7563, i64 1
+ %tmp7565 = getelementptr inbounds float* %tmp7564, i64 1
+ %tmp7566 = getelementptr inbounds float* %tmp7565, i64 1
+ %tmp7567 = getelementptr inbounds float* %tmp7566, i64 1
+ %tmp7568 = getelementptr inbounds float* %tmp7567, i64 1
+ %tmp7569 = getelementptr inbounds float* %tmp7568, i64 1
+ %tmp7570 = getelementptr inbounds float* %tmp7569, i64 1
+ %tmp7571 = getelementptr inbounds float* %tmp7570, i64 1
+ %tmp7572 = getelementptr inbounds float* %tmp7571, i64 1
+ %tmp7573 = getelementptr inbounds float* %tmp7572, i64 1
+ %tmp7574 = getelementptr inbounds float* %tmp7573, i64 1
+ %tmp7575 = getelementptr inbounds float* %tmp7574, i64 1
+ %tmp7576 = getelementptr inbounds float* %tmp7575, i64 1
+ %tmp7577 = getelementptr inbounds float* %tmp7576, i64 1
+ %tmp7578 = getelementptr inbounds float* %tmp7577, i64 1
+ %tmp7579 = getelementptr inbounds float* %tmp7578, i64 1
+ %tmp7580 = getelementptr inbounds float* %tmp7579, i64 1
+ %tmp7581 = getelementptr inbounds float* %tmp7580, i64 1
+ %tmp7582 = getelementptr inbounds float* %tmp7581, i64 1
+ %tmp7583 = getelementptr inbounds float* %tmp7582, i64 1
+ %tmp7584 = getelementptr inbounds float* %tmp7583, i64 1
+ %tmp7585 = getelementptr inbounds float* %tmp7584, i64 1
+ %tmp7586 = getelementptr inbounds float* %tmp7585, i64 1
+ %tmp7587 = getelementptr inbounds float* %tmp7586, i64 1
+ %tmp7588 = getelementptr inbounds float* %tmp7587, i64 1
+ %tmp7589 = getelementptr inbounds float* %tmp7588, i64 1
+ %tmp7590 = getelementptr inbounds float* %tmp7589, i64 1
+ %tmp7591 = getelementptr inbounds float* %tmp7590, i64 1
+ %tmp7592 = getelementptr inbounds float* %tmp7591, i64 1
+ %tmp7593 = getelementptr inbounds float* %tmp7592, i64 1
+ %tmp7594 = getelementptr inbounds float* %tmp7593, i64 1
+ %tmp7595 = getelementptr inbounds float* %tmp7594, i64 1
+ %tmp7596 = getelementptr inbounds float* %tmp7595, i64 1
+ %tmp7597 = getelementptr inbounds float* %tmp7596, i64 1
+ %tmp7598 = getelementptr inbounds float* %tmp7597, i64 1
+ %tmp7599 = getelementptr inbounds float* %tmp7598, i64 1
+ %tmp7600 = getelementptr inbounds float* %tmp7599, i64 1
+ %tmp7601 = getelementptr inbounds float* %tmp7600, i64 1
+ %tmp7602 = getelementptr inbounds float* %tmp7601, i64 1
+ %tmp7603 = getelementptr inbounds float* %tmp7602, i64 1
+ %tmp7604 = getelementptr inbounds float* %tmp7603, i64 1
+ %tmp7605 = getelementptr inbounds float* %tmp7604, i64 1
+ %tmp7606 = getelementptr inbounds float* %tmp7605, i64 1
+ %tmp7607 = getelementptr inbounds float* %tmp7606, i64 1
+ %tmp7608 = getelementptr inbounds float* %tmp7607, i64 1
+ %tmp7609 = getelementptr inbounds float* %tmp7608, i64 1
+ %tmp7610 = getelementptr inbounds float* %tmp7609, i64 1
+ %tmp7611 = getelementptr inbounds float* %tmp7610, i64 1
+ %tmp7612 = getelementptr inbounds float* %tmp7611, i64 1
+ %tmp7613 = getelementptr inbounds float* %tmp7612, i64 1
+ %tmp7614 = getelementptr inbounds float* %tmp7613, i64 1
+ %tmp7615 = getelementptr inbounds float* %tmp7614, i64 1
+ %tmp7616 = getelementptr inbounds float* %tmp7615, i64 1
+ %tmp7617 = getelementptr inbounds float* %tmp7616, i64 1
+ %tmp7618 = getelementptr inbounds float* %tmp7617, i64 1
+ %tmp7619 = getelementptr inbounds float* %tmp7618, i64 1
+ %tmp7620 = getelementptr inbounds float* %tmp7619, i64 1
+ %tmp7621 = getelementptr inbounds float* %tmp7620, i64 1
+ %tmp7622 = getelementptr inbounds float* %tmp7621, i64 1
+ %tmp7623 = getelementptr inbounds float* %tmp7622, i64 1
+ %tmp7624 = getelementptr inbounds float* %tmp7623, i64 1
+ %tmp7625 = getelementptr inbounds float* %tmp7624, i64 1
+ %tmp7626 = getelementptr inbounds float* %tmp7625, i64 1
+ %tmp7627 = getelementptr inbounds float* %tmp7626, i64 1
+ %tmp7628 = getelementptr inbounds float* %tmp7627, i64 1
+ %tmp7629 = getelementptr inbounds float* %tmp7628, i64 1
+ %tmp7630 = getelementptr inbounds float* %tmp7629, i64 1
+ %tmp7631 = getelementptr inbounds float* %tmp7630, i64 1
+ %tmp7632 = getelementptr inbounds float* %tmp7631, i64 1
+ %tmp7633 = getelementptr inbounds float* %tmp7632, i64 1
+ %tmp7634 = getelementptr inbounds float* %tmp7633, i64 1
+ %tmp7635 = getelementptr inbounds float* %tmp7634, i64 1
+ %tmp7636 = getelementptr inbounds float* %tmp7635, i64 1
+ %tmp7637 = getelementptr inbounds float* %tmp7636, i64 1
+ %tmp7638 = getelementptr inbounds float* %tmp7637, i64 1
+ %tmp7639 = getelementptr inbounds float* %tmp7638, i64 1
+ %tmp7640 = getelementptr inbounds float* %tmp7639, i64 1
+ %tmp7641 = getelementptr inbounds float* %tmp7640, i64 1
+ %tmp7642 = getelementptr inbounds float* %tmp7641, i64 1
+ %tmp7643 = getelementptr inbounds float* %tmp7642, i64 1
+ %tmp7644 = getelementptr inbounds float* %tmp7643, i64 1
+ %tmp7645 = getelementptr inbounds float* %tmp7644, i64 1
+ %tmp7646 = getelementptr inbounds float* %tmp7645, i64 1
+ %tmp7647 = getelementptr inbounds float* %tmp7646, i64 1
+ %tmp7648 = getelementptr inbounds float* %tmp7647, i64 1
+ %tmp7649 = getelementptr inbounds float* %tmp7648, i64 1
+ %tmp7650 = getelementptr inbounds float* %tmp7649, i64 1
+ %tmp7651 = getelementptr inbounds float* %tmp7650, i64 1
+ %tmp7652 = getelementptr inbounds float* %tmp7651, i64 1
+ %tmp7653 = getelementptr inbounds float* %tmp7652, i64 1
+ %tmp7654 = getelementptr inbounds float* %tmp7653, i64 1
+ %tmp7655 = getelementptr inbounds float* %tmp7654, i64 1
+ %tmp7656 = getelementptr inbounds float* %tmp7655, i64 1
+ %tmp7657 = getelementptr inbounds float* %tmp7656, i64 1
+ %tmp7658 = getelementptr inbounds float* %tmp7657, i64 1
+ %tmp7659 = getelementptr inbounds float* %tmp7658, i64 1
+ %tmp7660 = getelementptr inbounds float* %tmp7659, i64 1
+ %tmp7661 = getelementptr inbounds float* %tmp7660, i64 1
+ %tmp7662 = getelementptr inbounds float* %tmp7661, i64 1
+ %tmp7663 = getelementptr inbounds float* %tmp7662, i64 1
+ %tmp7664 = getelementptr inbounds float* %tmp7663, i64 1
+ %tmp7665 = getelementptr inbounds float* %tmp7664, i64 1
+ %tmp7666 = getelementptr inbounds float* %tmp7665, i64 1
+ %tmp7667 = getelementptr inbounds float* %tmp7666, i64 1
+ %tmp7668 = getelementptr inbounds float* %tmp7667, i64 1
+ %tmp7669 = getelementptr inbounds float* %tmp7668, i64 1
+ %tmp7670 = getelementptr inbounds float* %tmp7669, i64 1
+ %tmp7671 = getelementptr inbounds float* %tmp7670, i64 1
+ %tmp7672 = getelementptr inbounds float* %tmp7671, i64 1
+ %tmp7673 = getelementptr inbounds float* %tmp7672, i64 1
+ %tmp7674 = getelementptr inbounds float* %tmp7673, i64 1
+ %tmp7675 = getelementptr inbounds float* %tmp7674, i64 1
+ %tmp7676 = getelementptr inbounds float* %tmp7675, i64 1
+ %tmp7677 = getelementptr inbounds float* %tmp7676, i64 1
+ %tmp7678 = getelementptr inbounds float* %tmp7677, i64 1
+ %tmp7679 = getelementptr inbounds float* %tmp7678, i64 1
+ %tmp7680 = getelementptr inbounds float* %tmp7679, i64 1
+ %tmp7681 = getelementptr inbounds float* %tmp7680, i64 1
+ %tmp7682 = getelementptr inbounds float* %tmp7681, i64 1
+ %tmp7683 = getelementptr inbounds float* %tmp7682, i64 1
+ %tmp7684 = getelementptr inbounds float* %tmp7683, i64 1
+ %tmp7685 = getelementptr inbounds float* %tmp7684, i64 1
+ %tmp7686 = getelementptr inbounds float* %tmp7685, i64 1
+ %tmp7687 = getelementptr inbounds float* %tmp7686, i64 1
+ %tmp7688 = getelementptr inbounds float* %tmp7687, i64 1
+ %tmp7689 = getelementptr inbounds float* %tmp7688, i64 1
+ %tmp7690 = getelementptr inbounds float* %tmp7689, i64 1
+ %tmp7691 = getelementptr inbounds float* %tmp7690, i64 1
+ %tmp7692 = getelementptr inbounds float* %tmp7691, i64 1
+ %tmp7693 = getelementptr inbounds float* %tmp7692, i64 1
+ %tmp7694 = getelementptr inbounds float* %tmp7693, i64 1
+ %tmp7695 = getelementptr inbounds float* %tmp7694, i64 1
+ %tmp7696 = getelementptr inbounds float* %tmp7695, i64 1
+ %tmp7697 = getelementptr inbounds float* %tmp7696, i64 1
+ %tmp7698 = getelementptr inbounds float* %tmp7697, i64 1
+ %tmp7699 = getelementptr inbounds float* %tmp7698, i64 1
+ %tmp7700 = getelementptr inbounds float* %tmp7699, i64 1
+ %tmp7701 = getelementptr inbounds float* %tmp7700, i64 1
+ %tmp7702 = getelementptr inbounds float* %tmp7701, i64 1
+ %tmp7703 = getelementptr inbounds float* %tmp7702, i64 1
+ %tmp7704 = getelementptr inbounds float* %tmp7703, i64 1
+ %tmp7705 = getelementptr inbounds float* %tmp7704, i64 1
+ %tmp7706 = getelementptr inbounds float* %tmp7705, i64 1
+ %tmp7707 = getelementptr inbounds float* %tmp7706, i64 1
+ %tmp7708 = getelementptr inbounds float* %tmp7707, i64 1
+ %tmp7709 = getelementptr inbounds float* %tmp7708, i64 1
+ %tmp7710 = getelementptr inbounds float* %tmp7709, i64 1
+ %tmp7711 = getelementptr inbounds float* %tmp7710, i64 1
+ %tmp7712 = getelementptr inbounds float* %tmp7711, i64 1
+ %tmp7713 = getelementptr inbounds float* %tmp7712, i64 1
+ %tmp7714 = getelementptr inbounds float* %tmp7713, i64 1
+ %tmp7715 = getelementptr inbounds float* %tmp7714, i64 1
+ %tmp7716 = getelementptr inbounds float* %tmp7715, i64 1
+ %tmp7717 = getelementptr inbounds float* %tmp7716, i64 1
+ %tmp7718 = getelementptr inbounds float* %tmp7717, i64 1
+ %tmp7719 = getelementptr inbounds float* %tmp7718, i64 1
+ %tmp7720 = getelementptr inbounds float* %tmp7719, i64 1
+ %tmp7721 = getelementptr inbounds float* %tmp7720, i64 1
+ %tmp7722 = getelementptr inbounds float* %tmp7721, i64 1
+ %tmp7723 = getelementptr inbounds float* %tmp7722, i64 1
+ %tmp7724 = getelementptr inbounds float* %tmp7723, i64 1
+ %tmp7725 = getelementptr inbounds float* %tmp7724, i64 1
+ %tmp7726 = getelementptr inbounds float* %tmp7725, i64 1
+ %tmp7727 = getelementptr inbounds float* %tmp7726, i64 1
+ %tmp7728 = getelementptr inbounds float* %tmp7727, i64 1
+ %tmp7729 = getelementptr inbounds float* %tmp7728, i64 1
+ %tmp7730 = getelementptr inbounds float* %tmp7729, i64 1
+ %tmp7731 = getelementptr inbounds float* %tmp7730, i64 1
+ %tmp7732 = getelementptr inbounds float* %tmp7731, i64 1
+ %tmp7733 = getelementptr inbounds float* %tmp7732, i64 1
+ %tmp7734 = getelementptr inbounds float* %tmp7733, i64 1
+ %tmp7735 = getelementptr inbounds float* %tmp7734, i64 1
+ %tmp7736 = getelementptr inbounds float* %tmp7735, i64 1
+ %tmp7737 = getelementptr inbounds float* %tmp7736, i64 1
+ %tmp7738 = getelementptr inbounds float* %tmp7737, i64 1
+ %tmp7739 = getelementptr inbounds float* %tmp7738, i64 1
+ %tmp7740 = getelementptr inbounds float* %tmp7739, i64 1
+ %tmp7741 = getelementptr inbounds float* %tmp7740, i64 1
+ %tmp7742 = getelementptr inbounds float* %tmp7741, i64 1
+ %tmp7743 = getelementptr inbounds float* %tmp7742, i64 1
+ %tmp7744 = getelementptr inbounds float* %tmp7743, i64 1
+ %tmp7745 = getelementptr inbounds float* %tmp7744, i64 1
+ %tmp7746 = getelementptr inbounds float* %tmp7745, i64 1
+ %tmp7747 = getelementptr inbounds float* %tmp7746, i64 1
+ %tmp7748 = getelementptr inbounds float* %tmp7747, i64 1
+ %tmp7749 = getelementptr inbounds float* %tmp7748, i64 1
+ %tmp7750 = getelementptr inbounds float* %tmp7749, i64 1
+ %tmp7751 = getelementptr inbounds float* %tmp7750, i64 1
+ %tmp7752 = getelementptr inbounds float* %tmp7751, i64 1
+ %tmp7753 = getelementptr inbounds float* %tmp7752, i64 1
+ %tmp7754 = getelementptr inbounds float* %tmp7753, i64 1
+ %tmp7755 = getelementptr inbounds float* %tmp7754, i64 1
+ %tmp7756 = getelementptr inbounds float* %tmp7755, i64 1
+ %tmp7757 = getelementptr inbounds float* %tmp7756, i64 1
+ %tmp7758 = getelementptr inbounds float* %tmp7757, i64 1
+ %tmp7759 = getelementptr inbounds float* %tmp7758, i64 1
+ %tmp7760 = getelementptr inbounds float* %tmp7759, i64 1
+ %tmp7761 = getelementptr inbounds float* %tmp7760, i64 1
+ %tmp7762 = getelementptr inbounds float* %tmp7761, i64 1
+ %tmp7763 = getelementptr inbounds float* %tmp7762, i64 1
+ %tmp7764 = getelementptr inbounds float* %tmp7763, i64 1
+ %tmp7765 = getelementptr inbounds float* %tmp7764, i64 1
+ %tmp7766 = getelementptr inbounds float* %tmp7765, i64 1
+ %tmp7767 = getelementptr inbounds float* %tmp7766, i64 1
+ %tmp7768 = getelementptr inbounds float* %tmp7767, i64 1
+ %tmp7769 = getelementptr inbounds float* %tmp7768, i64 1
+ %tmp7770 = getelementptr inbounds float* %tmp7769, i64 1
+ %tmp7771 = getelementptr inbounds float* %tmp7770, i64 1
+ %tmp7772 = getelementptr inbounds float* %tmp7771, i64 1
+ %tmp7773 = getelementptr inbounds float* %tmp7772, i64 1
+ %tmp7774 = getelementptr inbounds float* %tmp7773, i64 1
+ %tmp7775 = getelementptr inbounds float* %tmp7774, i64 1
+ %tmp7776 = getelementptr inbounds float* %tmp7775, i64 1
+ %tmp7777 = getelementptr inbounds float* %tmp7776, i64 1
+ %tmp7778 = getelementptr inbounds float* %tmp7777, i64 1
+ %tmp7779 = getelementptr inbounds float* %tmp7778, i64 1
+ %tmp7780 = getelementptr inbounds float* %tmp7779, i64 1
+ %tmp7781 = getelementptr inbounds float* %tmp7780, i64 1
+ %tmp7782 = getelementptr inbounds float* %tmp7781, i64 1
+ %tmp7783 = getelementptr inbounds float* %tmp7782, i64 1
+ %tmp7784 = getelementptr inbounds float* %tmp7783, i64 1
+ %tmp7785 = getelementptr inbounds float* %tmp7784, i64 1
+ %tmp7786 = getelementptr inbounds float* %tmp7785, i64 1
+ %tmp7787 = getelementptr inbounds float* %tmp7786, i64 1
+ %tmp7788 = getelementptr inbounds float* %tmp7787, i64 1
+ %tmp7789 = getelementptr inbounds float* %tmp7788, i64 1
+ %tmp7790 = getelementptr inbounds float* %tmp7789, i64 1
+ %tmp7791 = getelementptr inbounds float* %tmp7790, i64 1
+ %tmp7792 = getelementptr inbounds float* %tmp7791, i64 1
+ %tmp7793 = getelementptr inbounds float* %tmp7792, i64 1
+ %tmp7794 = getelementptr inbounds float* %tmp7793, i64 1
+ %tmp7795 = getelementptr inbounds float* %tmp7794, i64 1
+ %tmp7796 = getelementptr inbounds float* %tmp7795, i64 1
+ %tmp7797 = getelementptr inbounds float* %tmp7796, i64 1
+ %tmp7798 = getelementptr inbounds float* %tmp7797, i64 1
+ %tmp7799 = getelementptr inbounds float* %tmp7798, i64 1
+ %tmp7800 = getelementptr inbounds float* %tmp7799, i64 1
+ %tmp7801 = getelementptr inbounds float* %tmp7800, i64 1
+ %tmp7802 = getelementptr inbounds float* %tmp7801, i64 1
+ %tmp7803 = getelementptr inbounds float* %tmp7802, i64 1
+ %tmp7804 = getelementptr inbounds float* %tmp7803, i64 1
+ %tmp7805 = getelementptr inbounds float* %tmp7804, i64 1
+ %tmp7806 = getelementptr inbounds float* %tmp7805, i64 1
+ %tmp7807 = getelementptr inbounds float* %tmp7806, i64 1
+ %tmp7808 = getelementptr inbounds float* %tmp7807, i64 1
+ %tmp7809 = getelementptr inbounds float* %tmp7808, i64 1
+ %tmp7810 = getelementptr inbounds float* %tmp7809, i64 1
+ %tmp7811 = getelementptr inbounds float* %tmp7810, i64 1
+ %tmp7812 = getelementptr inbounds float* %tmp7811, i64 1
+ %tmp7813 = getelementptr inbounds float* %tmp7812, i64 1
+ %tmp7814 = getelementptr inbounds float* %tmp7813, i64 1
+ %tmp7815 = getelementptr inbounds float* %tmp7814, i64 1
+ %tmp7816 = getelementptr inbounds float* %tmp7815, i64 1
+ %tmp7817 = getelementptr inbounds float* %tmp7816, i64 1
+ %tmp7818 = getelementptr inbounds float* %tmp7817, i64 1
+ %tmp7819 = getelementptr inbounds float* %tmp7818, i64 1
+ %tmp7820 = getelementptr inbounds float* %tmp7819, i64 1
+ %tmp7821 = getelementptr inbounds float* %tmp7820, i64 1
+ %tmp7822 = getelementptr inbounds float* %tmp7821, i64 1
+ %tmp7823 = getelementptr inbounds float* %tmp7822, i64 1
+ %tmp7824 = getelementptr inbounds float* %tmp7823, i64 1
+ %tmp7825 = getelementptr inbounds float* %tmp7824, i64 1
+ %tmp7826 = getelementptr inbounds float* %tmp7825, i64 1
+ %tmp7827 = getelementptr inbounds float* %tmp7826, i64 1
+ %tmp7828 = getelementptr inbounds float* %tmp7827, i64 1
+ %tmp7829 = getelementptr inbounds float* %tmp7828, i64 1
+ %tmp7830 = getelementptr inbounds float* %tmp7829, i64 1
+ %tmp7831 = getelementptr inbounds float* %tmp7830, i64 1
+ %tmp7832 = getelementptr inbounds float* %tmp7831, i64 1
+ %tmp7833 = getelementptr inbounds float* %tmp7832, i64 1
+ %tmp7834 = getelementptr inbounds float* %tmp7833, i64 1
+ %tmp7835 = getelementptr inbounds float* %tmp7834, i64 1
+ %tmp7836 = getelementptr inbounds float* %tmp7835, i64 1
+ %tmp7837 = getelementptr inbounds float* %tmp7836, i64 1
+ %tmp7838 = getelementptr inbounds float* %tmp7837, i64 1
+ %tmp7839 = getelementptr inbounds float* %tmp7838, i64 1
+ %tmp7840 = getelementptr inbounds float* %tmp7839, i64 1
+ %tmp7841 = getelementptr inbounds float* %tmp7840, i64 1
+ %tmp7842 = getelementptr inbounds float* %tmp7841, i64 1
+ %tmp7843 = getelementptr inbounds float* %tmp7842, i64 1
+ %tmp7844 = getelementptr inbounds float* %tmp7843, i64 1
+ %tmp7845 = getelementptr inbounds float* %tmp7844, i64 1
+ %tmp7846 = getelementptr inbounds float* %tmp7845, i64 1
+ %tmp7847 = getelementptr inbounds float* %tmp7846, i64 1
+ %tmp7848 = getelementptr inbounds float* %tmp7847, i64 1
+ %tmp7849 = getelementptr inbounds float* %tmp7848, i64 1
+ %tmp7850 = getelementptr inbounds float* %tmp7849, i64 1
+ %tmp7851 = getelementptr inbounds float* %tmp7850, i64 1
+ %tmp7852 = getelementptr inbounds float* %tmp7851, i64 1
+ %tmp7853 = getelementptr inbounds float* %tmp7852, i64 1
+ %tmp7854 = getelementptr inbounds float* %tmp7853, i64 1
+ %tmp7855 = getelementptr inbounds float* %tmp7854, i64 1
+ %tmp7856 = getelementptr inbounds float* %tmp7855, i64 1
+ %tmp7857 = getelementptr inbounds float* %tmp7856, i64 1
+ %tmp7858 = getelementptr inbounds float* %tmp7857, i64 1
+ %tmp7859 = getelementptr inbounds float* %tmp7858, i64 1
+ %tmp7860 = getelementptr inbounds float* %tmp7859, i64 1
+ %tmp7861 = getelementptr inbounds float* %tmp7860, i64 1
+ %tmp7862 = getelementptr inbounds float* %tmp7861, i64 1
+ %tmp7863 = getelementptr inbounds float* %tmp7862, i64 1
+ %tmp7864 = getelementptr inbounds float* %tmp7863, i64 1
+ %tmp7865 = getelementptr inbounds float* %tmp7864, i64 1
+ %tmp7866 = getelementptr inbounds float* %tmp7865, i64 1
+ %tmp7867 = getelementptr inbounds float* %tmp7866, i64 1
+ %tmp7868 = getelementptr inbounds float* %tmp7867, i64 1
+ %tmp7869 = getelementptr inbounds float* %tmp7868, i64 1
+ %tmp7870 = getelementptr inbounds float* %tmp7869, i64 1
+ %tmp7871 = getelementptr inbounds float* %tmp7870, i64 1
+ %tmp7872 = getelementptr inbounds float* %tmp7871, i64 1
+ %tmp7873 = getelementptr inbounds float* %tmp7872, i64 1
+ %tmp7874 = getelementptr inbounds float* %tmp7873, i64 1
+ %tmp7875 = getelementptr inbounds float* %tmp7874, i64 1
+ %tmp7876 = getelementptr inbounds float* %tmp7875, i64 1
+ %tmp7877 = getelementptr inbounds float* %tmp7876, i64 1
+ %tmp7878 = getelementptr inbounds float* %tmp7877, i64 1
+ %tmp7879 = getelementptr inbounds float* %tmp7878, i64 1
+ %tmp7880 = getelementptr inbounds float* %tmp7879, i64 1
+ %tmp7881 = getelementptr inbounds float* %tmp7880, i64 1
+ %tmp7882 = getelementptr inbounds float* %tmp7881, i64 1
+ %tmp7883 = getelementptr inbounds float* %tmp7882, i64 1
+ %tmp7884 = getelementptr inbounds float* %tmp7883, i64 1
+ %tmp7885 = getelementptr inbounds float* %tmp7884, i64 1
+ %tmp7886 = getelementptr inbounds float* %tmp7885, i64 1
+ %tmp7887 = getelementptr inbounds float* %tmp7886, i64 1
+ %tmp7888 = getelementptr inbounds float* %tmp7887, i64 1
+ %tmp7889 = getelementptr inbounds float* %tmp7888, i64 1
+ %tmp7890 = getelementptr inbounds float* %tmp7889, i64 1
+ %tmp7891 = getelementptr inbounds float* %tmp7890, i64 1
+ %tmp7892 = getelementptr inbounds float* %tmp7891, i64 1
+ %tmp7893 = getelementptr inbounds float* %tmp7892, i64 1
+ %tmp7894 = getelementptr inbounds float* %tmp7893, i64 1
+ %tmp7895 = getelementptr inbounds float* %tmp7894, i64 1
+ %tmp7896 = getelementptr inbounds float* %tmp7895, i64 1
+ %tmp7897 = getelementptr inbounds float* %tmp7896, i64 1
+ %tmp7898 = getelementptr inbounds float* %tmp7897, i64 1
+ %tmp7899 = getelementptr inbounds float* %tmp7898, i64 1
+ %tmp7900 = getelementptr inbounds float* %tmp7899, i64 1
+ %tmp7901 = getelementptr inbounds float* %tmp7900, i64 1
+ %tmp7902 = getelementptr inbounds float* %tmp7901, i64 1
+ %tmp7903 = getelementptr inbounds float* %tmp7902, i64 1
+ %tmp7904 = getelementptr inbounds float* %tmp7903, i64 1
+ %tmp7905 = getelementptr inbounds float* %tmp7904, i64 1
+ %tmp7906 = getelementptr inbounds float* %tmp7905, i64 1
+ %tmp7907 = getelementptr inbounds float* %tmp7906, i64 1
+ %tmp7908 = getelementptr inbounds float* %tmp7907, i64 1
+ %tmp7909 = getelementptr inbounds float* %tmp7908, i64 1
+ %tmp7910 = getelementptr inbounds float* %tmp7909, i64 1
+ %tmp7911 = getelementptr inbounds float* %tmp7910, i64 1
+ %tmp7912 = getelementptr inbounds float* %tmp7911, i64 1
+ %tmp7913 = getelementptr inbounds float* %tmp7912, i64 1
+ %tmp7914 = getelementptr inbounds float* %tmp7913, i64 1
+ %tmp7915 = getelementptr inbounds float* %tmp7914, i64 1
+ %tmp7916 = getelementptr inbounds float* %tmp7915, i64 1
+ %tmp7917 = getelementptr inbounds float* %tmp7916, i64 1
+ %tmp7918 = getelementptr inbounds float* %tmp7917, i64 1
+ %tmp7919 = getelementptr inbounds float* %tmp7918, i64 1
+ %tmp7920 = getelementptr inbounds float* %tmp7919, i64 1
+ %tmp7921 = getelementptr inbounds float* %tmp7920, i64 1
+ %tmp7922 = getelementptr inbounds float* %tmp7921, i64 1
+ %tmp7923 = getelementptr inbounds float* %tmp7922, i64 1
+ %tmp7924 = getelementptr inbounds float* %tmp7923, i64 1
+ %tmp7925 = getelementptr inbounds float* %tmp7924, i64 1
+ %tmp7926 = getelementptr inbounds float* %tmp7925, i64 1
+ %tmp7927 = getelementptr inbounds float* %tmp7926, i64 1
+ %tmp7928 = getelementptr inbounds float* %tmp7927, i64 1
+ %tmp7929 = getelementptr inbounds float* %tmp7928, i64 1
+ %tmp7930 = getelementptr inbounds float* %tmp7929, i64 1
+ %tmp7931 = getelementptr inbounds float* %tmp7930, i64 1
+ %tmp7932 = getelementptr inbounds float* %tmp7931, i64 1
+ %tmp7933 = getelementptr inbounds float* %tmp7932, i64 1
+ %tmp7934 = getelementptr inbounds float* %tmp7933, i64 1
+ %tmp7935 = getelementptr inbounds float* %tmp7934, i64 1
+ %tmp7936 = getelementptr inbounds float* %tmp7935, i64 1
+ %tmp7937 = getelementptr inbounds float* %tmp7936, i64 1
+ %tmp7938 = getelementptr inbounds float* %tmp7937, i64 1
+ %tmp7939 = getelementptr inbounds float* %tmp7938, i64 1
+ %tmp7940 = getelementptr inbounds float* %tmp7939, i64 1
+ %tmp7941 = getelementptr inbounds float* %tmp7940, i64 1
+ %tmp7942 = getelementptr inbounds float* %tmp7941, i64 1
+ %tmp7943 = getelementptr inbounds float* %tmp7942, i64 1
+ %tmp7944 = getelementptr inbounds float* %tmp7943, i64 1
+ %tmp7945 = getelementptr inbounds float* %tmp7944, i64 1
+ %tmp7946 = getelementptr inbounds float* %tmp7945, i64 1
+ %tmp7947 = getelementptr inbounds float* %tmp7946, i64 1
+ %tmp7948 = getelementptr inbounds float* %tmp7947, i64 1
+ %tmp7949 = getelementptr inbounds float* %tmp7948, i64 1
+ %tmp7950 = getelementptr inbounds float* %tmp7949, i64 1
+ %tmp7951 = getelementptr inbounds float* %tmp7950, i64 1
+ %tmp7952 = getelementptr inbounds float* %tmp7951, i64 1
+ %tmp7953 = getelementptr inbounds float* %tmp7952, i64 1
+ %tmp7954 = getelementptr inbounds float* %tmp7953, i64 1
+ %tmp7955 = getelementptr inbounds float* %tmp7954, i64 1
+ %tmp7956 = getelementptr inbounds float* %tmp7955, i64 1
+ %tmp7957 = getelementptr inbounds float* %tmp7956, i64 1
+ %tmp7958 = getelementptr inbounds float* %tmp7957, i64 1
+ %tmp7959 = getelementptr inbounds float* %tmp7958, i64 1
+ %tmp7960 = getelementptr inbounds float* %tmp7959, i64 1
+ %tmp7961 = getelementptr inbounds float* %tmp7960, i64 1
+ %tmp7962 = getelementptr inbounds float* %tmp7961, i64 1
+ %tmp7963 = getelementptr inbounds float* %tmp7962, i64 1
+ %tmp7964 = getelementptr inbounds float* %tmp7963, i64 1
+ %tmp7965 = getelementptr inbounds float* %tmp7964, i64 1
+ %tmp7966 = getelementptr inbounds float* %tmp7965, i64 1
+ %tmp7967 = getelementptr inbounds float* %tmp7966, i64 1
+ %tmp7968 = getelementptr inbounds float* %tmp7967, i64 1
+ %tmp7969 = getelementptr inbounds float* %tmp7968, i64 1
+ %tmp7970 = getelementptr inbounds float* %tmp7969, i64 1
+ %tmp7971 = getelementptr inbounds float* %tmp7970, i64 1
+ %tmp7972 = getelementptr inbounds float* %tmp7971, i64 1
+ %tmp7973 = getelementptr inbounds float* %tmp7972, i64 1
+ %tmp7974 = getelementptr inbounds float* %tmp7973, i64 1
+ %tmp7975 = getelementptr inbounds float* %tmp7974, i64 1
+ %tmp7976 = getelementptr inbounds float* %tmp7975, i64 1
+ %tmp7977 = getelementptr inbounds float* %tmp7976, i64 1
+ %tmp7978 = getelementptr inbounds float* %tmp7977, i64 1
+ %tmp7979 = getelementptr inbounds float* %tmp7978, i64 1
+ %tmp7980 = getelementptr inbounds float* %tmp7979, i64 1
+ %tmp7981 = getelementptr inbounds float* %tmp7980, i64 1
+ %tmp7982 = getelementptr inbounds float* %tmp7981, i64 1
+ %tmp7983 = getelementptr inbounds float* %tmp7982, i64 1
+ %tmp7984 = getelementptr inbounds float* %tmp7983, i64 1
+ %tmp7985 = getelementptr inbounds float* %tmp7984, i64 1
+ %tmp7986 = getelementptr inbounds float* %tmp7985, i64 1
+ %tmp7987 = getelementptr inbounds float* %tmp7986, i64 1
+ %tmp7988 = getelementptr inbounds float* %tmp7987, i64 1
+ %tmp7989 = getelementptr inbounds float* %tmp7988, i64 1
+ %tmp7990 = getelementptr inbounds float* %tmp7989, i64 1
+ %tmp7991 = getelementptr inbounds float* %tmp7990, i64 1
+ %tmp7992 = getelementptr inbounds float* %tmp7991, i64 1
+ %tmp7993 = getelementptr inbounds float* %tmp7992, i64 1
+ %tmp7994 = getelementptr inbounds float* %tmp7993, i64 1
+ %tmp7995 = getelementptr inbounds float* %tmp7994, i64 1
+ %tmp7996 = getelementptr inbounds float* %tmp7995, i64 1
+ %tmp7997 = getelementptr inbounds float* %tmp7996, i64 1
+ %tmp7998 = getelementptr inbounds float* %tmp7997, i64 1
+ %tmp7999 = getelementptr inbounds float* %tmp7998, i64 1
+ %tmp8000 = getelementptr inbounds float* %tmp7999, i64 1
+ %tmp8001 = getelementptr inbounds float* %tmp8000, i64 1
+ %tmp8002 = getelementptr inbounds float* %tmp8001, i64 1
+ %tmp8003 = getelementptr inbounds float* %tmp8002, i64 1
+ %tmp8004 = getelementptr inbounds float* %tmp8003, i64 1
+ %tmp8005 = getelementptr inbounds float* %tmp8004, i64 1
+ %tmp8006 = getelementptr inbounds float* %tmp8005, i64 1
+ %tmp8007 = getelementptr inbounds float* %tmp8006, i64 1
+ %tmp8008 = getelementptr inbounds float* %tmp8007, i64 1
+ %tmp8009 = getelementptr inbounds float* %tmp8008, i64 1
+ %tmp8010 = getelementptr inbounds float* %tmp8009, i64 1
+ %tmp8011 = getelementptr inbounds float* %tmp8010, i64 1
+ %tmp8012 = getelementptr inbounds float* %tmp8011, i64 1
+ %tmp8013 = getelementptr inbounds float* %tmp8012, i64 1
+ %tmp8014 = getelementptr inbounds float* %tmp8013, i64 1
+ %tmp8015 = getelementptr inbounds float* %tmp8014, i64 1
+ %tmp8016 = getelementptr inbounds float* %tmp8015, i64 1
+ %tmp8017 = getelementptr inbounds float* %tmp8016, i64 1
+ %tmp8018 = getelementptr inbounds float* %tmp8017, i64 1
+ %tmp8019 = getelementptr inbounds float* %tmp8018, i64 1
+ %tmp8020 = getelementptr inbounds float* %tmp8019, i64 1
+ %tmp8021 = getelementptr inbounds float* %tmp8020, i64 1
+ %tmp8022 = getelementptr inbounds float* %tmp8021, i64 1
+ %tmp8023 = getelementptr inbounds float* %tmp8022, i64 1
+ %tmp8024 = getelementptr inbounds float* %tmp8023, i64 1
+ %tmp8025 = getelementptr inbounds float* %tmp8024, i64 1
+ %tmp8026 = getelementptr inbounds float* %tmp8025, i64 1
+ %tmp8027 = getelementptr inbounds float* %tmp8026, i64 1
+ %tmp8028 = getelementptr inbounds float* %tmp8027, i64 1
+ %tmp8029 = getelementptr inbounds float* %tmp8028, i64 1
+ %tmp8030 = getelementptr inbounds float* %tmp8029, i64 1
+ %tmp8031 = getelementptr inbounds float* %tmp8030, i64 1
+ %tmp8032 = getelementptr inbounds float* %tmp8031, i64 1
+ %tmp8033 = getelementptr inbounds float* %tmp8032, i64 1
+ %tmp8034 = getelementptr inbounds float* %tmp8033, i64 1
+ %tmp8035 = getelementptr inbounds float* %tmp8034, i64 1
+ %tmp8036 = getelementptr inbounds float* %tmp8035, i64 1
+ %tmp8037 = getelementptr inbounds float* %tmp8036, i64 1
+ %tmp8038 = getelementptr inbounds float* %tmp8037, i64 1
+ %tmp8039 = getelementptr inbounds float* %tmp8038, i64 1
+ %tmp8040 = getelementptr inbounds float* %tmp8039, i64 1
+ %tmp8041 = getelementptr inbounds float* %tmp8040, i64 1
+ %tmp8042 = getelementptr inbounds float* %tmp8041, i64 1
+ %tmp8043 = getelementptr inbounds float* %tmp8042, i64 1
+ %tmp8044 = getelementptr inbounds float* %tmp8043, i64 1
+ %tmp8045 = getelementptr inbounds float* %tmp8044, i64 1
+ %tmp8046 = getelementptr inbounds float* %tmp8045, i64 1
+ %tmp8047 = getelementptr inbounds float* %tmp8046, i64 1
+ %tmp8048 = getelementptr inbounds float* %tmp8047, i64 1
+ %tmp8049 = getelementptr inbounds float* %tmp8048, i64 1
+ %tmp8050 = getelementptr inbounds float* %tmp8049, i64 1
+ %tmp8051 = getelementptr inbounds float* %tmp8050, i64 1
+ %tmp8052 = getelementptr inbounds float* %tmp8051, i64 1
+ %tmp8053 = getelementptr inbounds float* %tmp8052, i64 1
+ %tmp8054 = getelementptr inbounds float* %tmp8053, i64 1
+ %tmp8055 = getelementptr inbounds float* %tmp8054, i64 1
+ %tmp8056 = getelementptr inbounds float* %tmp8055, i64 1
+ %tmp8057 = getelementptr inbounds float* %tmp8056, i64 1
+ %tmp8058 = getelementptr inbounds float* %tmp8057, i64 1
+ %tmp8059 = getelementptr inbounds float* %tmp8058, i64 1
+ %tmp8060 = getelementptr inbounds float* %tmp8059, i64 1
+ %tmp8061 = getelementptr inbounds float* %tmp8060, i64 1
+ %tmp8062 = getelementptr inbounds float* %tmp8061, i64 1
+ %tmp8063 = getelementptr inbounds float* %tmp8062, i64 1
+ %tmp8064 = getelementptr inbounds float* %tmp8063, i64 1
+ %tmp8065 = getelementptr inbounds float* %tmp8064, i64 1
+ %tmp8066 = getelementptr inbounds float* %tmp8065, i64 1
+ %tmp8067 = getelementptr inbounds float* %tmp8066, i64 1
+ %tmp8068 = getelementptr inbounds float* %tmp8067, i64 1
+ %tmp8069 = getelementptr inbounds float* %tmp8068, i64 1
+ %tmp8070 = getelementptr inbounds float* %tmp8069, i64 1
+ %tmp8071 = getelementptr inbounds float* %tmp8070, i64 1
+ %tmp8072 = getelementptr inbounds float* %tmp8071, i64 1
+ %tmp8073 = getelementptr inbounds float* %tmp8072, i64 1
+ %tmp8074 = getelementptr inbounds float* %tmp8073, i64 1
+ %tmp8075 = getelementptr inbounds float* %tmp8074, i64 1
+ %tmp8076 = getelementptr inbounds float* %tmp8075, i64 1
+ %tmp8077 = getelementptr inbounds float* %tmp8076, i64 1
+ %tmp8078 = getelementptr inbounds float* %tmp8077, i64 1
+ %tmp8079 = getelementptr inbounds float* %tmp8078, i64 1
+ %tmp8080 = getelementptr inbounds float* %tmp8079, i64 1
+ %tmp8081 = getelementptr inbounds float* %tmp8080, i64 1
+ %tmp8082 = getelementptr inbounds float* %tmp8081, i64 1
+ %tmp8083 = getelementptr inbounds float* %tmp8082, i64 1
+ %tmp8084 = getelementptr inbounds float* %tmp8083, i64 1
+ %tmp8085 = getelementptr inbounds float* %tmp8084, i64 1
+ %tmp8086 = getelementptr inbounds float* %tmp8085, i64 1
+ %tmp8087 = getelementptr inbounds float* %tmp8086, i64 1
+ %tmp8088 = getelementptr inbounds float* %tmp8087, i64 1
+ %tmp8089 = getelementptr inbounds float* %tmp8088, i64 1
+ %tmp8090 = getelementptr inbounds float* %tmp8089, i64 1
+ %tmp8091 = getelementptr inbounds float* %tmp8090, i64 1
+ %tmp8092 = getelementptr inbounds float* %tmp8091, i64 1
+ %tmp8093 = getelementptr inbounds float* %tmp8092, i64 1
+ %tmp8094 = getelementptr inbounds float* %tmp8093, i64 1
+ %tmp8095 = getelementptr inbounds float* %tmp8094, i64 1
+ %tmp8096 = getelementptr inbounds float* %tmp8095, i64 1
+ %tmp8097 = getelementptr inbounds float* %tmp8096, i64 1
+ %tmp8098 = getelementptr inbounds float* %tmp8097, i64 1
+ %tmp8099 = getelementptr inbounds float* %tmp8098, i64 1
+ %tmp8100 = getelementptr inbounds float* %tmp8099, i64 1
+ %tmp8101 = getelementptr inbounds float* %tmp8100, i64 1
+ %tmp8102 = getelementptr inbounds float* %tmp8101, i64 1
+ %tmp8103 = getelementptr inbounds float* %tmp8102, i64 1
+ %tmp8104 = getelementptr inbounds float* %tmp8103, i64 1
+ %tmp8105 = getelementptr inbounds float* %tmp8104, i64 1
+ %tmp8106 = getelementptr inbounds float* %tmp8105, i64 1
+ %tmp8107 = getelementptr inbounds float* %tmp8106, i64 1
+ %tmp8108 = getelementptr inbounds float* %tmp8107, i64 1
+ %tmp8109 = getelementptr inbounds float* %tmp8108, i64 1
+ %tmp8110 = getelementptr inbounds float* %tmp8109, i64 1
+ %tmp8111 = getelementptr inbounds float* %tmp8110, i64 1
+ %tmp8112 = getelementptr inbounds float* %tmp8111, i64 1
+ %tmp8113 = getelementptr inbounds float* %tmp8112, i64 1
+ %tmp8114 = getelementptr inbounds float* %tmp8113, i64 1
+ %tmp8115 = getelementptr inbounds float* %tmp8114, i64 1
+ %tmp8116 = getelementptr inbounds float* %tmp8115, i64 1
+ %tmp8117 = getelementptr inbounds float* %tmp8116, i64 1
+ %tmp8118 = getelementptr inbounds float* %tmp8117, i64 1
+ %tmp8119 = getelementptr inbounds float* %tmp8118, i64 1
+ %tmp8120 = getelementptr inbounds float* %tmp8119, i64 1
+ %tmp8121 = getelementptr inbounds float* %tmp8120, i64 1
+ %tmp8122 = getelementptr inbounds float* %tmp8121, i64 1
+ %tmp8123 = getelementptr inbounds float* %tmp8122, i64 1
+ %tmp8124 = getelementptr inbounds float* %tmp8123, i64 1
+ %tmp8125 = getelementptr inbounds float* %tmp8124, i64 1
+ %tmp8126 = getelementptr inbounds float* %tmp8125, i64 1
+ %tmp8127 = getelementptr inbounds float* %tmp8126, i64 1
+ %tmp8128 = getelementptr inbounds float* %tmp8127, i64 1
+ %tmp8129 = getelementptr inbounds float* %tmp8128, i64 1
+ %tmp8130 = getelementptr inbounds float* %tmp8129, i64 1
+ %tmp8131 = getelementptr inbounds float* %tmp8130, i64 1
+ %tmp8132 = getelementptr inbounds float* %tmp8131, i64 1
+ %tmp8133 = getelementptr inbounds float* %tmp8132, i64 1
+ %tmp8134 = getelementptr inbounds float* %tmp8133, i64 1
+ %tmp8135 = getelementptr inbounds float* %tmp8134, i64 1
+ %tmp8136 = getelementptr inbounds float* %tmp8135, i64 1
+ %tmp8137 = getelementptr inbounds float* %tmp8136, i64 1
+ %tmp8138 = getelementptr inbounds float* %tmp8137, i64 1
+ %tmp8139 = getelementptr inbounds float* %tmp8138, i64 1
+ %tmp8140 = getelementptr inbounds float* %tmp8139, i64 1
+ %tmp8141 = getelementptr inbounds float* %tmp8140, i64 1
+ %tmp8142 = getelementptr inbounds float* %tmp8141, i64 1
+ %tmp8143 = getelementptr inbounds float* %tmp8142, i64 1
+ %tmp8144 = getelementptr inbounds float* %tmp8143, i64 1
+ %tmp8145 = getelementptr inbounds float* %tmp8144, i64 1
+ %tmp8146 = getelementptr inbounds float* %tmp8145, i64 1
+ %tmp8147 = getelementptr inbounds float* %tmp8146, i64 1
+ %tmp8148 = getelementptr inbounds float* %tmp8147, i64 1
+ %tmp8149 = getelementptr inbounds float* %tmp8148, i64 1
+ %tmp8150 = getelementptr inbounds float* %tmp8149, i64 1
+ %tmp8151 = getelementptr inbounds float* %tmp8150, i64 1
+ %tmp8152 = getelementptr inbounds float* %tmp8151, i64 1
+ %tmp8153 = getelementptr inbounds float* %tmp8152, i64 1
+ %tmp8154 = getelementptr inbounds float* %tmp8153, i64 1
+ %tmp8155 = getelementptr inbounds float* %tmp8154, i64 1
+ %tmp8156 = getelementptr inbounds float* %tmp8155, i64 1
+ %tmp8157 = getelementptr inbounds float* %tmp8156, i64 1
+ %tmp8158 = getelementptr inbounds float* %tmp8157, i64 1
+ %tmp8159 = getelementptr inbounds float* %tmp8158, i64 1
+ %tmp8160 = getelementptr inbounds float* %tmp8159, i64 1
+ %tmp8161 = getelementptr inbounds float* %tmp8160, i64 1
+ %tmp8162 = getelementptr inbounds float* %tmp8161, i64 1
+ %tmp8163 = getelementptr inbounds float* %tmp8162, i64 1
+ %tmp8164 = getelementptr inbounds float* %tmp8163, i64 1
+ %tmp8165 = getelementptr inbounds float* %tmp8164, i64 1
+ %tmp8166 = getelementptr inbounds float* %tmp8165, i64 1
+ %tmp8167 = getelementptr inbounds float* %tmp8166, i64 1
+ %tmp8168 = getelementptr inbounds float* %tmp8167, i64 1
+ %tmp8169 = getelementptr inbounds float* %tmp8168, i64 1
+ %tmp8170 = getelementptr inbounds float* %tmp8169, i64 1
+ %tmp8171 = getelementptr inbounds float* %tmp8170, i64 1
+ %tmp8172 = getelementptr inbounds float* %tmp8171, i64 1
+ %tmp8173 = getelementptr inbounds float* %tmp8172, i64 1
+ %tmp8174 = getelementptr inbounds float* %tmp8173, i64 1
+ %tmp8175 = getelementptr inbounds float* %tmp8174, i64 1
+ %tmp8176 = getelementptr inbounds float* %tmp8175, i64 1
+ %tmp8177 = getelementptr inbounds float* %tmp8176, i64 1
+ %tmp8178 = getelementptr inbounds float* %tmp8177, i64 1
+ %tmp8179 = getelementptr inbounds float* %tmp8178, i64 1
+ %tmp8180 = getelementptr inbounds float* %tmp8179, i64 1
+ %tmp8181 = getelementptr inbounds float* %tmp8180, i64 1
+ %tmp8182 = getelementptr inbounds float* %tmp8181, i64 1
+ %tmp8183 = getelementptr inbounds float* %tmp8182, i64 1
+ %tmp8184 = getelementptr inbounds float* %tmp8183, i64 1
+ %tmp8185 = getelementptr inbounds float* %tmp8184, i64 1
+ %tmp8186 = getelementptr inbounds float* %tmp8185, i64 1
+ %tmp8187 = getelementptr inbounds float* %tmp8186, i64 1
+ %tmp8188 = getelementptr inbounds float* %tmp8187, i64 1
+ %tmp8189 = getelementptr inbounds float* %tmp8188, i64 1
+ %tmp8190 = getelementptr inbounds float* %tmp8189, i64 1
+ %tmp8191 = getelementptr inbounds float* %tmp8190, i64 1
+ %tmp8192 = getelementptr inbounds float* %tmp8191, i64 1
+ %tmp8193 = getelementptr inbounds float* %tmp8192, i64 1
+ %tmp8194 = getelementptr inbounds float* %tmp8193, i64 1
+ %tmp8195 = getelementptr inbounds float* %tmp8194, i64 1
+ %tmp8196 = getelementptr inbounds float* %tmp8195, i64 1
+ %tmp8197 = getelementptr inbounds float* %tmp8196, i64 1
+ %tmp8198 = getelementptr inbounds float* %tmp8197, i64 1
+ %tmp8199 = getelementptr inbounds float* %tmp8198, i64 1
+ %tmp8200 = getelementptr inbounds float* %tmp8199, i64 1
+ %tmp8201 = getelementptr inbounds float* %tmp8200, i64 1
+ %tmp8202 = getelementptr inbounds float* %tmp8201, i64 1
+ %tmp8203 = getelementptr inbounds float* %tmp8202, i64 1
+ %tmp8204 = getelementptr inbounds float* %tmp8203, i64 1
+ %tmp8205 = getelementptr inbounds float* %tmp8204, i64 1
+ %tmp8206 = getelementptr inbounds float* %tmp8205, i64 1
+ %tmp8207 = getelementptr inbounds float* %tmp8206, i64 1
+ %tmp8208 = getelementptr inbounds float* %tmp8207, i64 1
+ %tmp8209 = getelementptr inbounds float* %tmp8208, i64 1
+ %tmp8210 = getelementptr inbounds float* %tmp8209, i64 1
+ %tmp8211 = getelementptr inbounds float* %tmp8210, i64 1
+ %tmp8212 = getelementptr inbounds float* %tmp8211, i64 1
+ %tmp8213 = getelementptr inbounds float* %tmp8212, i64 1
+ %tmp8214 = getelementptr inbounds float* %tmp8213, i64 1
+ %tmp8215 = getelementptr inbounds float* %tmp8214, i64 1
+ %tmp8216 = getelementptr inbounds float* %tmp8215, i64 1
+ %tmp8217 = getelementptr inbounds float* %tmp8216, i64 1
+ %tmp8218 = getelementptr inbounds float* %tmp8217, i64 1
+ %tmp8219 = getelementptr inbounds float* %tmp8218, i64 1
+ %tmp8220 = getelementptr inbounds float* %tmp8219, i64 1
+ %tmp8221 = getelementptr inbounds float* %tmp8220, i64 1
+ %tmp8222 = getelementptr inbounds float* %tmp8221, i64 1
+ %tmp8223 = getelementptr inbounds float* %tmp8222, i64 1
+ %tmp8224 = getelementptr inbounds float* %tmp8223, i64 1
+ %tmp8225 = getelementptr inbounds float* %tmp8224, i64 1
+ %tmp8226 = getelementptr inbounds float* %tmp8225, i64 1
+ %tmp8227 = getelementptr inbounds float* %tmp8226, i64 1
+ %tmp8228 = getelementptr inbounds float* %tmp8227, i64 1
+ %tmp8229 = getelementptr inbounds float* %tmp8228, i64 1
+ %tmp8230 = getelementptr inbounds float* %tmp8229, i64 1
+ %tmp8231 = getelementptr inbounds float* %tmp8230, i64 1
+ %tmp8232 = getelementptr inbounds float* %tmp8231, i64 1
+ %tmp8233 = getelementptr inbounds float* %tmp8232, i64 1
+ %tmp8234 = getelementptr inbounds float* %tmp8233, i64 1
+ %tmp8235 = getelementptr inbounds float* %tmp8234, i64 1
+ %tmp8236 = getelementptr inbounds float* %tmp8235, i64 1
+ %tmp8237 = getelementptr inbounds float* %tmp8236, i64 1
+ %tmp8238 = getelementptr inbounds float* %tmp8237, i64 1
+ %tmp8239 = getelementptr inbounds float* %tmp8238, i64 1
+ %tmp8240 = getelementptr inbounds float* %tmp8239, i64 1
+ %tmp8241 = getelementptr inbounds float* %tmp8240, i64 1
+ %tmp8242 = getelementptr inbounds float* %tmp8241, i64 1
+ %tmp8243 = getelementptr inbounds float* %tmp8242, i64 1
+ %tmp8244 = getelementptr inbounds float* %tmp8243, i64 1
+ %tmp8245 = getelementptr inbounds float* %tmp8244, i64 1
+ %tmp8246 = getelementptr inbounds float* %tmp8245, i64 1
+ %tmp8247 = getelementptr inbounds float* %tmp8246, i64 1
+ %tmp8248 = getelementptr inbounds float* %tmp8247, i64 1
+ %tmp8249 = getelementptr inbounds float* %tmp8248, i64 1
+ %tmp8250 = getelementptr inbounds float* %tmp8249, i64 1
+ %tmp8251 = getelementptr inbounds float* %tmp8250, i64 1
+ %tmp8252 = getelementptr inbounds float* %tmp8251, i64 1
+ %tmp8253 = getelementptr inbounds float* %tmp8252, i64 1
+ %tmp8254 = getelementptr inbounds float* %tmp8253, i64 1
+ %tmp8255 = getelementptr inbounds float* %tmp8254, i64 1
+ %tmp8256 = getelementptr inbounds float* %tmp8255, i64 1
+ %tmp8257 = getelementptr inbounds float* %tmp8256, i64 1
+ %tmp8258 = getelementptr inbounds float* %tmp8257, i64 1
+ %tmp8259 = getelementptr inbounds float* %tmp8258, i64 1
+ %tmp8260 = getelementptr inbounds float* %tmp8259, i64 1
+ %tmp8261 = getelementptr inbounds float* %tmp8260, i64 1
+ %tmp8262 = getelementptr inbounds float* %tmp8261, i64 1
+ %tmp8263 = getelementptr inbounds float* %tmp8262, i64 1
+ %tmp8264 = getelementptr inbounds float* %tmp8263, i64 1
+ %tmp8265 = getelementptr inbounds float* %tmp8264, i64 1
+ %tmp8266 = getelementptr inbounds float* %tmp8265, i64 1
+ %tmp8267 = getelementptr inbounds float* %tmp8266, i64 1
+ %tmp8268 = getelementptr inbounds float* %tmp8267, i64 1
+ %tmp8269 = getelementptr inbounds float* %tmp8268, i64 1
+ %tmp8270 = getelementptr inbounds float* %tmp8269, i64 1
+ %tmp8271 = getelementptr inbounds float* %tmp8270, i64 1
+ %tmp8272 = getelementptr inbounds float* %tmp8271, i64 1
+ %tmp8273 = getelementptr inbounds float* %tmp8272, i64 1
+ %tmp8274 = getelementptr inbounds float* %tmp8273, i64 1
+ %tmp8275 = getelementptr inbounds float* %tmp8274, i64 1
+ %tmp8276 = getelementptr inbounds float* %tmp8275, i64 1
+ %tmp8277 = getelementptr inbounds float* %tmp8276, i64 1
+ %tmp8278 = getelementptr inbounds float* %tmp8277, i64 1
+ %tmp8279 = getelementptr inbounds float* %tmp8278, i64 1
+ %tmp8280 = getelementptr inbounds float* %tmp8279, i64 1
+ %tmp8281 = getelementptr inbounds float* %tmp8280, i64 1
+ %tmp8282 = getelementptr inbounds float* %tmp8281, i64 1
+ %tmp8283 = getelementptr inbounds float* %tmp8282, i64 1
+ %tmp8284 = getelementptr inbounds float* %tmp8283, i64 1
+ %tmp8285 = getelementptr inbounds float* %tmp8284, i64 1
+ %tmp8286 = getelementptr inbounds float* %tmp8285, i64 1
+ %tmp8287 = getelementptr inbounds float* %tmp8286, i64 1
+ %tmp8288 = getelementptr inbounds float* %tmp8287, i64 1
+ %tmp8289 = getelementptr inbounds float* %tmp8288, i64 1
+ %tmp8290 = getelementptr inbounds float* %tmp8289, i64 1
+ %tmp8291 = getelementptr inbounds float* %tmp8290, i64 1
+ %tmp8292 = getelementptr inbounds float* %tmp8291, i64 1
+ %tmp8293 = getelementptr inbounds float* %tmp8292, i64 1
+ %tmp8294 = getelementptr inbounds float* %tmp8293, i64 1
+ %tmp8295 = getelementptr inbounds float* %tmp8294, i64 1
+ %tmp8296 = getelementptr inbounds float* %tmp8295, i64 1
+ %tmp8297 = getelementptr inbounds float* %tmp8296, i64 1
+ %tmp8298 = getelementptr inbounds float* %tmp8297, i64 1
+ %tmp8299 = getelementptr inbounds float* %tmp8298, i64 1
+ %tmp8300 = getelementptr inbounds float* %tmp8299, i64 1
+ %tmp8301 = getelementptr inbounds float* %tmp8300, i64 1
+ %tmp8302 = getelementptr inbounds float* %tmp8301, i64 1
+ %tmp8303 = getelementptr inbounds float* %tmp8302, i64 1
+ %tmp8304 = getelementptr inbounds float* %tmp8303, i64 1
+ %tmp8305 = getelementptr inbounds float* %tmp8304, i64 1
+ %tmp8306 = getelementptr inbounds float* %tmp8305, i64 1
+ %tmp8307 = getelementptr inbounds float* %tmp8306, i64 1
+ %tmp8308 = getelementptr inbounds float* %tmp8307, i64 1
+ %tmp8309 = getelementptr inbounds float* %tmp8308, i64 1
+ %tmp8310 = getelementptr inbounds float* %tmp8309, i64 1
+ %tmp8311 = getelementptr inbounds float* %tmp8310, i64 1
+ %tmp8312 = getelementptr inbounds float* %tmp8311, i64 1
+ %tmp8313 = getelementptr inbounds float* %tmp8312, i64 1
+ %tmp8314 = getelementptr inbounds float* %tmp8313, i64 1
+ %tmp8315 = getelementptr inbounds float* %tmp8314, i64 1
+ %tmp8316 = getelementptr inbounds float* %tmp8315, i64 1
+ %tmp8317 = getelementptr inbounds float* %tmp8316, i64 1
+ %tmp8318 = getelementptr inbounds float* %tmp8317, i64 1
+ %tmp8319 = getelementptr inbounds float* %tmp8318, i64 1
+ %tmp8320 = getelementptr inbounds float* %tmp8319, i64 1
+ %tmp8321 = getelementptr inbounds float* %tmp8320, i64 1
+ %tmp8322 = getelementptr inbounds float* %tmp8321, i64 1
+ %tmp8323 = getelementptr inbounds float* %tmp8322, i64 1
+ %tmp8324 = getelementptr inbounds float* %tmp8323, i64 1
+ %tmp8325 = getelementptr inbounds float* %tmp8324, i64 1
+ %tmp8326 = getelementptr inbounds float* %tmp8325, i64 1
+ %tmp8327 = getelementptr inbounds float* %tmp8326, i64 1
+ %tmp8328 = getelementptr inbounds float* %tmp8327, i64 1
+ %tmp8329 = getelementptr inbounds float* %tmp8328, i64 1
+ %tmp8330 = getelementptr inbounds float* %tmp8329, i64 1
+ %tmp8331 = getelementptr inbounds float* %tmp8330, i64 1
+ %tmp8332 = getelementptr inbounds float* %tmp8331, i64 1
+ %tmp8333 = getelementptr inbounds float* %tmp8332, i64 1
+ %tmp8334 = getelementptr inbounds float* %tmp8333, i64 1
+ %tmp8335 = getelementptr inbounds float* %tmp8334, i64 1
+ %tmp8336 = getelementptr inbounds float* %tmp8335, i64 1
+ %tmp8337 = getelementptr inbounds float* %tmp8336, i64 1
+ %tmp8338 = getelementptr inbounds float* %tmp8337, i64 1
+ %tmp8339 = getelementptr inbounds float* %tmp8338, i64 1
+ %tmp8340 = getelementptr inbounds float* %tmp8339, i64 1
+ %tmp8341 = getelementptr inbounds float* %tmp8340, i64 1
+ %tmp8342 = getelementptr inbounds float* %tmp8341, i64 1
+ %tmp8343 = getelementptr inbounds float* %tmp8342, i64 1
+ %tmp8344 = getelementptr inbounds float* %tmp8343, i64 1
+ %tmp8345 = getelementptr inbounds float* %tmp8344, i64 1
+ %tmp8346 = getelementptr inbounds float* %tmp8345, i64 1
+ %tmp8347 = getelementptr inbounds float* %tmp8346, i64 1
+ %tmp8348 = getelementptr inbounds float* %tmp8347, i64 1
+ %tmp8349 = getelementptr inbounds float* %tmp8348, i64 1
+ %tmp8350 = getelementptr inbounds float* %tmp8349, i64 1
+ %tmp8351 = getelementptr inbounds float* %tmp8350, i64 1
+ %tmp8352 = getelementptr inbounds float* %tmp8351, i64 1
+ %tmp8353 = getelementptr inbounds float* %tmp8352, i64 1
+ %tmp8354 = getelementptr inbounds float* %tmp8353, i64 1
+ %tmp8355 = getelementptr inbounds float* %tmp8354, i64 1
+ %tmp8356 = getelementptr inbounds float* %tmp8355, i64 1
+ %tmp8357 = getelementptr inbounds float* %tmp8356, i64 1
+ %tmp8358 = getelementptr inbounds float* %tmp8357, i64 1
+ %tmp8359 = getelementptr inbounds float* %tmp8358, i64 1
+ %tmp8360 = getelementptr inbounds float* %tmp8359, i64 1
+ %tmp8361 = getelementptr inbounds float* %tmp8360, i64 1
+ %tmp8362 = getelementptr inbounds float* %tmp8361, i64 1
+ %tmp8363 = getelementptr inbounds float* %tmp8362, i64 1
+ %tmp8364 = getelementptr inbounds float* %tmp8363, i64 1
+ %tmp8365 = getelementptr inbounds float* %tmp8364, i64 1
+ %tmp8366 = getelementptr inbounds float* %tmp8365, i64 1
+ %tmp8367 = getelementptr inbounds float* %tmp8366, i64 1
+ %tmp8368 = getelementptr inbounds float* %tmp8367, i64 1
+ %tmp8369 = getelementptr inbounds float* %tmp8368, i64 1
+ %tmp8370 = getelementptr inbounds float* %tmp8369, i64 1
+ %tmp8371 = getelementptr inbounds float* %tmp8370, i64 1
+ %tmp8372 = getelementptr inbounds float* %tmp8371, i64 1
+ %tmp8373 = getelementptr inbounds float* %tmp8372, i64 1
+ %tmp8374 = getelementptr inbounds float* %tmp8373, i64 1
+ %tmp8375 = getelementptr inbounds float* %tmp8374, i64 1
+ %tmp8376 = getelementptr inbounds float* %tmp8375, i64 1
+ %tmp8377 = getelementptr inbounds float* %tmp8376, i64 1
+ %tmp8378 = getelementptr inbounds float* %tmp8377, i64 1
+ %tmp8379 = getelementptr inbounds float* %tmp8378, i64 1
+ %tmp8380 = getelementptr inbounds float* %tmp8379, i64 1
+ %tmp8381 = getelementptr inbounds float* %tmp8380, i64 1
+ %tmp8382 = getelementptr inbounds float* %tmp8381, i64 1
+ %tmp8383 = getelementptr inbounds float* %tmp8382, i64 1
+ %tmp8384 = getelementptr inbounds float* %tmp8383, i64 1
+ %tmp8385 = getelementptr inbounds float* %tmp8384, i64 1
+ %tmp8386 = getelementptr inbounds float* %tmp8385, i64 1
+ %tmp8387 = getelementptr inbounds float* %tmp8386, i64 1
+ %tmp8388 = getelementptr inbounds float* %tmp8387, i64 1
+ %tmp8389 = getelementptr inbounds float* %tmp8388, i64 1
+ %tmp8390 = getelementptr inbounds float* %tmp8389, i64 1
+ %tmp8391 = getelementptr inbounds float* %tmp8390, i64 1
+ %tmp8392 = getelementptr inbounds float* %tmp8391, i64 1
+ %tmp8393 = getelementptr inbounds float* %tmp8392, i64 1
+ %tmp8394 = getelementptr inbounds float* %tmp8393, i64 1
+ %tmp8395 = getelementptr inbounds float* %tmp8394, i64 1
+ %tmp8396 = getelementptr inbounds float* %tmp8395, i64 1
+ %tmp8397 = getelementptr inbounds float* %tmp8396, i64 1
+ %tmp8398 = getelementptr inbounds float* %tmp8397, i64 1
+ %tmp8399 = getelementptr inbounds float* %tmp8398, i64 1
+ %tmp8400 = getelementptr inbounds float* %tmp8399, i64 1
+ %tmp8401 = getelementptr inbounds float* %tmp8400, i64 1
+ %tmp8402 = getelementptr inbounds float* %tmp8401, i64 1
+ %tmp8403 = getelementptr inbounds float* %tmp8402, i64 1
+ %tmp8404 = getelementptr inbounds float* %tmp8403, i64 1
+ %tmp8405 = getelementptr inbounds float* %tmp8404, i64 1
+ %tmp8406 = getelementptr inbounds float* %tmp8405, i64 1
+ %tmp8407 = getelementptr inbounds float* %tmp8406, i64 1
+ %tmp8408 = getelementptr inbounds float* %tmp8407, i64 1
+ %tmp8409 = getelementptr inbounds float* %tmp8408, i64 1
+ %tmp8410 = getelementptr inbounds float* %tmp8409, i64 1
+ %tmp8411 = getelementptr inbounds float* %tmp8410, i64 1
+ %tmp8412 = getelementptr inbounds float* %tmp8411, i64 1
+ %tmp8413 = getelementptr inbounds float* %tmp8412, i64 1
+ %tmp8414 = getelementptr inbounds float* %tmp8413, i64 1
+ %tmp8415 = getelementptr inbounds float* %tmp8414, i64 1
+ %tmp8416 = getelementptr inbounds float* %tmp8415, i64 1
+ %tmp8417 = getelementptr inbounds float* %tmp8416, i64 1
+ %tmp8418 = getelementptr inbounds float* %tmp8417, i64 1
+ %tmp8419 = getelementptr inbounds float* %tmp8418, i64 1
+ %tmp8420 = getelementptr inbounds float* %tmp8419, i64 1
+ %tmp8421 = getelementptr inbounds float* %tmp8420, i64 1
+ %tmp8422 = getelementptr inbounds float* %tmp8421, i64 1
+ %tmp8423 = getelementptr inbounds float* %tmp8422, i64 1
+ %tmp8424 = getelementptr inbounds float* %tmp8423, i64 1
+ %tmp8425 = getelementptr inbounds float* %tmp8424, i64 1
+ %tmp8426 = getelementptr inbounds float* %tmp8425, i64 1
+ %tmp8427 = getelementptr inbounds float* %tmp8426, i64 1
+ %tmp8428 = getelementptr inbounds float* %tmp8427, i64 1
+ %tmp8429 = getelementptr inbounds float* %tmp8428, i64 1
+ %tmp8430 = getelementptr inbounds float* %tmp8429, i64 1
+ %tmp8431 = getelementptr inbounds float* %tmp8430, i64 1
+ %tmp8432 = getelementptr inbounds float* %tmp8431, i64 1
+ %tmp8433 = getelementptr inbounds float* %tmp8432, i64 1
+ %tmp8434 = getelementptr inbounds float* %tmp8433, i64 1
+ %tmp8435 = getelementptr inbounds float* %tmp8434, i64 1
+ %tmp8436 = getelementptr inbounds float* %tmp8435, i64 1
+ %tmp8437 = getelementptr inbounds float* %tmp8436, i64 1
+ %tmp8438 = getelementptr inbounds float* %tmp8437, i64 1
+ %tmp8439 = getelementptr inbounds float* %tmp8438, i64 1
+ %tmp8440 = getelementptr inbounds float* %tmp8439, i64 1
+ %tmp8441 = getelementptr inbounds float* %tmp8440, i64 1
+ %tmp8442 = getelementptr inbounds float* %tmp8441, i64 1
+ %tmp8443 = getelementptr inbounds float* %tmp8442, i64 1
+ %tmp8444 = getelementptr inbounds float* %tmp8443, i64 1
+ %tmp8445 = getelementptr inbounds float* %tmp8444, i64 1
+ %tmp8446 = getelementptr inbounds float* %tmp8445, i64 1
+ %tmp8447 = getelementptr inbounds float* %tmp8446, i64 1
+ %tmp8448 = getelementptr inbounds float* %tmp8447, i64 1
+ %tmp8449 = getelementptr inbounds float* %tmp8448, i64 1
+ %tmp8450 = getelementptr inbounds float* %tmp8449, i64 1
+ %tmp8451 = getelementptr inbounds float* %tmp8450, i64 1
+ %tmp8452 = getelementptr inbounds float* %tmp8451, i64 1
+ %tmp8453 = getelementptr inbounds float* %tmp8452, i64 1
+ %tmp8454 = getelementptr inbounds float* %tmp8453, i64 1
+ %tmp8455 = getelementptr inbounds float* %tmp8454, i64 1
+ %tmp8456 = getelementptr inbounds float* %tmp8455, i64 1
+ %tmp8457 = getelementptr inbounds float* %tmp8456, i64 1
+ %tmp8458 = getelementptr inbounds float* %tmp8457, i64 1
+ %tmp8459 = getelementptr inbounds float* %tmp8458, i64 1
+ %tmp8460 = getelementptr inbounds float* %tmp8459, i64 1
+ %tmp8461 = getelementptr inbounds float* %tmp8460, i64 1
+ %tmp8462 = getelementptr inbounds float* %tmp8461, i64 1
+ %tmp8463 = getelementptr inbounds float* %tmp8462, i64 1
+ %tmp8464 = getelementptr inbounds float* %tmp8463, i64 1
+ %tmp8465 = getelementptr inbounds float* %tmp8464, i64 1
+ %tmp8466 = getelementptr inbounds float* %tmp8465, i64 1
+ %tmp8467 = getelementptr inbounds float* %tmp8466, i64 1
+ %tmp8468 = getelementptr inbounds float* %tmp8467, i64 1
+ %tmp8469 = getelementptr inbounds float* %tmp8468, i64 1
+ %tmp8470 = getelementptr inbounds float* %tmp8469, i64 1
+ %tmp8471 = getelementptr inbounds float* %tmp8470, i64 1
+ %tmp8472 = getelementptr inbounds float* %tmp8471, i64 1
+ %tmp8473 = getelementptr inbounds float* %tmp8472, i64 1
+ %tmp8474 = getelementptr inbounds float* %tmp8473, i64 1
+ %tmp8475 = getelementptr inbounds float* %tmp8474, i64 1
+ %tmp8476 = getelementptr inbounds float* %tmp8475, i64 1
+ %tmp8477 = getelementptr inbounds float* %tmp8476, i64 1
+ %tmp8478 = getelementptr inbounds float* %tmp8477, i64 1
+ %tmp8479 = getelementptr inbounds float* %tmp8478, i64 1
+ %tmp8480 = getelementptr inbounds float* %tmp8479, i64 1
+ %tmp8481 = getelementptr inbounds float* %tmp8480, i64 1
+ %tmp8482 = getelementptr inbounds float* %tmp8481, i64 1
+ %tmp8483 = getelementptr inbounds float* %tmp8482, i64 1
+ %tmp8484 = getelementptr inbounds float* %tmp8483, i64 1
+ %tmp8485 = getelementptr inbounds float* %tmp8484, i64 1
+ %tmp8486 = getelementptr inbounds float* %tmp8485, i64 1
+ %tmp8487 = getelementptr inbounds float* %tmp8486, i64 1
+ %tmp8488 = getelementptr inbounds float* %tmp8487, i64 1
+ %tmp8489 = getelementptr inbounds float* %tmp8488, i64 1
+ %tmp8490 = getelementptr inbounds float* %tmp8489, i64 1
+ %tmp8491 = getelementptr inbounds float* %tmp8490, i64 1
+ %tmp8492 = getelementptr inbounds float* %tmp8491, i64 1
+ %tmp8493 = getelementptr inbounds float* %tmp8492, i64 1
+ %tmp8494 = getelementptr inbounds float* %tmp8493, i64 1
+ %tmp8495 = getelementptr inbounds float* %tmp8494, i64 1
+ %tmp8496 = getelementptr inbounds float* %tmp8495, i64 1
+ %tmp8497 = getelementptr inbounds float* %tmp8496, i64 1
+ %tmp8498 = getelementptr inbounds float* %tmp8497, i64 1
+ %tmp8499 = getelementptr inbounds float* %tmp8498, i64 1
+ %tmp8500 = getelementptr inbounds float* %tmp8499, i64 1
+ %tmp8501 = getelementptr inbounds float* %tmp8500, i64 1
+ %tmp8502 = getelementptr inbounds float* %tmp8501, i64 1
+ %tmp8503 = getelementptr inbounds float* %tmp8502, i64 1
+ %tmp8504 = getelementptr inbounds float* %tmp8503, i64 1
+ %tmp8505 = getelementptr inbounds float* %tmp8504, i64 1
+ %tmp8506 = getelementptr inbounds float* %tmp8505, i64 1
+ %tmp8507 = getelementptr inbounds float* %tmp8506, i64 1
+ %tmp8508 = getelementptr inbounds float* %tmp8507, i64 1
+ %tmp8509 = getelementptr inbounds float* %tmp8508, i64 1
+ %tmp8510 = getelementptr inbounds float* %tmp8509, i64 1
+ %tmp8511 = getelementptr inbounds float* %tmp8510, i64 1
+ %tmp8512 = getelementptr inbounds float* %tmp8511, i64 1
+ %tmp8513 = getelementptr inbounds float* %tmp8512, i64 1
+ %tmp8514 = getelementptr inbounds float* %tmp8513, i64 1
+ %tmp8515 = getelementptr inbounds float* %tmp8514, i64 1
+ %tmp8516 = getelementptr inbounds float* %tmp8515, i64 1
+ %tmp8517 = getelementptr inbounds float* %tmp8516, i64 1
+ %tmp8518 = getelementptr inbounds float* %tmp8517, i64 1
+ %tmp8519 = getelementptr inbounds float* %tmp8518, i64 1
+ %tmp8520 = getelementptr inbounds float* %tmp8519, i64 1
+ %tmp8521 = getelementptr inbounds float* %tmp8520, i64 1
+ %tmp8522 = getelementptr inbounds float* %tmp8521, i64 1
+ %tmp8523 = getelementptr inbounds float* %tmp8522, i64 1
+ %tmp8524 = getelementptr inbounds float* %tmp8523, i64 1
+ %tmp8525 = getelementptr inbounds float* %tmp8524, i64 1
+ %tmp8526 = getelementptr inbounds float* %tmp8525, i64 1
+ %tmp8527 = getelementptr inbounds float* %tmp8526, i64 1
+ %tmp8528 = getelementptr inbounds float* %tmp8527, i64 1
+ %tmp8529 = getelementptr inbounds float* %tmp8528, i64 1
+ %tmp8530 = getelementptr inbounds float* %tmp8529, i64 1
+ %tmp8531 = getelementptr inbounds float* %tmp8530, i64 1
+ %tmp8532 = getelementptr inbounds float* %tmp8531, i64 1
+ %tmp8533 = getelementptr inbounds float* %tmp8532, i64 1
+ %tmp8534 = getelementptr inbounds float* %tmp8533, i64 1
+ %tmp8535 = getelementptr inbounds float* %tmp8534, i64 1
+ %tmp8536 = getelementptr inbounds float* %tmp8535, i64 1
+ %tmp8537 = getelementptr inbounds float* %tmp8536, i64 1
+ %tmp8538 = getelementptr inbounds float* %tmp8537, i64 1
+ %tmp8539 = getelementptr inbounds float* %tmp8538, i64 1
+ %tmp8540 = getelementptr inbounds float* %tmp8539, i64 1
+ %tmp8541 = getelementptr inbounds float* %tmp8540, i64 1
+ %tmp8542 = getelementptr inbounds float* %tmp8541, i64 1
+ %tmp8543 = getelementptr inbounds float* %tmp8542, i64 1
+ %tmp8544 = getelementptr inbounds float* %tmp8543, i64 1
+ %tmp8545 = getelementptr inbounds float* %tmp8544, i64 1
+ %tmp8546 = getelementptr inbounds float* %tmp8545, i64 1
+ %tmp8547 = getelementptr inbounds float* %tmp8546, i64 1
+ %tmp8548 = getelementptr inbounds float* %tmp8547, i64 1
+ %tmp8549 = getelementptr inbounds float* %tmp8548, i64 1
+ %tmp8550 = getelementptr inbounds float* %tmp8549, i64 1
+ %tmp8551 = getelementptr inbounds float* %tmp8550, i64 1
+ %tmp8552 = getelementptr inbounds float* %tmp8551, i64 1
+ %tmp8553 = getelementptr inbounds float* %tmp8552, i64 1
+ %tmp8554 = getelementptr inbounds float* %tmp8553, i64 1
+ %tmp8555 = getelementptr inbounds float* %tmp8554, i64 1
+ %tmp8556 = getelementptr inbounds float* %tmp8555, i64 1
+ %tmp8557 = getelementptr inbounds float* %tmp8556, i64 1
+ %tmp8558 = getelementptr inbounds float* %tmp8557, i64 1
+ %tmp8559 = getelementptr inbounds float* %tmp8558, i64 1
+ %tmp8560 = getelementptr inbounds float* %tmp8559, i64 1
+ %tmp8561 = getelementptr inbounds float* %tmp8560, i64 1
+ %tmp8562 = getelementptr inbounds float* %tmp8561, i64 1
+ %tmp8563 = getelementptr inbounds float* %tmp8562, i64 1
+ %tmp8564 = getelementptr inbounds float* %tmp8563, i64 1
+ %tmp8565 = getelementptr inbounds float* %tmp8564, i64 1
+ %tmp8566 = getelementptr inbounds float* %tmp8565, i64 1
+ %tmp8567 = getelementptr inbounds float* %tmp8566, i64 1
+ %tmp8568 = getelementptr inbounds float* %tmp8567, i64 1
+ %tmp8569 = getelementptr inbounds float* %tmp8568, i64 1
+ %tmp8570 = getelementptr inbounds float* %tmp8569, i64 1
+ %tmp8571 = getelementptr inbounds float* %tmp8570, i64 1
+ %tmp8572 = getelementptr inbounds float* %tmp8571, i64 1
+ %tmp8573 = getelementptr inbounds float* %tmp8572, i64 1
+ %tmp8574 = getelementptr inbounds float* %tmp8573, i64 1
+ %tmp8575 = getelementptr inbounds float* %tmp8574, i64 1
+ %tmp8576 = getelementptr inbounds float* %tmp8575, i64 1
+ %tmp8577 = getelementptr inbounds float* %tmp8576, i64 1
+ %tmp8578 = getelementptr inbounds float* %tmp8577, i64 1
+ %tmp8579 = getelementptr inbounds float* %tmp8578, i64 1
+ %tmp8580 = getelementptr inbounds float* %tmp8579, i64 1
+ %tmp8581 = getelementptr inbounds float* %tmp8580, i64 1
+ %tmp8582 = getelementptr inbounds float* %tmp8581, i64 1
+ %tmp8583 = getelementptr inbounds float* %tmp8582, i64 1
+ %tmp8584 = getelementptr inbounds float* %tmp8583, i64 1
+ %tmp8585 = getelementptr inbounds float* %tmp8584, i64 1
+ %tmp8586 = getelementptr inbounds float* %tmp8585, i64 1
+ %tmp8587 = getelementptr inbounds float* %tmp8586, i64 1
+ %tmp8588 = getelementptr inbounds float* %tmp8587, i64 1
+ %tmp8589 = getelementptr inbounds float* %tmp8588, i64 1
+ %tmp8590 = getelementptr inbounds float* %tmp8589, i64 1
+ %tmp8591 = getelementptr inbounds float* %tmp8590, i64 1
+ %tmp8592 = getelementptr inbounds float* %tmp8591, i64 1
+ %tmp8593 = getelementptr inbounds float* %tmp8592, i64 1
+ %tmp8594 = getelementptr inbounds float* %tmp8593, i64 1
+ %tmp8595 = getelementptr inbounds float* %tmp8594, i64 1
+ %tmp8596 = getelementptr inbounds float* %tmp8595, i64 1
+ %tmp8597 = getelementptr inbounds float* %tmp8596, i64 1
+ %tmp8598 = getelementptr inbounds float* %tmp8597, i64 1
+ %tmp8599 = getelementptr inbounds float* %tmp8598, i64 1
+ %tmp8600 = getelementptr inbounds float* %tmp8599, i64 1
+ %tmp8601 = getelementptr inbounds float* %tmp8600, i64 1
+ %tmp8602 = getelementptr inbounds float* %tmp8601, i64 1
+ %tmp8603 = getelementptr inbounds float* %tmp8602, i64 1
+ %tmp8604 = getelementptr inbounds float* %tmp8603, i64 1
+ %tmp8605 = getelementptr inbounds float* %tmp8604, i64 1
+ %tmp8606 = getelementptr inbounds float* %tmp8605, i64 1
+ %tmp8607 = getelementptr inbounds float* %tmp8606, i64 1
+ %tmp8608 = getelementptr inbounds float* %tmp8607, i64 1
+ %tmp8609 = getelementptr inbounds float* %tmp8608, i64 1
+ %tmp8610 = getelementptr inbounds float* %tmp8609, i64 1
+ %tmp8611 = getelementptr inbounds float* %tmp8610, i64 1
+ %tmp8612 = getelementptr inbounds float* %tmp8611, i64 1
+ %tmp8613 = getelementptr inbounds float* %tmp8612, i64 1
+ %tmp8614 = getelementptr inbounds float* %tmp8613, i64 1
+ %tmp8615 = getelementptr inbounds float* %tmp8614, i64 1
+ %tmp8616 = getelementptr inbounds float* %tmp8615, i64 1
+ %tmp8617 = getelementptr inbounds float* %tmp8616, i64 1
+ %tmp8618 = getelementptr inbounds float* %tmp8617, i64 1
+ %tmp8619 = getelementptr inbounds float* %tmp8618, i64 1
+ %tmp8620 = getelementptr inbounds float* %tmp8619, i64 1
+ %tmp8621 = getelementptr inbounds float* %tmp8620, i64 1
+ %tmp8622 = getelementptr inbounds float* %tmp8621, i64 1
+ %tmp8623 = getelementptr inbounds float* %tmp8622, i64 1
+ %tmp8624 = getelementptr inbounds float* %tmp8623, i64 1
+ %tmp8625 = getelementptr inbounds float* %tmp8624, i64 1
+ %tmp8626 = getelementptr inbounds float* %tmp8625, i64 1
+ %tmp8627 = getelementptr inbounds float* %tmp8626, i64 1
+ %tmp8628 = getelementptr inbounds float* %tmp8627, i64 1
+ %tmp8629 = getelementptr inbounds float* %tmp8628, i64 1
+ %tmp8630 = getelementptr inbounds float* %tmp8629, i64 1
+ %tmp8631 = getelementptr inbounds float* %tmp8630, i64 1
+ %tmp8632 = getelementptr inbounds float* %tmp8631, i64 1
+ %tmp8633 = getelementptr inbounds float* %tmp8632, i64 1
+ %tmp8634 = getelementptr inbounds float* %tmp8633, i64 1
+ %tmp8635 = getelementptr inbounds float* %tmp8634, i64 1
+ %tmp8636 = getelementptr inbounds float* %tmp8635, i64 1
+ %tmp8637 = getelementptr inbounds float* %tmp8636, i64 1
+ %tmp8638 = getelementptr inbounds float* %tmp8637, i64 1
+ %tmp8639 = getelementptr inbounds float* %tmp8638, i64 1
+ %tmp8640 = getelementptr inbounds float* %tmp8639, i64 1
+ %tmp8641 = getelementptr inbounds float* %tmp8640, i64 1
+ %tmp8642 = getelementptr inbounds float* %tmp8641, i64 1
+ %tmp8643 = getelementptr inbounds float* %tmp8642, i64 1
+ %tmp8644 = getelementptr inbounds float* %tmp8643, i64 1
+ %tmp8645 = getelementptr inbounds float* %tmp8644, i64 1
+ %tmp8646 = getelementptr inbounds float* %tmp8645, i64 1
+ %tmp8647 = getelementptr inbounds float* %tmp8646, i64 1
+ %tmp8648 = getelementptr inbounds float* %tmp8647, i64 1
+ %tmp8649 = getelementptr inbounds float* %tmp8648, i64 1
+ %tmp8650 = getelementptr inbounds float* %tmp8649, i64 1
+ %tmp8651 = getelementptr inbounds float* %tmp8650, i64 1
+ %tmp8652 = getelementptr inbounds float* %tmp8651, i64 1
+ %tmp8653 = getelementptr inbounds float* %tmp8652, i64 1
+ %tmp8654 = getelementptr inbounds float* %tmp8653, i64 1
+ %tmp8655 = getelementptr inbounds float* %tmp8654, i64 1
+ %tmp8656 = getelementptr inbounds float* %tmp8655, i64 1
+ %tmp8657 = getelementptr inbounds float* %tmp8656, i64 1
+ %tmp8658 = getelementptr inbounds float* %tmp8657, i64 1
+ %tmp8659 = getelementptr inbounds float* %tmp8658, i64 1
+ %tmp8660 = getelementptr inbounds float* %tmp8659, i64 1
+ %tmp8661 = getelementptr inbounds float* %tmp8660, i64 1
+ %tmp8662 = getelementptr inbounds float* %tmp8661, i64 1
+ %tmp8663 = getelementptr inbounds float* %tmp8662, i64 1
+ %tmp8664 = getelementptr inbounds float* %tmp8663, i64 1
+ %tmp8665 = getelementptr inbounds float* %tmp8664, i64 1
+ %tmp8666 = getelementptr inbounds float* %tmp8665, i64 1
+ %tmp8667 = getelementptr inbounds float* %tmp8666, i64 1
+ %tmp8668 = getelementptr inbounds float* %tmp8667, i64 1
+ %tmp8669 = getelementptr inbounds float* %tmp8668, i64 1
+ %tmp8670 = getelementptr inbounds float* %tmp8669, i64 1
+ %tmp8671 = getelementptr inbounds float* %tmp8670, i64 1
+ %tmp8672 = getelementptr inbounds float* %tmp8671, i64 1
+ %tmp8673 = getelementptr inbounds float* %tmp8672, i64 1
+ %tmp8674 = getelementptr inbounds float* %tmp8673, i64 1
+ %tmp8675 = getelementptr inbounds float* %tmp8674, i64 1
+ %tmp8676 = getelementptr inbounds float* %tmp8675, i64 1
+ %tmp8677 = getelementptr inbounds float* %tmp8676, i64 1
+ %tmp8678 = getelementptr inbounds float* %tmp8677, i64 1
+ %tmp8679 = getelementptr inbounds float* %tmp8678, i64 1
+ %tmp8680 = getelementptr inbounds float* %tmp8679, i64 1
+ %tmp8681 = getelementptr inbounds float* %tmp8680, i64 1
+ %tmp8682 = getelementptr inbounds float* %tmp8681, i64 1
+ %tmp8683 = getelementptr inbounds float* %tmp8682, i64 1
+ %tmp8684 = getelementptr inbounds float* %tmp8683, i64 1
+ %tmp8685 = getelementptr inbounds float* %tmp8684, i64 1
+ %tmp8686 = getelementptr inbounds float* %tmp8685, i64 1
+ %tmp8687 = getelementptr inbounds float* %tmp8686, i64 1
+ %tmp8688 = getelementptr inbounds float* %tmp8687, i64 1
+ %tmp8689 = getelementptr inbounds float* %tmp8688, i64 1
+ %tmp8690 = getelementptr inbounds float* %tmp8689, i64 1
+ %tmp8691 = getelementptr inbounds float* %tmp8690, i64 1
+ %tmp8692 = getelementptr inbounds float* %tmp8691, i64 1
+ %tmp8693 = getelementptr inbounds float* %tmp8692, i64 1
+ %tmp8694 = getelementptr inbounds float* %tmp8693, i64 1
+ %tmp8695 = getelementptr inbounds float* %tmp8694, i64 1
+ %tmp8696 = getelementptr inbounds float* %tmp8695, i64 1
+ %tmp8697 = getelementptr inbounds float* %tmp8696, i64 1
+ %tmp8698 = getelementptr inbounds float* %tmp8697, i64 1
+ %tmp8699 = getelementptr inbounds float* %tmp8698, i64 1
+ %tmp8700 = getelementptr inbounds float* %tmp8699, i64 1
+ %tmp8701 = getelementptr inbounds float* %tmp8700, i64 1
+ %tmp8702 = getelementptr inbounds float* %tmp8701, i64 1
+ %tmp8703 = getelementptr inbounds float* %tmp8702, i64 1
+ %tmp8704 = getelementptr inbounds float* %tmp8703, i64 1
+ %tmp8705 = getelementptr inbounds float* %tmp8704, i64 1
+ %tmp8706 = getelementptr inbounds float* %tmp8705, i64 1
+ %tmp8707 = getelementptr inbounds float* %tmp8706, i64 1
+ %tmp8708 = getelementptr inbounds float* %tmp8707, i64 1
+ %tmp8709 = getelementptr inbounds float* %tmp8708, i64 1
+ %tmp8710 = getelementptr inbounds float* %tmp8709, i64 1
+ %tmp8711 = getelementptr inbounds float* %tmp8710, i64 1
+ %tmp8712 = getelementptr inbounds float* %tmp8711, i64 1
+ %tmp8713 = getelementptr inbounds float* %tmp8712, i64 1
+ %tmp8714 = getelementptr inbounds float* %tmp8713, i64 1
+ %tmp8715 = getelementptr inbounds float* %tmp8714, i64 1
+ %tmp8716 = getelementptr inbounds float* %tmp8715, i64 1
+ %tmp8717 = getelementptr inbounds float* %tmp8716, i64 1
+ %tmp8718 = getelementptr inbounds float* %tmp8717, i64 1
+ %tmp8719 = getelementptr inbounds float* %tmp8718, i64 1
+ %tmp8720 = getelementptr inbounds float* %tmp8719, i64 1
+ %tmp8721 = getelementptr inbounds float* %tmp8720, i64 1
+ %tmp8722 = getelementptr inbounds float* %tmp8721, i64 1
+ %tmp8723 = getelementptr inbounds float* %tmp8722, i64 1
+ %tmp8724 = getelementptr inbounds float* %tmp8723, i64 1
+ %tmp8725 = getelementptr inbounds float* %tmp8724, i64 1
+ %tmp8726 = getelementptr inbounds float* %tmp8725, i64 1
+ %tmp8727 = getelementptr inbounds float* %tmp8726, i64 1
+ %tmp8728 = getelementptr inbounds float* %tmp8727, i64 1
+ %tmp8729 = getelementptr inbounds float* %tmp8728, i64 1
+ %tmp8730 = getelementptr inbounds float* %tmp8729, i64 1
+ %tmp8731 = getelementptr inbounds float* %tmp8730, i64 1
+ %tmp8732 = getelementptr inbounds float* %tmp8731, i64 1
+ %tmp8733 = getelementptr inbounds float* %tmp8732, i64 1
+ %tmp8734 = getelementptr inbounds float* %tmp8733, i64 1
+ %tmp8735 = getelementptr inbounds float* %tmp8734, i64 1
+ %tmp8736 = getelementptr inbounds float* %tmp8735, i64 1
+ %tmp8737 = getelementptr inbounds float* %tmp8736, i64 1
+ %tmp8738 = getelementptr inbounds float* %tmp8737, i64 1
+ %tmp8739 = getelementptr inbounds float* %tmp8738, i64 1
+ %tmp8740 = getelementptr inbounds float* %tmp8739, i64 1
+ %tmp8741 = getelementptr inbounds float* %tmp8740, i64 1
+ %tmp8742 = getelementptr inbounds float* %tmp8741, i64 1
+ %tmp8743 = getelementptr inbounds float* %tmp8742, i64 1
+ %tmp8744 = getelementptr inbounds float* %tmp8743, i64 1
+ %tmp8745 = getelementptr inbounds float* %tmp8744, i64 1
+ %tmp8746 = getelementptr inbounds float* %tmp8745, i64 1
+ %tmp8747 = getelementptr inbounds float* %tmp8746, i64 1
+ %tmp8748 = getelementptr inbounds float* %tmp8747, i64 1
+ %tmp8749 = getelementptr inbounds float* %tmp8748, i64 1
+ %tmp8750 = getelementptr inbounds float* %tmp8749, i64 1
+ %tmp8751 = getelementptr inbounds float* %tmp8750, i64 1
+ %tmp8752 = getelementptr inbounds float* %tmp8751, i64 1
+ %tmp8753 = getelementptr inbounds float* %tmp8752, i64 1
+ %tmp8754 = getelementptr inbounds float* %tmp8753, i64 1
+ %tmp8755 = getelementptr inbounds float* %tmp8754, i64 1
+ %tmp8756 = getelementptr inbounds float* %tmp8755, i64 1
+ %tmp8757 = getelementptr inbounds float* %tmp8756, i64 1
+ %tmp8758 = getelementptr inbounds float* %tmp8757, i64 1
+ %tmp8759 = getelementptr inbounds float* %tmp8758, i64 1
+ %tmp8760 = getelementptr inbounds float* %tmp8759, i64 1
+ %tmp8761 = getelementptr inbounds float* %tmp8760, i64 1
+ %tmp8762 = getelementptr inbounds float* %tmp8761, i64 1
+ %tmp8763 = getelementptr inbounds float* %tmp8762, i64 1
+ %tmp8764 = getelementptr inbounds float* %tmp8763, i64 1
+ %tmp8765 = getelementptr inbounds float* %tmp8764, i64 1
+ %tmp8766 = getelementptr inbounds float* %tmp8765, i64 1
+ %tmp8767 = getelementptr inbounds float* %tmp8766, i64 1
+ %tmp8768 = getelementptr inbounds float* %tmp8767, i64 1
+ %tmp8769 = getelementptr inbounds float* %tmp8768, i64 1
+ %tmp8770 = getelementptr inbounds float* %tmp8769, i64 1
+ %tmp8771 = getelementptr inbounds float* %tmp8770, i64 1
+ %tmp8772 = getelementptr inbounds float* %tmp8771, i64 1
+ %tmp8773 = getelementptr inbounds float* %tmp8772, i64 1
+ %tmp8774 = getelementptr inbounds float* %tmp8773, i64 1
+ %tmp8775 = getelementptr inbounds float* %tmp8774, i64 1
+ %tmp8776 = getelementptr inbounds float* %tmp8775, i64 1
+ %tmp8777 = getelementptr inbounds float* %tmp8776, i64 1
+ %tmp8778 = getelementptr inbounds float* %tmp8777, i64 1
+ %tmp8779 = getelementptr inbounds float* %tmp8778, i64 1
+ %tmp8780 = getelementptr inbounds float* %tmp8779, i64 1
+ %tmp8781 = getelementptr inbounds float* %tmp8780, i64 1
+ %tmp8782 = getelementptr inbounds float* %tmp8781, i64 1
+ %tmp8783 = getelementptr inbounds float* %tmp8782, i64 1
+ %tmp8784 = getelementptr inbounds float* %tmp8783, i64 1
+ %tmp8785 = getelementptr inbounds float* %tmp8784, i64 1
+ %tmp8786 = getelementptr inbounds float* %tmp8785, i64 1
+ %tmp8787 = getelementptr inbounds float* %tmp8786, i64 1
+ %tmp8788 = getelementptr inbounds float* %tmp8787, i64 1
+ %tmp8789 = getelementptr inbounds float* %tmp8788, i64 1
+ %tmp8790 = getelementptr inbounds float* %tmp8789, i64 1
+ %tmp8791 = getelementptr inbounds float* %tmp8790, i64 1
+ %tmp8792 = getelementptr inbounds float* %tmp8791, i64 1
+ %tmp8793 = getelementptr inbounds float* %tmp8792, i64 1
+ %tmp8794 = getelementptr inbounds float* %tmp8793, i64 1
+ %tmp8795 = getelementptr inbounds float* %tmp8794, i64 1
+ %tmp8796 = getelementptr inbounds float* %tmp8795, i64 1
+ %tmp8797 = getelementptr inbounds float* %tmp8796, i64 1
+ %tmp8798 = getelementptr inbounds float* %tmp8797, i64 1
+ %tmp8799 = getelementptr inbounds float* %tmp8798, i64 1
+ %tmp8800 = getelementptr inbounds float* %tmp8799, i64 1
+ %tmp8801 = getelementptr inbounds float* %tmp8800, i64 1
+ %tmp8802 = getelementptr inbounds float* %tmp8801, i64 1
+ %tmp8803 = getelementptr inbounds float* %tmp8802, i64 1
+ %tmp8804 = getelementptr inbounds float* %tmp8803, i64 1
+ %tmp8805 = getelementptr inbounds float* %tmp8804, i64 1
+ %tmp8806 = getelementptr inbounds float* %tmp8805, i64 1
+ %tmp8807 = getelementptr inbounds float* %tmp8806, i64 1
+ %tmp8808 = getelementptr inbounds float* %tmp8807, i64 1
+ %tmp8809 = getelementptr inbounds float* %tmp8808, i64 1
+ %tmp8810 = getelementptr inbounds float* %tmp8809, i64 1
+ %tmp8811 = getelementptr inbounds float* %tmp8810, i64 1
+ %tmp8812 = getelementptr inbounds float* %tmp8811, i64 1
+ %tmp8813 = getelementptr inbounds float* %tmp8812, i64 1
+ %tmp8814 = getelementptr inbounds float* %tmp8813, i64 1
+ %tmp8815 = getelementptr inbounds float* %tmp8814, i64 1
+ %tmp8816 = getelementptr inbounds float* %tmp8815, i64 1
+ %tmp8817 = getelementptr inbounds float* %tmp8816, i64 1
+ %tmp8818 = getelementptr inbounds float* %tmp8817, i64 1
+ %tmp8819 = getelementptr inbounds float* %tmp8818, i64 1
+ %tmp8820 = getelementptr inbounds float* %tmp8819, i64 1
+ %tmp8821 = getelementptr inbounds float* %tmp8820, i64 1
+ %tmp8822 = getelementptr inbounds float* %tmp8821, i64 1
+ %tmp8823 = getelementptr inbounds float* %tmp8822, i64 1
+ %tmp8824 = getelementptr inbounds float* %tmp8823, i64 1
+ %tmp8825 = getelementptr inbounds float* %tmp8824, i64 1
+ %tmp8826 = getelementptr inbounds float* %tmp8825, i64 1
+ %tmp8827 = getelementptr inbounds float* %tmp8826, i64 1
+ %tmp8828 = getelementptr inbounds float* %tmp8827, i64 1
+ %tmp8829 = getelementptr inbounds float* %tmp8828, i64 1
+ %tmp8830 = getelementptr inbounds float* %tmp8829, i64 1
+ %tmp8831 = getelementptr inbounds float* %tmp8830, i64 1
+ %tmp8832 = getelementptr inbounds float* %tmp8831, i64 1
+ %tmp8833 = getelementptr inbounds float* %tmp8832, i64 1
+ %tmp8834 = getelementptr inbounds float* %tmp8833, i64 1
+ %tmp8835 = getelementptr inbounds float* %tmp8834, i64 1
+ %tmp8836 = getelementptr inbounds float* %tmp8835, i64 1
+ %tmp8837 = getelementptr inbounds float* %tmp8836, i64 1
+ %tmp8838 = getelementptr inbounds float* %tmp8837, i64 1
+ %tmp8839 = getelementptr inbounds float* %tmp8838, i64 1
+ %tmp8840 = getelementptr inbounds float* %tmp8839, i64 1
+ %tmp8841 = getelementptr inbounds float* %tmp8840, i64 1
+ %tmp8842 = getelementptr inbounds float* %tmp8841, i64 1
+ %tmp8843 = getelementptr inbounds float* %tmp8842, i64 1
+ %tmp8844 = getelementptr inbounds float* %tmp8843, i64 1
+ %tmp8845 = getelementptr inbounds float* %tmp8844, i64 1
+ %tmp8846 = getelementptr inbounds float* %tmp8845, i64 1
+ %tmp8847 = getelementptr inbounds float* %tmp8846, i64 1
+ %tmp8848 = getelementptr inbounds float* %tmp8847, i64 1
+ %tmp8849 = getelementptr inbounds float* %tmp8848, i64 1
+ %tmp8850 = getelementptr inbounds float* %tmp8849, i64 1
+ %tmp8851 = getelementptr inbounds float* %tmp8850, i64 1
+ %tmp8852 = getelementptr inbounds float* %tmp8851, i64 1
+ %tmp8853 = getelementptr inbounds float* %tmp8852, i64 1
+ %tmp8854 = getelementptr inbounds float* %tmp8853, i64 1
+ %tmp8855 = getelementptr inbounds float* %tmp8854, i64 1
+ %tmp8856 = getelementptr inbounds float* %tmp8855, i64 1
+ %tmp8857 = getelementptr inbounds float* %tmp8856, i64 1
+ %tmp8858 = getelementptr inbounds float* %tmp8857, i64 1
+ %tmp8859 = getelementptr inbounds float* %tmp8858, i64 1
+ %tmp8860 = getelementptr inbounds float* %tmp8859, i64 1
+ %tmp8861 = getelementptr inbounds float* %tmp8860, i64 1
+ %tmp8862 = getelementptr inbounds float* %tmp8861, i64 1
+ %tmp8863 = getelementptr inbounds float* %tmp8862, i64 1
+ %tmp8864 = getelementptr inbounds float* %tmp8863, i64 1
+ %tmp8865 = getelementptr inbounds float* %tmp8864, i64 1
+ %tmp8866 = getelementptr inbounds float* %tmp8865, i64 1
+ %tmp8867 = getelementptr inbounds float* %tmp8866, i64 1
+ %tmp8868 = getelementptr inbounds float* %tmp8867, i64 1
+ %tmp8869 = getelementptr inbounds float* %tmp8868, i64 1
+ %tmp8870 = getelementptr inbounds float* %tmp8869, i64 1
+ %tmp8871 = getelementptr inbounds float* %tmp8870, i64 1
+ %tmp8872 = getelementptr inbounds float* %tmp8871, i64 1
+ %tmp8873 = getelementptr inbounds float* %tmp8872, i64 1
+ %tmp8874 = getelementptr inbounds float* %tmp8873, i64 1
+ %tmp8875 = getelementptr inbounds float* %tmp8874, i64 1
+ %tmp8876 = getelementptr inbounds float* %tmp8875, i64 1
+ %tmp8877 = getelementptr inbounds float* %tmp8876, i64 1
+ %tmp8878 = getelementptr inbounds float* %tmp8877, i64 1
+ %tmp8879 = getelementptr inbounds float* %tmp8878, i64 1
+ %tmp8880 = getelementptr inbounds float* %tmp8879, i64 1
+ %tmp8881 = getelementptr inbounds float* %tmp8880, i64 1
+ %tmp8882 = getelementptr inbounds float* %tmp8881, i64 1
+ %tmp8883 = getelementptr inbounds float* %tmp8882, i64 1
+ %tmp8884 = getelementptr inbounds float* %tmp8883, i64 1
+ %tmp8885 = getelementptr inbounds float* %tmp8884, i64 1
+ %tmp8886 = getelementptr inbounds float* %tmp8885, i64 1
+ %tmp8887 = getelementptr inbounds float* %tmp8886, i64 1
+ %tmp8888 = getelementptr inbounds float* %tmp8887, i64 1
+ %tmp8889 = getelementptr inbounds float* %tmp8888, i64 1
+ %tmp8890 = getelementptr inbounds float* %tmp8889, i64 1
+ %tmp8891 = getelementptr inbounds float* %tmp8890, i64 1
+ %tmp8892 = getelementptr inbounds float* %tmp8891, i64 1
+ %tmp8893 = getelementptr inbounds float* %tmp8892, i64 1
+ %tmp8894 = getelementptr inbounds float* %tmp8893, i64 1
+ %tmp8895 = getelementptr inbounds float* %tmp8894, i64 1
+ %tmp8896 = getelementptr inbounds float* %tmp8895, i64 1
+ %tmp8897 = getelementptr inbounds float* %tmp8896, i64 1
+ %tmp8898 = getelementptr inbounds float* %tmp8897, i64 1
+ %tmp8899 = getelementptr inbounds float* %tmp8898, i64 1
+ %tmp8900 = getelementptr inbounds float* %tmp8899, i64 1
+ %tmp8901 = getelementptr inbounds float* %tmp8900, i64 1
+ %tmp8902 = getelementptr inbounds float* %tmp8901, i64 1
+ %tmp8903 = getelementptr inbounds float* %tmp8902, i64 1
+ %tmp8904 = getelementptr inbounds float* %tmp8903, i64 1
+ %tmp8905 = getelementptr inbounds float* %tmp8904, i64 1
+ %tmp8906 = getelementptr inbounds float* %tmp8905, i64 1
+ %tmp8907 = getelementptr inbounds float* %tmp8906, i64 1
+ %tmp8908 = getelementptr inbounds float* %tmp8907, i64 1
+ %tmp8909 = getelementptr inbounds float* %tmp8908, i64 1
+ %tmp8910 = getelementptr inbounds float* %tmp8909, i64 1
+ %tmp8911 = getelementptr inbounds float* %tmp8910, i64 1
+ %tmp8912 = getelementptr inbounds float* %tmp8911, i64 1
+ %tmp8913 = getelementptr inbounds float* %tmp8912, i64 1
+ %tmp8914 = getelementptr inbounds float* %tmp8913, i64 1
+ %tmp8915 = getelementptr inbounds float* %tmp8914, i64 1
+ %tmp8916 = getelementptr inbounds float* %tmp8915, i64 1
+ %tmp8917 = getelementptr inbounds float* %tmp8916, i64 1
+ %tmp8918 = getelementptr inbounds float* %tmp8917, i64 1
+ %tmp8919 = getelementptr inbounds float* %tmp8918, i64 1
+ %tmp8920 = getelementptr inbounds float* %tmp8919, i64 1
+ %tmp8921 = getelementptr inbounds float* %tmp8920, i64 1
+ %tmp8922 = getelementptr inbounds float* %tmp8921, i64 1
+ %tmp8923 = getelementptr inbounds float* %tmp8922, i64 1
+ %tmp8924 = getelementptr inbounds float* %tmp8923, i64 1
+ %tmp8925 = getelementptr inbounds float* %tmp8924, i64 1
+ %tmp8926 = getelementptr inbounds float* %tmp8925, i64 1
+ %tmp8927 = getelementptr inbounds float* %tmp8926, i64 1
+ %tmp8928 = getelementptr inbounds float* %tmp8927, i64 1
+ %tmp8929 = getelementptr inbounds float* %tmp8928, i64 1
+ %tmp8930 = getelementptr inbounds float* %tmp8929, i64 1
+ %tmp8931 = getelementptr inbounds float* %tmp8930, i64 1
+ %tmp8932 = getelementptr inbounds float* %tmp8931, i64 1
+ %tmp8933 = getelementptr inbounds float* %tmp8932, i64 1
+ %tmp8934 = getelementptr inbounds float* %tmp8933, i64 1
+ %tmp8935 = getelementptr inbounds float* %tmp8934, i64 1
+ %tmp8936 = getelementptr inbounds float* %tmp8935, i64 1
+ %tmp8937 = getelementptr inbounds float* %tmp8936, i64 1
+ %tmp8938 = getelementptr inbounds float* %tmp8937, i64 1
+ %tmp8939 = getelementptr inbounds float* %tmp8938, i64 1
+ %tmp8940 = getelementptr inbounds float* %tmp8939, i64 1
+ %tmp8941 = getelementptr inbounds float* %tmp8940, i64 1
+ %tmp8942 = getelementptr inbounds float* %tmp8941, i64 1
+ %tmp8943 = getelementptr inbounds float* %tmp8942, i64 1
+ %tmp8944 = getelementptr inbounds float* %tmp8943, i64 1
+ %tmp8945 = getelementptr inbounds float* %tmp8944, i64 1
+ %tmp8946 = getelementptr inbounds float* %tmp8945, i64 1
+ %tmp8947 = getelementptr inbounds float* %tmp8946, i64 1
+ %tmp8948 = getelementptr inbounds float* %tmp8947, i64 1
+ %tmp8949 = getelementptr inbounds float* %tmp8948, i64 1
+ %tmp8950 = getelementptr inbounds float* %tmp8949, i64 1
+ %tmp8951 = getelementptr inbounds float* %tmp8950, i64 1
+ %tmp8952 = getelementptr inbounds float* %tmp8951, i64 1
+ %tmp8953 = getelementptr inbounds float* %tmp8952, i64 1
+ %tmp8954 = getelementptr inbounds float* %tmp8953, i64 1
+ %tmp8955 = getelementptr inbounds float* %tmp8954, i64 1
+ %tmp8956 = getelementptr inbounds float* %tmp8955, i64 1
+ %tmp8957 = getelementptr inbounds float* %tmp8956, i64 1
+ %tmp8958 = getelementptr inbounds float* %tmp8957, i64 1
+ %tmp8959 = getelementptr inbounds float* %tmp8958, i64 1
+ %tmp8960 = getelementptr inbounds float* %tmp8959, i64 1
+ %tmp8961 = getelementptr inbounds float* %tmp8960, i64 1
+ %tmp8962 = getelementptr inbounds float* %tmp8961, i64 1
+ %tmp8963 = getelementptr inbounds float* %tmp8962, i64 1
+ %tmp8964 = getelementptr inbounds float* %tmp8963, i64 1
+ %tmp8965 = getelementptr inbounds float* %tmp8964, i64 1
+ %tmp8966 = getelementptr inbounds float* %tmp8965, i64 1
+ %tmp8967 = getelementptr inbounds float* %tmp8966, i64 1
+ %tmp8968 = getelementptr inbounds float* %tmp8967, i64 1
+ %tmp8969 = getelementptr inbounds float* %tmp8968, i64 1
+ %tmp8970 = getelementptr inbounds float* %tmp8969, i64 1
+ %tmp8971 = getelementptr inbounds float* %tmp8970, i64 1
+ %tmp8972 = getelementptr inbounds float* %tmp8971, i64 1
+ %tmp8973 = getelementptr inbounds float* %tmp8972, i64 1
+ %tmp8974 = getelementptr inbounds float* %tmp8973, i64 1
+ %tmp8975 = getelementptr inbounds float* %tmp8974, i64 1
+ %tmp8976 = getelementptr inbounds float* %tmp8975, i64 1
+ %tmp8977 = getelementptr inbounds float* %tmp8976, i64 1
+ %tmp8978 = getelementptr inbounds float* %tmp8977, i64 1
+ %tmp8979 = getelementptr inbounds float* %tmp8978, i64 1
+ %tmp8980 = getelementptr inbounds float* %tmp8979, i64 1
+ %tmp8981 = getelementptr inbounds float* %tmp8980, i64 1
+ %tmp8982 = getelementptr inbounds float* %tmp8981, i64 1
+ %tmp8983 = getelementptr inbounds float* %tmp8982, i64 1
+ %tmp8984 = getelementptr inbounds float* %tmp8983, i64 1
+ %tmp8985 = getelementptr inbounds float* %tmp8984, i64 1
+ %tmp8986 = getelementptr inbounds float* %tmp8985, i64 1
+ %tmp8987 = getelementptr inbounds float* %tmp8986, i64 1
+ %tmp8988 = getelementptr inbounds float* %tmp8987, i64 1
+ %tmp8989 = getelementptr inbounds float* %tmp8988, i64 1
+ %tmp8990 = getelementptr inbounds float* %tmp8989, i64 1
+ %tmp8991 = getelementptr inbounds float* %tmp8990, i64 1
+ %tmp8992 = getelementptr inbounds float* %tmp8991, i64 1
+ %tmp8993 = getelementptr inbounds float* %tmp8992, i64 1
+ %tmp8994 = getelementptr inbounds float* %tmp8993, i64 1
+ %tmp8995 = getelementptr inbounds float* %tmp8994, i64 1
+ %tmp8996 = getelementptr inbounds float* %tmp8995, i64 1
+ %tmp8997 = getelementptr inbounds float* %tmp8996, i64 1
+ %tmp8998 = getelementptr inbounds float* %tmp8997, i64 1
+ %tmp8999 = getelementptr inbounds float* %tmp8998, i64 1
+ %tmp9000 = getelementptr inbounds float* %tmp8999, i64 1
+ %tmp9001 = getelementptr inbounds float* %tmp9000, i64 1
+ %tmp9002 = getelementptr inbounds float* %tmp9001, i64 1
+ %tmp9003 = getelementptr inbounds float* %tmp9002, i64 1
+ %tmp9004 = getelementptr inbounds float* %tmp9003, i64 1
+ %tmp9005 = getelementptr inbounds float* %tmp9004, i64 1
+ %tmp9006 = getelementptr inbounds float* %tmp9005, i64 1
+ %tmp9007 = getelementptr inbounds float* %tmp9006, i64 1
+ %tmp9008 = getelementptr inbounds float* %tmp9007, i64 1
+ %tmp9009 = getelementptr inbounds float* %tmp9008, i64 1
+ %tmp9010 = getelementptr inbounds float* %tmp9009, i64 1
+ %tmp9011 = getelementptr inbounds float* %tmp9010, i64 1
+ %tmp9012 = getelementptr inbounds float* %tmp9011, i64 1
+ %tmp9013 = getelementptr inbounds float* %tmp9012, i64 1
+ %tmp9014 = getelementptr inbounds float* %tmp9013, i64 1
+ %tmp9015 = getelementptr inbounds float* %tmp9014, i64 1
+ %tmp9016 = getelementptr inbounds float* %tmp9015, i64 1
+ %tmp9017 = getelementptr inbounds float* %tmp9016, i64 1
+ %tmp9018 = getelementptr inbounds float* %tmp9017, i64 1
+ %tmp9019 = getelementptr inbounds float* %tmp9018, i64 1
+ %tmp9020 = getelementptr inbounds float* %tmp9019, i64 1
+ %tmp9021 = getelementptr inbounds float* %tmp9020, i64 1
+ %tmp9022 = getelementptr inbounds float* %tmp9021, i64 1
+ %tmp9023 = getelementptr inbounds float* %tmp9022, i64 1
+ %tmp9024 = getelementptr inbounds float* %tmp9023, i64 1
+ %tmp9025 = getelementptr inbounds float* %tmp9024, i64 1
+ %tmp9026 = getelementptr inbounds float* %tmp9025, i64 1
+ %tmp9027 = getelementptr inbounds float* %tmp9026, i64 1
+ %tmp9028 = getelementptr inbounds float* %tmp9027, i64 1
+ %tmp9029 = getelementptr inbounds float* %tmp9028, i64 1
+ %tmp9030 = getelementptr inbounds float* %tmp9029, i64 1
+ %tmp9031 = getelementptr inbounds float* %tmp9030, i64 1
+ %tmp9032 = getelementptr inbounds float* %tmp9031, i64 1
+ %tmp9033 = getelementptr inbounds float* %tmp9032, i64 1
+ %tmp9034 = getelementptr inbounds float* %tmp9033, i64 1
+ %tmp9035 = getelementptr inbounds float* %tmp9034, i64 1
+ %tmp9036 = getelementptr inbounds float* %tmp9035, i64 1
+ %tmp9037 = getelementptr inbounds float* %tmp9036, i64 1
+ %tmp9038 = getelementptr inbounds float* %tmp9037, i64 1
+ %tmp9039 = getelementptr inbounds float* %tmp9038, i64 1
+ %tmp9040 = getelementptr inbounds float* %tmp9039, i64 1
+ %tmp9041 = getelementptr inbounds float* %tmp9040, i64 1
+ %tmp9042 = getelementptr inbounds float* %tmp9041, i64 1
+ %tmp9043 = getelementptr inbounds float* %tmp9042, i64 1
+ %tmp9044 = getelementptr inbounds float* %tmp9043, i64 1
+ %tmp9045 = getelementptr inbounds float* %tmp9044, i64 1
+ %tmp9046 = getelementptr inbounds float* %tmp9045, i64 1
+ %tmp9047 = getelementptr inbounds float* %tmp9046, i64 1
+ %tmp9048 = getelementptr inbounds float* %tmp9047, i64 1
+ %tmp9049 = getelementptr inbounds float* %tmp9048, i64 1
+ %tmp9050 = getelementptr inbounds float* %tmp9049, i64 1
+ %tmp9051 = getelementptr inbounds float* %tmp9050, i64 1
+ %tmp9052 = getelementptr inbounds float* %tmp9051, i64 1
+ %tmp9053 = getelementptr inbounds float* %tmp9052, i64 1
+ %tmp9054 = getelementptr inbounds float* %tmp9053, i64 1
+ %tmp9055 = getelementptr inbounds float* %tmp9054, i64 1
+ %tmp9056 = getelementptr inbounds float* %tmp9055, i64 1
+ %tmp9057 = getelementptr inbounds float* %tmp9056, i64 1
+ %tmp9058 = getelementptr inbounds float* %tmp9057, i64 1
+ %tmp9059 = getelementptr inbounds float* %tmp9058, i64 1
+ %tmp9060 = getelementptr inbounds float* %tmp9059, i64 1
+ %tmp9061 = getelementptr inbounds float* %tmp9060, i64 1
+ %tmp9062 = getelementptr inbounds float* %tmp9061, i64 1
+ %tmp9063 = getelementptr inbounds float* %tmp9062, i64 1
+ %tmp9064 = getelementptr inbounds float* %tmp9063, i64 1
+ %tmp9065 = getelementptr inbounds float* %tmp9064, i64 1
+ %tmp9066 = getelementptr inbounds float* %tmp9065, i64 1
+ %tmp9067 = getelementptr inbounds float* %tmp9066, i64 1
+ %tmp9068 = getelementptr inbounds float* %tmp9067, i64 1
+ %tmp9069 = getelementptr inbounds float* %tmp9068, i64 1
+ %tmp9070 = getelementptr inbounds float* %tmp9069, i64 1
+ %tmp9071 = getelementptr inbounds float* %tmp9070, i64 1
+ %tmp9072 = getelementptr inbounds float* %tmp9071, i64 1
+ %tmp9073 = getelementptr inbounds float* %tmp9072, i64 1
+ %tmp9074 = getelementptr inbounds float* %tmp9073, i64 1
+ %tmp9075 = getelementptr inbounds float* %tmp9074, i64 1
+ %tmp9076 = getelementptr inbounds float* %tmp9075, i64 1
+ %tmp9077 = getelementptr inbounds float* %tmp9076, i64 1
+ %tmp9078 = getelementptr inbounds float* %tmp9077, i64 1
+ %tmp9079 = getelementptr inbounds float* %tmp9078, i64 1
+ %tmp9080 = getelementptr inbounds float* %tmp9079, i64 1
+ %tmp9081 = getelementptr inbounds float* %tmp9080, i64 1
+ %tmp9082 = getelementptr inbounds float* %tmp9081, i64 1
+ %tmp9083 = getelementptr inbounds float* %tmp9082, i64 1
+ %tmp9084 = getelementptr inbounds float* %tmp9083, i64 1
+ %tmp9085 = getelementptr inbounds float* %tmp9084, i64 1
+ %tmp9086 = getelementptr inbounds float* %tmp9085, i64 1
+ %tmp9087 = getelementptr inbounds float* %tmp9086, i64 1
+ %tmp9088 = getelementptr inbounds float* %tmp9087, i64 1
+ %tmp9089 = getelementptr inbounds float* %tmp9088, i64 1
+ %tmp9090 = getelementptr inbounds float* %tmp9089, i64 1
+ %tmp9091 = getelementptr inbounds float* %tmp9090, i64 1
+ %tmp9092 = getelementptr inbounds float* %tmp9091, i64 1
+ %tmp9093 = getelementptr inbounds float* %tmp9092, i64 1
+ %tmp9094 = getelementptr inbounds float* %tmp9093, i64 1
+ %tmp9095 = getelementptr inbounds float* %tmp9094, i64 1
+ %tmp9096 = getelementptr inbounds float* %tmp9095, i64 1
+ %tmp9097 = getelementptr inbounds float* %tmp9096, i64 1
+ %tmp9098 = getelementptr inbounds float* %tmp9097, i64 1
+ %tmp9099 = getelementptr inbounds float* %tmp9098, i64 1
+ %tmp9100 = getelementptr inbounds float* %tmp9099, i64 1
+ %tmp9101 = getelementptr inbounds float* %tmp9100, i64 1
+ %tmp9102 = getelementptr inbounds float* %tmp9101, i64 1
+ %tmp9103 = getelementptr inbounds float* %tmp9102, i64 1
+ %tmp9104 = getelementptr inbounds float* %tmp9103, i64 1
+ %tmp9105 = getelementptr inbounds float* %tmp9104, i64 1
+ %tmp9106 = getelementptr inbounds float* %tmp9105, i64 1
+ %tmp9107 = getelementptr inbounds float* %tmp9106, i64 1
+ %tmp9108 = getelementptr inbounds float* %tmp9107, i64 1
+ %tmp9109 = getelementptr inbounds float* %tmp9108, i64 1
+ %tmp9110 = getelementptr inbounds float* %tmp9109, i64 1
+ %tmp9111 = getelementptr inbounds float* %tmp9110, i64 1
+ %tmp9112 = getelementptr inbounds float* %tmp9111, i64 1
+ %tmp9113 = getelementptr inbounds float* %tmp9112, i64 1
+ %tmp9114 = getelementptr inbounds float* %tmp9113, i64 1
+ %tmp9115 = getelementptr inbounds float* %tmp9114, i64 1
+ %tmp9116 = getelementptr inbounds float* %tmp9115, i64 1
+ %tmp9117 = getelementptr inbounds float* %tmp9116, i64 1
+ %tmp9118 = getelementptr inbounds float* %tmp9117, i64 1
+ %tmp9119 = getelementptr inbounds float* %tmp9118, i64 1
+ %tmp9120 = getelementptr inbounds float* %tmp9119, i64 1
+ %tmp9121 = getelementptr inbounds float* %tmp9120, i64 1
+ %tmp9122 = getelementptr inbounds float* %tmp9121, i64 1
+ %tmp9123 = getelementptr inbounds float* %tmp9122, i64 1
+ %tmp9124 = getelementptr inbounds float* %tmp9123, i64 1
+ %tmp9125 = getelementptr inbounds float* %tmp9124, i64 1
+ %tmp9126 = getelementptr inbounds float* %tmp9125, i64 1
+ %tmp9127 = getelementptr inbounds float* %tmp9126, i64 1
+ %tmp9128 = getelementptr inbounds float* %tmp9127, i64 1
+ %tmp9129 = getelementptr inbounds float* %tmp9128, i64 1
+ %tmp9130 = getelementptr inbounds float* %tmp9129, i64 1
+ %tmp9131 = getelementptr inbounds float* %tmp9130, i64 1
+ %tmp9132 = getelementptr inbounds float* %tmp9131, i64 1
+ %tmp9133 = getelementptr inbounds float* %tmp9132, i64 1
+ %tmp9134 = getelementptr inbounds float* %tmp9133, i64 1
+ %tmp9135 = getelementptr inbounds float* %tmp9134, i64 1
+ %tmp9136 = getelementptr inbounds float* %tmp9135, i64 1
+ %tmp9137 = getelementptr inbounds float* %tmp9136, i64 1
+ %tmp9138 = getelementptr inbounds float* %tmp9137, i64 1
+ %tmp9139 = getelementptr inbounds float* %tmp9138, i64 1
+ %tmp9140 = getelementptr inbounds float* %tmp9139, i64 1
+ %tmp9141 = getelementptr inbounds float* %tmp9140, i64 1
+ %tmp9142 = getelementptr inbounds float* %tmp9141, i64 1
+ %tmp9143 = getelementptr inbounds float* %tmp9142, i64 1
+ %tmp9144 = getelementptr inbounds float* %tmp9143, i64 1
+ %tmp9145 = getelementptr inbounds float* %tmp9144, i64 1
+ %tmp9146 = getelementptr inbounds float* %tmp9145, i64 1
+ %tmp9147 = getelementptr inbounds float* %tmp9146, i64 1
+ %tmp9148 = getelementptr inbounds float* %tmp9147, i64 1
+ %tmp9149 = getelementptr inbounds float* %tmp9148, i64 1
+ %tmp9150 = getelementptr inbounds float* %tmp9149, i64 1
+ %tmp9151 = getelementptr inbounds float* %tmp9150, i64 1
+ %tmp9152 = getelementptr inbounds float* %tmp9151, i64 1
+ %tmp9153 = getelementptr inbounds float* %tmp9152, i64 1
+ %tmp9154 = getelementptr inbounds float* %tmp9153, i64 1
+ %tmp9155 = getelementptr inbounds float* %tmp9154, i64 1
+ %tmp9156 = getelementptr inbounds float* %tmp9155, i64 1
+ %tmp9157 = getelementptr inbounds float* %tmp9156, i64 1
+ %tmp9158 = getelementptr inbounds float* %tmp9157, i64 1
+ %tmp9159 = getelementptr inbounds float* %tmp9158, i64 1
+ %tmp9160 = getelementptr inbounds float* %tmp9159, i64 1
+ %tmp9161 = getelementptr inbounds float* %tmp9160, i64 1
+ %tmp9162 = getelementptr inbounds float* %tmp9161, i64 1
+ %tmp9163 = getelementptr inbounds float* %tmp9162, i64 1
+ %tmp9164 = getelementptr inbounds float* %tmp9163, i64 1
+ %tmp9165 = getelementptr inbounds float* %tmp9164, i64 1
+ %tmp9166 = getelementptr inbounds float* %tmp9165, i64 1
+ %tmp9167 = getelementptr inbounds float* %tmp9166, i64 1
+ %tmp9168 = getelementptr inbounds float* %tmp9167, i64 1
+ %tmp9169 = getelementptr inbounds float* %tmp9168, i64 1
+ %tmp9170 = getelementptr inbounds float* %tmp9169, i64 1
+ %tmp9171 = getelementptr inbounds float* %tmp9170, i64 1
+ %tmp9172 = getelementptr inbounds float* %tmp9171, i64 1
+ %tmp9173 = getelementptr inbounds float* %tmp9172, i64 1
+ %tmp9174 = getelementptr inbounds float* %tmp9173, i64 1
+ %tmp9175 = getelementptr inbounds float* %tmp9174, i64 1
+ %tmp9176 = getelementptr inbounds float* %tmp9175, i64 1
+ %tmp9177 = getelementptr inbounds float* %tmp9176, i64 1
+ %tmp9178 = getelementptr inbounds float* %tmp9177, i64 1
+ %tmp9179 = getelementptr inbounds float* %tmp9178, i64 1
+ %tmp9180 = getelementptr inbounds float* %tmp9179, i64 1
+ %tmp9181 = getelementptr inbounds float* %tmp9180, i64 1
+ %tmp9182 = getelementptr inbounds float* %tmp9181, i64 1
+ %tmp9183 = getelementptr inbounds float* %tmp9182, i64 1
+ %tmp9184 = getelementptr inbounds float* %tmp9183, i64 1
+ %tmp9185 = getelementptr inbounds float* %tmp9184, i64 1
+ %tmp9186 = getelementptr inbounds float* %tmp9185, i64 1
+ %tmp9187 = getelementptr inbounds float* %tmp9186, i64 1
+ %tmp9188 = getelementptr inbounds float* %tmp9187, i64 1
+ %tmp9189 = getelementptr inbounds float* %tmp9188, i64 1
+ %tmp9190 = getelementptr inbounds float* %tmp9189, i64 1
+ %tmp9191 = getelementptr inbounds float* %tmp9190, i64 1
+ %tmp9192 = getelementptr inbounds float* %tmp9191, i64 1
+ %tmp9193 = getelementptr inbounds float* %tmp9192, i64 1
+ %tmp9194 = getelementptr inbounds float* %tmp9193, i64 1
+ %tmp9195 = getelementptr inbounds float* %tmp9194, i64 1
+ %tmp9196 = getelementptr inbounds float* %tmp9195, i64 1
+ %tmp9197 = getelementptr inbounds float* %tmp9196, i64 1
+ %tmp9198 = getelementptr inbounds float* %tmp9197, i64 1
+ %tmp9199 = getelementptr inbounds float* %tmp9198, i64 1
+ %tmp9200 = getelementptr inbounds float* %tmp9199, i64 1
+ %tmp9201 = getelementptr inbounds float* %tmp9200, i64 1
+ %tmp9202 = getelementptr inbounds float* %tmp9201, i64 1
+ %tmp9203 = getelementptr inbounds float* %tmp9202, i64 1
+ %tmp9204 = getelementptr inbounds float* %tmp9203, i64 1
+ %tmp9205 = getelementptr inbounds float* %tmp9204, i64 1
+ %tmp9206 = getelementptr inbounds float* %tmp9205, i64 1
+ %tmp9207 = getelementptr inbounds float* %tmp9206, i64 1
+ %tmp9208 = getelementptr inbounds float* %tmp9207, i64 1
+ %tmp9209 = getelementptr inbounds float* %tmp9208, i64 1
+ %tmp9210 = getelementptr inbounds float* %tmp9209, i64 1
+ %tmp9211 = getelementptr inbounds float* %tmp9210, i64 1
+ %tmp9212 = getelementptr inbounds float* %tmp9211, i64 1
+ %tmp9213 = getelementptr inbounds float* %tmp9212, i64 1
+ %tmp9214 = getelementptr inbounds float* %tmp9213, i64 1
+ %tmp9215 = getelementptr inbounds float* %tmp9214, i64 1
+ %tmp9216 = getelementptr inbounds float* %tmp9215, i64 1
+ %tmp9217 = getelementptr inbounds float* %tmp9216, i64 1
+ %tmp9218 = getelementptr inbounds float* %tmp9217, i64 1
+ %tmp9219 = getelementptr inbounds float* %tmp9218, i64 1
+ %tmp9220 = getelementptr inbounds float* %tmp9219, i64 1
+ %tmp9221 = getelementptr inbounds float* %tmp9220, i64 1
+ %tmp9222 = getelementptr inbounds float* %tmp9221, i64 1
+ %tmp9223 = getelementptr inbounds float* %tmp9222, i64 1
+ %tmp9224 = getelementptr inbounds float* %tmp9223, i64 1
+ %tmp9225 = getelementptr inbounds float* %tmp9224, i64 1
+ %tmp9226 = getelementptr inbounds float* %tmp9225, i64 1
+ %tmp9227 = getelementptr inbounds float* %tmp9226, i64 1
+ %tmp9228 = getelementptr inbounds float* %tmp9227, i64 1
+ %tmp9229 = getelementptr inbounds float* %tmp9228, i64 1
+ %tmp9230 = getelementptr inbounds float* %tmp9229, i64 1
+ %tmp9231 = getelementptr inbounds float* %tmp9230, i64 1
+ %tmp9232 = getelementptr inbounds float* %tmp9231, i64 1
+ %tmp9233 = getelementptr inbounds float* %tmp9232, i64 1
+ %tmp9234 = getelementptr inbounds float* %tmp9233, i64 1
+ %tmp9235 = getelementptr inbounds float* %tmp9234, i64 1
+ %tmp9236 = getelementptr inbounds float* %tmp9235, i64 1
+ %tmp9237 = getelementptr inbounds float* %tmp9236, i64 1
+ %tmp9238 = getelementptr inbounds float* %tmp9237, i64 1
+ %tmp9239 = getelementptr inbounds float* %tmp9238, i64 1
+ %tmp9240 = getelementptr inbounds float* %tmp9239, i64 1
+ %tmp9241 = getelementptr inbounds float* %tmp9240, i64 1
+ %tmp9242 = getelementptr inbounds float* %tmp9241, i64 1
+ %tmp9243 = getelementptr inbounds float* %tmp9242, i64 1
+ %tmp9244 = getelementptr inbounds float* %tmp9243, i64 1
+ %tmp9245 = getelementptr inbounds float* %tmp9244, i64 1
+ %tmp9246 = getelementptr inbounds float* %tmp9245, i64 1
+ %tmp9247 = getelementptr inbounds float* %tmp9246, i64 1
+ %tmp9248 = getelementptr inbounds float* %tmp9247, i64 1
+ %tmp9249 = getelementptr inbounds float* %tmp9248, i64 1
+ %tmp9250 = getelementptr inbounds float* %tmp9249, i64 1
+ %tmp9251 = getelementptr inbounds float* %tmp9250, i64 1
+ %tmp9252 = getelementptr inbounds float* %tmp9251, i64 1
+ %tmp9253 = getelementptr inbounds float* %tmp9252, i64 1
+ %tmp9254 = getelementptr inbounds float* %tmp9253, i64 1
+ %tmp9255 = getelementptr inbounds float* %tmp9254, i64 1
+ %tmp9256 = getelementptr inbounds float* %tmp9255, i64 1
+ %tmp9257 = getelementptr inbounds float* %tmp9256, i64 1
+ %tmp9258 = getelementptr inbounds float* %tmp9257, i64 1
+ %tmp9259 = getelementptr inbounds float* %tmp9258, i64 1
+ %tmp9260 = getelementptr inbounds float* %tmp9259, i64 1
+ %tmp9261 = getelementptr inbounds float* %tmp9260, i64 1
+ %tmp9262 = getelementptr inbounds float* %tmp9261, i64 1
+ %tmp9263 = getelementptr inbounds float* %tmp9262, i64 1
+ %tmp9264 = getelementptr inbounds float* %tmp9263, i64 1
+ %tmp9265 = getelementptr inbounds float* %tmp9264, i64 1
+ %tmp9266 = getelementptr inbounds float* %tmp9265, i64 1
+ %tmp9267 = getelementptr inbounds float* %tmp9266, i64 1
+ %tmp9268 = getelementptr inbounds float* %tmp9267, i64 1
+ %tmp9269 = getelementptr inbounds float* %tmp9268, i64 1
+ %tmp9270 = getelementptr inbounds float* %tmp9269, i64 1
+ %tmp9271 = getelementptr inbounds float* %tmp9270, i64 1
+ %tmp9272 = getelementptr inbounds float* %tmp9271, i64 1
+ %tmp9273 = getelementptr inbounds float* %tmp9272, i64 1
+ %tmp9274 = getelementptr inbounds float* %tmp9273, i64 1
+ %tmp9275 = getelementptr inbounds float* %tmp9274, i64 1
+ %tmp9276 = getelementptr inbounds float* %tmp9275, i64 1
+ %tmp9277 = getelementptr inbounds float* %tmp9276, i64 1
+ %tmp9278 = getelementptr inbounds float* %tmp9277, i64 1
+ %tmp9279 = getelementptr inbounds float* %tmp9278, i64 1
+ %tmp9280 = getelementptr inbounds float* %tmp9279, i64 1
+ %tmp9281 = getelementptr inbounds float* %tmp9280, i64 1
+ %tmp9282 = getelementptr inbounds float* %tmp9281, i64 1
+ %tmp9283 = getelementptr inbounds float* %tmp9282, i64 1
+ %tmp9284 = getelementptr inbounds float* %tmp9283, i64 1
+ %tmp9285 = getelementptr inbounds float* %tmp9284, i64 1
+ %tmp9286 = getelementptr inbounds float* %tmp9285, i64 1
+ %tmp9287 = getelementptr inbounds float* %tmp9286, i64 1
+ %tmp9288 = getelementptr inbounds float* %tmp9287, i64 1
+ %tmp9289 = getelementptr inbounds float* %tmp9288, i64 1
+ %tmp9290 = getelementptr inbounds float* %tmp9289, i64 1
+ %tmp9291 = getelementptr inbounds float* %tmp9290, i64 1
+ %tmp9292 = getelementptr inbounds float* %tmp9291, i64 1
+ %tmp9293 = getelementptr inbounds float* %tmp9292, i64 1
+ %tmp9294 = getelementptr inbounds float* %tmp9293, i64 1
+ %tmp9295 = getelementptr inbounds float* %tmp9294, i64 1
+ %tmp9296 = getelementptr inbounds float* %tmp9295, i64 1
+ %tmp9297 = getelementptr inbounds float* %tmp9296, i64 1
+ %tmp9298 = getelementptr inbounds float* %tmp9297, i64 1
+ %tmp9299 = getelementptr inbounds float* %tmp9298, i64 1
+ %tmp9300 = getelementptr inbounds float* %tmp9299, i64 1
+ %tmp9301 = getelementptr inbounds float* %tmp9300, i64 1
+ %tmp9302 = getelementptr inbounds float* %tmp9301, i64 1
+ %tmp9303 = getelementptr inbounds float* %tmp9302, i64 1
+ %tmp9304 = getelementptr inbounds float* %tmp9303, i64 1
+ %tmp9305 = getelementptr inbounds float* %tmp9304, i64 1
+ %tmp9306 = getelementptr inbounds float* %tmp9305, i64 1
+ %tmp9307 = getelementptr inbounds float* %tmp9306, i64 1
+ %tmp9308 = getelementptr inbounds float* %tmp9307, i64 1
+ %tmp9309 = getelementptr inbounds float* %tmp9308, i64 1
+ %tmp9310 = getelementptr inbounds float* %tmp9309, i64 1
+ %tmp9311 = getelementptr inbounds float* %tmp9310, i64 1
+ %tmp9312 = getelementptr inbounds float* %tmp9311, i64 1
+ %tmp9313 = getelementptr inbounds float* %tmp9312, i64 1
+ %tmp9314 = getelementptr inbounds float* %tmp9313, i64 1
+ %tmp9315 = getelementptr inbounds float* %tmp9314, i64 1
+ %tmp9316 = getelementptr inbounds float* %tmp9315, i64 1
+ %tmp9317 = getelementptr inbounds float* %tmp9316, i64 1
+ %tmp9318 = getelementptr inbounds float* %tmp9317, i64 1
+ %tmp9319 = getelementptr inbounds float* %tmp9318, i64 1
+ %tmp9320 = getelementptr inbounds float* %tmp9319, i64 1
+ %tmp9321 = getelementptr inbounds float* %tmp9320, i64 1
+ %tmp9322 = getelementptr inbounds float* %tmp9321, i64 1
+ %tmp9323 = getelementptr inbounds float* %tmp9322, i64 1
+ %tmp9324 = getelementptr inbounds float* %tmp9323, i64 1
+ %tmp9325 = getelementptr inbounds float* %tmp9324, i64 1
+ %tmp9326 = getelementptr inbounds float* %tmp9325, i64 1
+ %tmp9327 = getelementptr inbounds float* %tmp9326, i64 1
+ %tmp9328 = getelementptr inbounds float* %tmp9327, i64 1
+ %tmp9329 = getelementptr inbounds float* %tmp9328, i64 1
+ %tmp9330 = getelementptr inbounds float* %tmp9329, i64 1
+ %tmp9331 = getelementptr inbounds float* %tmp9330, i64 1
+ %tmp9332 = getelementptr inbounds float* %tmp9331, i64 1
+ %tmp9333 = getelementptr inbounds float* %tmp9332, i64 1
+ %tmp9334 = getelementptr inbounds float* %tmp9333, i64 1
+ %tmp9335 = getelementptr inbounds float* %tmp9334, i64 1
+ %tmp9336 = getelementptr inbounds float* %tmp9335, i64 1
+ %tmp9337 = getelementptr inbounds float* %tmp9336, i64 1
+ %tmp9338 = getelementptr inbounds float* %tmp9337, i64 1
+ %tmp9339 = getelementptr inbounds float* %tmp9338, i64 1
+ %tmp9340 = getelementptr inbounds float* %tmp9339, i64 1
+ %tmp9341 = getelementptr inbounds float* %tmp9340, i64 1
+ %tmp9342 = getelementptr inbounds float* %tmp9341, i64 1
+ %tmp9343 = getelementptr inbounds float* %tmp9342, i64 1
+ %tmp9344 = getelementptr inbounds float* %tmp9343, i64 1
+ %tmp9345 = getelementptr inbounds float* %tmp9344, i64 1
+ %tmp9346 = getelementptr inbounds float* %tmp9345, i64 1
+ %tmp9347 = getelementptr inbounds float* %tmp9346, i64 1
+ %tmp9348 = getelementptr inbounds float* %tmp9347, i64 1
+ %tmp9349 = getelementptr inbounds float* %tmp9348, i64 1
+ %tmp9350 = getelementptr inbounds float* %tmp9349, i64 1
+ %tmp9351 = getelementptr inbounds float* %tmp9350, i64 1
+ %tmp9352 = getelementptr inbounds float* %tmp9351, i64 1
+ %tmp9353 = getelementptr inbounds float* %tmp9352, i64 1
+ %tmp9354 = getelementptr inbounds float* %tmp9353, i64 1
+ %tmp9355 = getelementptr inbounds float* %tmp9354, i64 1
+ %tmp9356 = getelementptr inbounds float* %tmp9355, i64 1
+ %tmp9357 = getelementptr inbounds float* %tmp9356, i64 1
+ %tmp9358 = getelementptr inbounds float* %tmp9357, i64 1
+ %tmp9359 = getelementptr inbounds float* %tmp9358, i64 1
+ %tmp9360 = getelementptr inbounds float* %tmp9359, i64 1
+ %tmp9361 = getelementptr inbounds float* %tmp9360, i64 1
+ %tmp9362 = getelementptr inbounds float* %tmp9361, i64 1
+ %tmp9363 = getelementptr inbounds float* %tmp9362, i64 1
+ %tmp9364 = getelementptr inbounds float* %tmp9363, i64 1
+ %tmp9365 = getelementptr inbounds float* %tmp9364, i64 1
+ %tmp9366 = getelementptr inbounds float* %tmp9365, i64 1
+ %tmp9367 = getelementptr inbounds float* %tmp9366, i64 1
+ %tmp9368 = getelementptr inbounds float* %tmp9367, i64 1
+ %tmp9369 = getelementptr inbounds float* %tmp9368, i64 1
+ %tmp9370 = getelementptr inbounds float* %tmp9369, i64 1
+ %tmp9371 = getelementptr inbounds float* %tmp9370, i64 1
+ %tmp9372 = getelementptr inbounds float* %tmp9371, i64 1
+ %tmp9373 = getelementptr inbounds float* %tmp9372, i64 1
+ %tmp9374 = getelementptr inbounds float* %tmp9373, i64 1
+ %tmp9375 = getelementptr inbounds float* %tmp9374, i64 1
+ %tmp9376 = getelementptr inbounds float* %tmp9375, i64 1
+ %tmp9377 = getelementptr inbounds float* %tmp9376, i64 1
+ %tmp9378 = getelementptr inbounds float* %tmp9377, i64 1
+ %tmp9379 = getelementptr inbounds float* %tmp9378, i64 1
+ %tmp9380 = getelementptr inbounds float* %tmp9379, i64 1
+ %tmp9381 = getelementptr inbounds float* %tmp9380, i64 1
+ %tmp9382 = getelementptr inbounds float* %tmp9381, i64 1
+ %tmp9383 = getelementptr inbounds float* %tmp9382, i64 1
+ %tmp9384 = getelementptr inbounds float* %tmp9383, i64 1
+ %tmp9385 = getelementptr inbounds float* %tmp9384, i64 1
+ %tmp9386 = getelementptr inbounds float* %tmp9385, i64 1
+ %tmp9387 = getelementptr inbounds float* %tmp9386, i64 1
+ %tmp9388 = getelementptr inbounds float* %tmp9387, i64 1
+ %tmp9389 = getelementptr inbounds float* %tmp9388, i64 1
+ %tmp9390 = getelementptr inbounds float* %tmp9389, i64 1
+ %tmp9391 = getelementptr inbounds float* %tmp9390, i64 1
+ %tmp9392 = getelementptr inbounds float* %tmp9391, i64 1
+ %tmp9393 = getelementptr inbounds float* %tmp9392, i64 1
+ %tmp9394 = getelementptr inbounds float* %tmp9393, i64 1
+ %tmp9395 = getelementptr inbounds float* %tmp9394, i64 1
+ %tmp9396 = getelementptr inbounds float* %tmp9395, i64 1
+ %tmp9397 = getelementptr inbounds float* %tmp9396, i64 1
+ %tmp9398 = getelementptr inbounds float* %tmp9397, i64 1
+ %tmp9399 = getelementptr inbounds float* %tmp9398, i64 1
+ %tmp9400 = getelementptr inbounds float* %tmp9399, i64 1
+ %tmp9401 = getelementptr inbounds float* %tmp9400, i64 1
+ %tmp9402 = getelementptr inbounds float* %tmp9401, i64 1
+ %tmp9403 = getelementptr inbounds float* %tmp9402, i64 1
+ %tmp9404 = getelementptr inbounds float* %tmp9403, i64 1
+ %tmp9405 = getelementptr inbounds float* %tmp9404, i64 1
+ %tmp9406 = getelementptr inbounds float* %tmp9405, i64 1
+ %tmp9407 = getelementptr inbounds float* %tmp9406, i64 1
+ %tmp9408 = getelementptr inbounds float* %tmp9407, i64 1
+ %tmp9409 = getelementptr inbounds float* %tmp9408, i64 1
+ %tmp9410 = getelementptr inbounds float* %tmp9409, i64 1
+ %tmp9411 = getelementptr inbounds float* %tmp9410, i64 1
+ %tmp9412 = getelementptr inbounds float* %tmp9411, i64 1
+ %tmp9413 = getelementptr inbounds float* %tmp9412, i64 1
+ %tmp9414 = getelementptr inbounds float* %tmp9413, i64 1
+ %tmp9415 = getelementptr inbounds float* %tmp9414, i64 1
+ %tmp9416 = getelementptr inbounds float* %tmp9415, i64 1
+ %tmp9417 = getelementptr inbounds float* %tmp9416, i64 1
+ %tmp9418 = getelementptr inbounds float* %tmp9417, i64 1
+ %tmp9419 = getelementptr inbounds float* %tmp9418, i64 1
+ %tmp9420 = getelementptr inbounds float* %tmp9419, i64 1
+ %tmp9421 = getelementptr inbounds float* %tmp9420, i64 1
+ %tmp9422 = getelementptr inbounds float* %tmp9421, i64 1
+ %tmp9423 = getelementptr inbounds float* %tmp9422, i64 1
+ %tmp9424 = getelementptr inbounds float* %tmp9423, i64 1
+ %tmp9425 = getelementptr inbounds float* %tmp9424, i64 1
+ %tmp9426 = getelementptr inbounds float* %tmp9425, i64 1
+ %tmp9427 = getelementptr inbounds float* %tmp9426, i64 1
+ %tmp9428 = getelementptr inbounds float* %tmp9427, i64 1
+ %tmp9429 = getelementptr inbounds float* %tmp9428, i64 1
+ %tmp9430 = getelementptr inbounds float* %tmp9429, i64 1
+ %tmp9431 = getelementptr inbounds float* %tmp9430, i64 1
+ %tmp9432 = getelementptr inbounds float* %tmp9431, i64 1
+ %tmp9433 = getelementptr inbounds float* %tmp9432, i64 1
+ %tmp9434 = getelementptr inbounds float* %tmp9433, i64 1
+ %tmp9435 = getelementptr inbounds float* %tmp9434, i64 1
+ %tmp9436 = getelementptr inbounds float* %tmp9435, i64 1
+ %tmp9437 = getelementptr inbounds float* %tmp9436, i64 1
+ %tmp9438 = getelementptr inbounds float* %tmp9437, i64 1
+ %tmp9439 = getelementptr inbounds float* %tmp9438, i64 1
+ %tmp9440 = getelementptr inbounds float* %tmp9439, i64 1
+ %tmp9441 = getelementptr inbounds float* %tmp9440, i64 1
+ %tmp9442 = getelementptr inbounds float* %tmp9441, i64 1
+ %tmp9443 = getelementptr inbounds float* %tmp9442, i64 1
+ %tmp9444 = getelementptr inbounds float* %tmp9443, i64 1
+ %tmp9445 = getelementptr inbounds float* %tmp9444, i64 1
+ %tmp9446 = getelementptr inbounds float* %tmp9445, i64 1
+ %tmp9447 = getelementptr inbounds float* %tmp9446, i64 1
+ %tmp9448 = getelementptr inbounds float* %tmp9447, i64 1
+ %tmp9449 = getelementptr inbounds float* %tmp9448, i64 1
+ %tmp9450 = getelementptr inbounds float* %tmp9449, i64 1
+ %tmp9451 = getelementptr inbounds float* %tmp9450, i64 1
+ %tmp9452 = getelementptr inbounds float* %tmp9451, i64 1
+ %tmp9453 = getelementptr inbounds float* %tmp9452, i64 1
+ %tmp9454 = getelementptr inbounds float* %tmp9453, i64 1
+ %tmp9455 = getelementptr inbounds float* %tmp9454, i64 1
+ %tmp9456 = getelementptr inbounds float* %tmp9455, i64 1
+ %tmp9457 = getelementptr inbounds float* %tmp9456, i64 1
+ %tmp9458 = getelementptr inbounds float* %tmp9457, i64 1
+ %tmp9459 = getelementptr inbounds float* %tmp9458, i64 1
+ %tmp9460 = getelementptr inbounds float* %tmp9459, i64 1
+ %tmp9461 = getelementptr inbounds float* %tmp9460, i64 1
+ %tmp9462 = getelementptr inbounds float* %tmp9461, i64 1
+ %tmp9463 = getelementptr inbounds float* %tmp9462, i64 1
+ %tmp9464 = getelementptr inbounds float* %tmp9463, i64 1
+ %tmp9465 = getelementptr inbounds float* %tmp9464, i64 1
+ %tmp9466 = getelementptr inbounds float* %tmp9465, i64 1
+ %tmp9467 = getelementptr inbounds float* %tmp9466, i64 1
+ %tmp9468 = getelementptr inbounds float* %tmp9467, i64 1
+ %tmp9469 = getelementptr inbounds float* %tmp9468, i64 1
+ %tmp9470 = getelementptr inbounds float* %tmp9469, i64 1
+ %tmp9471 = getelementptr inbounds float* %tmp9470, i64 1
+ %tmp9472 = getelementptr inbounds float* %tmp9471, i64 1
+ %tmp9473 = getelementptr inbounds float* %tmp9472, i64 1
+ %tmp9474 = getelementptr inbounds float* %tmp9473, i64 1
+ %tmp9475 = getelementptr inbounds float* %tmp9474, i64 1
+ %tmp9476 = getelementptr inbounds float* %tmp9475, i64 1
+ %tmp9477 = getelementptr inbounds float* %tmp9476, i64 1
+ %tmp9478 = getelementptr inbounds float* %tmp9477, i64 1
+ %tmp9479 = getelementptr inbounds float* %tmp9478, i64 1
+ %tmp9480 = getelementptr inbounds float* %tmp9479, i64 1
+ %tmp9481 = getelementptr inbounds float* %tmp9480, i64 1
+ %tmp9482 = getelementptr inbounds float* %tmp9481, i64 1
+ %tmp9483 = getelementptr inbounds float* %tmp9482, i64 1
+ %tmp9484 = getelementptr inbounds float* %tmp9483, i64 1
+ %tmp9485 = getelementptr inbounds float* %tmp9484, i64 1
+ %tmp9486 = getelementptr inbounds float* %tmp9485, i64 1
+ %tmp9487 = getelementptr inbounds float* %tmp9486, i64 1
+ %tmp9488 = getelementptr inbounds float* %tmp9487, i64 1
+ %tmp9489 = getelementptr inbounds float* %tmp9488, i64 1
+ %tmp9490 = getelementptr inbounds float* %tmp9489, i64 1
+ %tmp9491 = getelementptr inbounds float* %tmp9490, i64 1
+ %tmp9492 = getelementptr inbounds float* %tmp9491, i64 1
+ %tmp9493 = getelementptr inbounds float* %tmp9492, i64 1
+ %tmp9494 = getelementptr inbounds float* %tmp9493, i64 1
+ %tmp9495 = getelementptr inbounds float* %tmp9494, i64 1
+ %tmp9496 = getelementptr inbounds float* %tmp9495, i64 1
+ %tmp9497 = getelementptr inbounds float* %tmp9496, i64 1
+ %tmp9498 = getelementptr inbounds float* %tmp9497, i64 1
+ %tmp9499 = getelementptr inbounds float* %tmp9498, i64 1
+ %tmp9500 = getelementptr inbounds float* %tmp9499, i64 1
+ %tmp9501 = getelementptr inbounds float* %tmp9500, i64 1
+ %tmp9502 = getelementptr inbounds float* %tmp9501, i64 1
+ %tmp9503 = getelementptr inbounds float* %tmp9502, i64 1
+ %tmp9504 = getelementptr inbounds float* %tmp9503, i64 1
+ %tmp9505 = getelementptr inbounds float* %tmp9504, i64 1
+ %tmp9506 = getelementptr inbounds float* %tmp9505, i64 1
+ %tmp9507 = getelementptr inbounds float* %tmp9506, i64 1
+ %tmp9508 = getelementptr inbounds float* %tmp9507, i64 1
+ %tmp9509 = getelementptr inbounds float* %tmp9508, i64 1
+ %tmp9510 = getelementptr inbounds float* %tmp9509, i64 1
+ %tmp9511 = getelementptr inbounds float* %tmp9510, i64 1
+ %tmp9512 = getelementptr inbounds float* %tmp9511, i64 1
+ %tmp9513 = getelementptr inbounds float* %tmp9512, i64 1
+ %tmp9514 = getelementptr inbounds float* %tmp9513, i64 1
+ %tmp9515 = getelementptr inbounds float* %tmp9514, i64 1
+ %tmp9516 = getelementptr inbounds float* %tmp9515, i64 1
+ %tmp9517 = getelementptr inbounds float* %tmp9516, i64 1
+ %tmp9518 = getelementptr inbounds float* %tmp9517, i64 1
+ %tmp9519 = getelementptr inbounds float* %tmp9518, i64 1
+ %tmp9520 = getelementptr inbounds float* %tmp9519, i64 1
+ %tmp9521 = getelementptr inbounds float* %tmp9520, i64 1
+ %tmp9522 = getelementptr inbounds float* %tmp9521, i64 1
+ %tmp9523 = getelementptr inbounds float* %tmp9522, i64 1
+ %tmp9524 = getelementptr inbounds float* %tmp9523, i64 1
+ %tmp9525 = getelementptr inbounds float* %tmp9524, i64 1
+ %tmp9526 = getelementptr inbounds float* %tmp9525, i64 1
+ %tmp9527 = getelementptr inbounds float* %tmp9526, i64 1
+ %tmp9528 = getelementptr inbounds float* %tmp9527, i64 1
+ %tmp9529 = getelementptr inbounds float* %tmp9528, i64 1
+ %tmp9530 = getelementptr inbounds float* %tmp9529, i64 1
+ %tmp9531 = getelementptr inbounds float* %tmp9530, i64 1
+ %tmp9532 = getelementptr inbounds float* %tmp9531, i64 1
+ %tmp9533 = getelementptr inbounds float* %tmp9532, i64 1
+ %tmp9534 = getelementptr inbounds float* %tmp9533, i64 1
+ %tmp9535 = getelementptr inbounds float* %tmp9534, i64 1
+ %tmp9536 = getelementptr inbounds float* %tmp9535, i64 1
+ %tmp9537 = getelementptr inbounds float* %tmp9536, i64 1
+ %tmp9538 = getelementptr inbounds float* %tmp9537, i64 1
+ %tmp9539 = getelementptr inbounds float* %tmp9538, i64 1
+ %tmp9540 = getelementptr inbounds float* %tmp9539, i64 1
+ %tmp9541 = getelementptr inbounds float* %tmp9540, i64 1
+ %tmp9542 = getelementptr inbounds float* %tmp9541, i64 1
+ %tmp9543 = getelementptr inbounds float* %tmp9542, i64 1
+ %tmp9544 = getelementptr inbounds float* %tmp9543, i64 1
+ %tmp9545 = getelementptr inbounds float* %tmp9544, i64 1
+ %tmp9546 = getelementptr inbounds float* %tmp9545, i64 1
+ %tmp9547 = getelementptr inbounds float* %tmp9546, i64 1
+ %tmp9548 = getelementptr inbounds float* %tmp9547, i64 1
+ %tmp9549 = getelementptr inbounds float* %tmp9548, i64 1
+ %tmp9550 = getelementptr inbounds float* %tmp9549, i64 1
+ %tmp9551 = getelementptr inbounds float* %tmp9550, i64 1
+ %tmp9552 = getelementptr inbounds float* %tmp9551, i64 1
+ %tmp9553 = getelementptr inbounds float* %tmp9552, i64 1
+ %tmp9554 = getelementptr inbounds float* %tmp9553, i64 1
+ %tmp9555 = getelementptr inbounds float* %tmp9554, i64 1
+ %tmp9556 = getelementptr inbounds float* %tmp9555, i64 1
+ %tmp9557 = getelementptr inbounds float* %tmp9556, i64 1
+ %tmp9558 = getelementptr inbounds float* %tmp9557, i64 1
+ %tmp9559 = getelementptr inbounds float* %tmp9558, i64 1
+ %tmp9560 = getelementptr inbounds float* %tmp9559, i64 1
+ %tmp9561 = getelementptr inbounds float* %tmp9560, i64 1
+ %tmp9562 = getelementptr inbounds float* %tmp9561, i64 1
+ %tmp9563 = getelementptr inbounds float* %tmp9562, i64 1
+ %tmp9564 = getelementptr inbounds float* %tmp9563, i64 1
+ %tmp9565 = getelementptr inbounds float* %tmp9564, i64 1
+ %tmp9566 = getelementptr inbounds float* %tmp9565, i64 1
+ %tmp9567 = getelementptr inbounds float* %tmp9566, i64 1
+ %tmp9568 = getelementptr inbounds float* %tmp9567, i64 1
+ %tmp9569 = getelementptr inbounds float* %tmp9568, i64 1
+ %tmp9570 = getelementptr inbounds float* %tmp9569, i64 1
+ %tmp9571 = getelementptr inbounds float* %tmp9570, i64 1
+ %tmp9572 = getelementptr inbounds float* %tmp9571, i64 1
+ %tmp9573 = getelementptr inbounds float* %tmp9572, i64 1
+ %tmp9574 = getelementptr inbounds float* %tmp9573, i64 1
+ %tmp9575 = getelementptr inbounds float* %tmp9574, i64 1
+ %tmp9576 = getelementptr inbounds float* %tmp9575, i64 1
+ %tmp9577 = getelementptr inbounds float* %tmp9576, i64 1
+ %tmp9578 = getelementptr inbounds float* %tmp9577, i64 1
+ %tmp9579 = getelementptr inbounds float* %tmp9578, i64 1
+ %tmp9580 = getelementptr inbounds float* %tmp9579, i64 1
+ %tmp9581 = getelementptr inbounds float* %tmp9580, i64 1
+ %tmp9582 = getelementptr inbounds float* %tmp9581, i64 1
+ %tmp9583 = getelementptr inbounds float* %tmp9582, i64 1
+ %tmp9584 = getelementptr inbounds float* %tmp9583, i64 1
+ %tmp9585 = getelementptr inbounds float* %tmp9584, i64 1
+ %tmp9586 = getelementptr inbounds float* %tmp9585, i64 1
+ %tmp9587 = getelementptr inbounds float* %tmp9586, i64 1
+ %tmp9588 = getelementptr inbounds float* %tmp9587, i64 1
+ %tmp9589 = getelementptr inbounds float* %tmp9588, i64 1
+ %tmp9590 = getelementptr inbounds float* %tmp9589, i64 1
+ %tmp9591 = getelementptr inbounds float* %tmp9590, i64 1
+ %tmp9592 = getelementptr inbounds float* %tmp9591, i64 1
+ %tmp9593 = getelementptr inbounds float* %tmp9592, i64 1
+ %tmp9594 = getelementptr inbounds float* %tmp9593, i64 1
+ %tmp9595 = getelementptr inbounds float* %tmp9594, i64 1
+ %tmp9596 = getelementptr inbounds float* %tmp9595, i64 1
+ %tmp9597 = getelementptr inbounds float* %tmp9596, i64 1
+ %tmp9598 = getelementptr inbounds float* %tmp9597, i64 1
+ %tmp9599 = getelementptr inbounds float* %tmp9598, i64 1
+ %tmp9600 = getelementptr inbounds float* %tmp9599, i64 1
+ %tmp9601 = getelementptr inbounds float* %tmp9600, i64 1
+ %tmp9602 = getelementptr inbounds float* %tmp9601, i64 1
+ %tmp9603 = getelementptr inbounds float* %tmp9602, i64 1
+ %tmp9604 = getelementptr inbounds float* %tmp9603, i64 1
+ %tmp9605 = getelementptr inbounds float* %tmp9604, i64 1
+ %tmp9606 = getelementptr inbounds float* %tmp9605, i64 1
+ %tmp9607 = getelementptr inbounds float* %tmp9606, i64 1
+ %tmp9608 = getelementptr inbounds float* %tmp9607, i64 1
+ %tmp9609 = getelementptr inbounds float* %tmp9608, i64 1
+ %tmp9610 = getelementptr inbounds float* %tmp9609, i64 1
+ %tmp9611 = getelementptr inbounds float* %tmp9610, i64 1
+ %tmp9612 = getelementptr inbounds float* %tmp9611, i64 1
+ %tmp9613 = getelementptr inbounds float* %tmp9612, i64 1
+ %tmp9614 = getelementptr inbounds float* %tmp9613, i64 1
+ %tmp9615 = getelementptr inbounds float* %tmp9614, i64 1
+ %tmp9616 = getelementptr inbounds float* %tmp9615, i64 1
+ %tmp9617 = getelementptr inbounds float* %tmp9616, i64 1
+ %tmp9618 = getelementptr inbounds float* %tmp9617, i64 1
+ %tmp9619 = getelementptr inbounds float* %tmp9618, i64 1
+ %tmp9620 = getelementptr inbounds float* %tmp9619, i64 1
+ %tmp9621 = getelementptr inbounds float* %tmp9620, i64 1
+ %tmp9622 = getelementptr inbounds float* %tmp9621, i64 1
+ %tmp9623 = getelementptr inbounds float* %tmp9622, i64 1
+ %tmp9624 = getelementptr inbounds float* %tmp9623, i64 1
+ %tmp9625 = getelementptr inbounds float* %tmp9624, i64 1
+ %tmp9626 = getelementptr inbounds float* %tmp9625, i64 1
+ %tmp9627 = getelementptr inbounds float* %tmp9626, i64 1
+ %tmp9628 = getelementptr inbounds float* %tmp9627, i64 1
+ %tmp9629 = getelementptr inbounds float* %tmp9628, i64 1
+ %tmp9630 = getelementptr inbounds float* %tmp9629, i64 1
+ %tmp9631 = getelementptr inbounds float* %tmp9630, i64 1
+ %tmp9632 = getelementptr inbounds float* %tmp9631, i64 1
+ %tmp9633 = getelementptr inbounds float* %tmp9632, i64 1
+ %tmp9634 = getelementptr inbounds float* %tmp9633, i64 1
+ %tmp9635 = getelementptr inbounds float* %tmp9634, i64 1
+ %tmp9636 = getelementptr inbounds float* %tmp9635, i64 1
+ %tmp9637 = getelementptr inbounds float* %tmp9636, i64 1
+ %tmp9638 = getelementptr inbounds float* %tmp9637, i64 1
+ %tmp9639 = getelementptr inbounds float* %tmp9638, i64 1
+ %tmp9640 = getelementptr inbounds float* %tmp9639, i64 1
+ %tmp9641 = getelementptr inbounds float* %tmp9640, i64 1
+ %tmp9642 = getelementptr inbounds float* %tmp9641, i64 1
+ %tmp9643 = getelementptr inbounds float* %tmp9642, i64 1
+ %tmp9644 = getelementptr inbounds float* %tmp9643, i64 1
+ %tmp9645 = getelementptr inbounds float* %tmp9644, i64 1
+ %tmp9646 = getelementptr inbounds float* %tmp9645, i64 1
+ %tmp9647 = getelementptr inbounds float* %tmp9646, i64 1
+ %tmp9648 = getelementptr inbounds float* %tmp9647, i64 1
+ %tmp9649 = getelementptr inbounds float* %tmp9648, i64 1
+ %tmp9650 = getelementptr inbounds float* %tmp9649, i64 1
+ %tmp9651 = getelementptr inbounds float* %tmp9650, i64 1
+ %tmp9652 = getelementptr inbounds float* %tmp9651, i64 1
+ %tmp9653 = getelementptr inbounds float* %tmp9652, i64 1
+ %tmp9654 = getelementptr inbounds float* %tmp9653, i64 1
+ %tmp9655 = getelementptr inbounds float* %tmp9654, i64 1
+ %tmp9656 = getelementptr inbounds float* %tmp9655, i64 1
+ %tmp9657 = getelementptr inbounds float* %tmp9656, i64 1
+ %tmp9658 = getelementptr inbounds float* %tmp9657, i64 1
+ %tmp9659 = getelementptr inbounds float* %tmp9658, i64 1
+ %tmp9660 = getelementptr inbounds float* %tmp9659, i64 1
+ %tmp9661 = getelementptr inbounds float* %tmp9660, i64 1
+ %tmp9662 = getelementptr inbounds float* %tmp9661, i64 1
+ %tmp9663 = getelementptr inbounds float* %tmp9662, i64 1
+ %tmp9664 = getelementptr inbounds float* %tmp9663, i64 1
+ %tmp9665 = getelementptr inbounds float* %tmp9664, i64 1
+ %tmp9666 = getelementptr inbounds float* %tmp9665, i64 1
+ %tmp9667 = getelementptr inbounds float* %tmp9666, i64 1
+ %tmp9668 = getelementptr inbounds float* %tmp9667, i64 1
+ %tmp9669 = getelementptr inbounds float* %tmp9668, i64 1
+ %tmp9670 = getelementptr inbounds float* %tmp9669, i64 1
+ %tmp9671 = getelementptr inbounds float* %tmp9670, i64 1
+ %tmp9672 = getelementptr inbounds float* %tmp9671, i64 1
+ %tmp9673 = getelementptr inbounds float* %tmp9672, i64 1
+ %tmp9674 = getelementptr inbounds float* %tmp9673, i64 1
+ %tmp9675 = getelementptr inbounds float* %tmp9674, i64 1
+ %tmp9676 = getelementptr inbounds float* %tmp9675, i64 1
+ %tmp9677 = getelementptr inbounds float* %tmp9676, i64 1
+ %tmp9678 = getelementptr inbounds float* %tmp9677, i64 1
+ %tmp9679 = getelementptr inbounds float* %tmp9678, i64 1
+ %tmp9680 = getelementptr inbounds float* %tmp9679, i64 1
+ %tmp9681 = getelementptr inbounds float* %tmp9680, i64 1
+ %tmp9682 = getelementptr inbounds float* %tmp9681, i64 1
+ %tmp9683 = getelementptr inbounds float* %tmp9682, i64 1
+ %tmp9684 = getelementptr inbounds float* %tmp9683, i64 1
+ %tmp9685 = getelementptr inbounds float* %tmp9684, i64 1
+ %tmp9686 = getelementptr inbounds float* %tmp9685, i64 1
+ %tmp9687 = getelementptr inbounds float* %tmp9686, i64 1
+ %tmp9688 = getelementptr inbounds float* %tmp9687, i64 1
+ %tmp9689 = getelementptr inbounds float* %tmp9688, i64 1
+ %tmp9690 = getelementptr inbounds float* %tmp9689, i64 1
+ %tmp9691 = getelementptr inbounds float* %tmp9690, i64 1
+ %tmp9692 = getelementptr inbounds float* %tmp9691, i64 1
+ %tmp9693 = getelementptr inbounds float* %tmp9692, i64 1
+ %tmp9694 = getelementptr inbounds float* %tmp9693, i64 1
+ %tmp9695 = getelementptr inbounds float* %tmp9694, i64 1
+ %tmp9696 = getelementptr inbounds float* %tmp9695, i64 1
+ %tmp9697 = getelementptr inbounds float* %tmp9696, i64 1
+ %tmp9698 = getelementptr inbounds float* %tmp9697, i64 1
+ %tmp9699 = getelementptr inbounds float* %tmp9698, i64 1
+ %tmp9700 = getelementptr inbounds float* %tmp9699, i64 1
+ %tmp9701 = getelementptr inbounds float* %tmp9700, i64 1
+ %tmp9702 = getelementptr inbounds float* %tmp9701, i64 1
+ %tmp9703 = getelementptr inbounds float* %tmp9702, i64 1
+ %tmp9704 = getelementptr inbounds float* %tmp9703, i64 1
+ %tmp9705 = getelementptr inbounds float* %tmp9704, i64 1
+ %tmp9706 = getelementptr inbounds float* %tmp9705, i64 1
+ %tmp9707 = getelementptr inbounds float* %tmp9706, i64 1
+ %tmp9708 = getelementptr inbounds float* %tmp9707, i64 1
+ %tmp9709 = getelementptr inbounds float* %tmp9708, i64 1
+ %tmp9710 = getelementptr inbounds float* %tmp9709, i64 1
+ %tmp9711 = getelementptr inbounds float* %tmp9710, i64 1
+ %tmp9712 = getelementptr inbounds float* %tmp9711, i64 1
+ %tmp9713 = getelementptr inbounds float* %tmp9712, i64 1
+ %tmp9714 = getelementptr inbounds float* %tmp9713, i64 1
+ %tmp9715 = getelementptr inbounds float* %tmp9714, i64 1
+ %tmp9716 = getelementptr inbounds float* %tmp9715, i64 1
+ %tmp9717 = getelementptr inbounds float* %tmp9716, i64 1
+ %tmp9718 = getelementptr inbounds float* %tmp9717, i64 1
+ %tmp9719 = getelementptr inbounds float* %tmp9718, i64 1
+ %tmp9720 = getelementptr inbounds float* %tmp9719, i64 1
+ %tmp9721 = getelementptr inbounds float* %tmp9720, i64 1
+ %tmp9722 = getelementptr inbounds float* %tmp9721, i64 1
+ %tmp9723 = getelementptr inbounds float* %tmp9722, i64 1
+ %tmp9724 = getelementptr inbounds float* %tmp9723, i64 1
+ %tmp9725 = getelementptr inbounds float* %tmp9724, i64 1
+ %tmp9726 = getelementptr inbounds float* %tmp9725, i64 1
+ %tmp9727 = getelementptr inbounds float* %tmp9726, i64 1
+ %tmp9728 = getelementptr inbounds float* %tmp9727, i64 1
+ %tmp9729 = getelementptr inbounds float* %tmp9728, i64 1
+ %tmp9730 = getelementptr inbounds float* %tmp9729, i64 1
+ %tmp9731 = getelementptr inbounds float* %tmp9730, i64 1
+ %tmp9732 = getelementptr inbounds float* %tmp9731, i64 1
+ %tmp9733 = getelementptr inbounds float* %tmp9732, i64 1
+ %tmp9734 = getelementptr inbounds float* %tmp9733, i64 1
+ %tmp9735 = getelementptr inbounds float* %tmp9734, i64 1
+ %tmp9736 = getelementptr inbounds float* %tmp9735, i64 1
+ %tmp9737 = getelementptr inbounds float* %tmp9736, i64 1
+ %tmp9738 = getelementptr inbounds float* %tmp9737, i64 1
+ %tmp9739 = getelementptr inbounds float* %tmp9738, i64 1
+ %tmp9740 = getelementptr inbounds float* %tmp9739, i64 1
+ %tmp9741 = getelementptr inbounds float* %tmp9740, i64 1
+ %tmp9742 = getelementptr inbounds float* %tmp9741, i64 1
+ %tmp9743 = getelementptr inbounds float* %tmp9742, i64 1
+ %tmp9744 = getelementptr inbounds float* %tmp9743, i64 1
+ %tmp9745 = getelementptr inbounds float* %tmp9744, i64 1
+ %tmp9746 = getelementptr inbounds float* %tmp9745, i64 1
+ %tmp9747 = getelementptr inbounds float* %tmp9746, i64 1
+ %tmp9748 = getelementptr inbounds float* %tmp9747, i64 1
+ %tmp9749 = getelementptr inbounds float* %tmp9748, i64 1
+ %tmp9750 = getelementptr inbounds float* %tmp9749, i64 1
+ %tmp9751 = getelementptr inbounds float* %tmp9750, i64 1
+ %tmp9752 = getelementptr inbounds float* %tmp9751, i64 1
+ %tmp9753 = getelementptr inbounds float* %tmp9752, i64 1
+ %tmp9754 = getelementptr inbounds float* %tmp9753, i64 1
+ %tmp9755 = getelementptr inbounds float* %tmp9754, i64 1
+ %tmp9756 = getelementptr inbounds float* %tmp9755, i64 1
+ %tmp9757 = getelementptr inbounds float* %tmp9756, i64 1
+ %tmp9758 = getelementptr inbounds float* %tmp9757, i64 1
+ %tmp9759 = getelementptr inbounds float* %tmp9758, i64 1
+ %tmp9760 = getelementptr inbounds float* %tmp9759, i64 1
+ %tmp9761 = getelementptr inbounds float* %tmp9760, i64 1
+ %tmp9762 = getelementptr inbounds float* %tmp9761, i64 1
+ %tmp9763 = getelementptr inbounds float* %tmp9762, i64 1
+ %tmp9764 = getelementptr inbounds float* %tmp9763, i64 1
+ %tmp9765 = getelementptr inbounds float* %tmp9764, i64 1
+ %tmp9766 = getelementptr inbounds float* %tmp9765, i64 1
+ %tmp9767 = getelementptr inbounds float* %tmp9766, i64 1
+ %tmp9768 = getelementptr inbounds float* %tmp9767, i64 1
+ %tmp9769 = getelementptr inbounds float* %tmp9768, i64 1
+ %tmp9770 = getelementptr inbounds float* %tmp9769, i64 1
+ %tmp9771 = getelementptr inbounds float* %tmp9770, i64 1
+ %tmp9772 = getelementptr inbounds float* %tmp9771, i64 1
+ %tmp9773 = getelementptr inbounds float* %tmp9772, i64 1
+ %tmp9774 = getelementptr inbounds float* %tmp9773, i64 1
+ %tmp9775 = getelementptr inbounds float* %tmp9774, i64 1
+ %tmp9776 = getelementptr inbounds float* %tmp9775, i64 1
+ %tmp9777 = getelementptr inbounds float* %tmp9776, i64 1
+ %tmp9778 = getelementptr inbounds float* %tmp9777, i64 1
+ %tmp9779 = getelementptr inbounds float* %tmp9778, i64 1
+ %tmp9780 = getelementptr inbounds float* %tmp9779, i64 1
+ %tmp9781 = getelementptr inbounds float* %tmp9780, i64 1
+ %tmp9782 = getelementptr inbounds float* %tmp9781, i64 1
+ %tmp9783 = getelementptr inbounds float* %tmp9782, i64 1
+ %tmp9784 = getelementptr inbounds float* %tmp9783, i64 1
+ %tmp9785 = getelementptr inbounds float* %tmp9784, i64 1
+ %tmp9786 = getelementptr inbounds float* %tmp9785, i64 1
+ %tmp9787 = getelementptr inbounds float* %tmp9786, i64 1
+ %tmp9788 = getelementptr inbounds float* %tmp9787, i64 1
+ %tmp9789 = getelementptr inbounds float* %tmp9788, i64 1
+ %tmp9790 = getelementptr inbounds float* %tmp9789, i64 1
+ %tmp9791 = getelementptr inbounds float* %tmp9790, i64 1
+ %tmp9792 = getelementptr inbounds float* %tmp9791, i64 1
+ %tmp9793 = getelementptr inbounds float* %tmp9792, i64 1
+ %tmp9794 = getelementptr inbounds float* %tmp9793, i64 1
+ %tmp9795 = getelementptr inbounds float* %tmp9794, i64 1
+ %tmp9796 = getelementptr inbounds float* %tmp9795, i64 1
+ %tmp9797 = getelementptr inbounds float* %tmp9796, i64 1
+ %tmp9798 = getelementptr inbounds float* %tmp9797, i64 1
+ %tmp9799 = getelementptr inbounds float* %tmp9798, i64 1
+ %tmp9800 = getelementptr inbounds float* %tmp9799, i64 1
+ %tmp9801 = getelementptr inbounds float* %tmp9800, i64 1
+ %tmp9802 = getelementptr inbounds float* %tmp9801, i64 1
+ %tmp9803 = getelementptr inbounds float* %tmp9802, i64 1
+ %tmp9804 = getelementptr inbounds float* %tmp9803, i64 1
+ %tmp9805 = getelementptr inbounds float* %tmp9804, i64 1
+ %tmp9806 = getelementptr inbounds float* %tmp9805, i64 1
+ %tmp9807 = getelementptr inbounds float* %tmp9806, i64 1
+ %tmp9808 = getelementptr inbounds float* %tmp9807, i64 1
+ %tmp9809 = getelementptr inbounds float* %tmp9808, i64 1
+ %tmp9810 = getelementptr inbounds float* %tmp9809, i64 1
+ %tmp9811 = getelementptr inbounds float* %tmp9810, i64 1
+ %tmp9812 = getelementptr inbounds float* %tmp9811, i64 1
+ %tmp9813 = getelementptr inbounds float* %tmp9812, i64 1
+ %tmp9814 = getelementptr inbounds float* %tmp9813, i64 1
+ %tmp9815 = getelementptr inbounds float* %tmp9814, i64 1
+ %tmp9816 = getelementptr inbounds float* %tmp9815, i64 1
+ %tmp9817 = getelementptr inbounds float* %tmp9816, i64 1
+ %tmp9818 = getelementptr inbounds float* %tmp9817, i64 1
+ %tmp9819 = getelementptr inbounds float* %tmp9818, i64 1
+ %tmp9820 = getelementptr inbounds float* %tmp9819, i64 1
+ %tmp9821 = getelementptr inbounds float* %tmp9820, i64 1
+ %tmp9822 = getelementptr inbounds float* %tmp9821, i64 1
+ %tmp9823 = getelementptr inbounds float* %tmp9822, i64 1
+ %tmp9824 = getelementptr inbounds float* %tmp9823, i64 1
+ %tmp9825 = getelementptr inbounds float* %tmp9824, i64 1
+ %tmp9826 = getelementptr inbounds float* %tmp9825, i64 1
+ %tmp9827 = getelementptr inbounds float* %tmp9826, i64 1
+ %tmp9828 = getelementptr inbounds float* %tmp9827, i64 1
+ %tmp9829 = getelementptr inbounds float* %tmp9828, i64 1
+ %tmp9830 = getelementptr inbounds float* %tmp9829, i64 1
+ %tmp9831 = getelementptr inbounds float* %tmp9830, i64 1
+ %tmp9832 = getelementptr inbounds float* %tmp9831, i64 1
+ %tmp9833 = getelementptr inbounds float* %tmp9832, i64 1
+ %tmp9834 = getelementptr inbounds float* %tmp9833, i64 1
+ %tmp9835 = getelementptr inbounds float* %tmp9834, i64 1
+ %tmp9836 = getelementptr inbounds float* %tmp9835, i64 1
+ %tmp9837 = getelementptr inbounds float* %tmp9836, i64 1
+ %tmp9838 = getelementptr inbounds float* %tmp9837, i64 1
+ %tmp9839 = getelementptr inbounds float* %tmp9838, i64 1
+ %tmp9840 = getelementptr inbounds float* %tmp9839, i64 1
+ %tmp9841 = getelementptr inbounds float* %tmp9840, i64 1
+ %tmp9842 = getelementptr inbounds float* %tmp9841, i64 1
+ %tmp9843 = getelementptr inbounds float* %tmp9842, i64 1
+ %tmp9844 = getelementptr inbounds float* %tmp9843, i64 1
+ %tmp9845 = getelementptr inbounds float* %tmp9844, i64 1
+ %tmp9846 = getelementptr inbounds float* %tmp9845, i64 1
+ %tmp9847 = getelementptr inbounds float* %tmp9846, i64 1
+ %tmp9848 = getelementptr inbounds float* %tmp9847, i64 1
+ %tmp9849 = getelementptr inbounds float* %tmp9848, i64 1
+ %tmp9850 = getelementptr inbounds float* %tmp9849, i64 1
+ %tmp9851 = getelementptr inbounds float* %tmp9850, i64 1
+ %tmp9852 = getelementptr inbounds float* %tmp9851, i64 1
+ %tmp9853 = getelementptr inbounds float* %tmp9852, i64 1
+ %tmp9854 = getelementptr inbounds float* %tmp9853, i64 1
+ %tmp9855 = getelementptr inbounds float* %tmp9854, i64 1
+ %tmp9856 = getelementptr inbounds float* %tmp9855, i64 1
+ %tmp9857 = getelementptr inbounds float* %tmp9856, i64 1
+ %tmp9858 = getelementptr inbounds float* %tmp9857, i64 1
+ %tmp9859 = getelementptr inbounds float* %tmp9858, i64 1
+ %tmp9860 = getelementptr inbounds float* %tmp9859, i64 1
+ %tmp9861 = getelementptr inbounds float* %tmp9860, i64 1
+ %tmp9862 = getelementptr inbounds float* %tmp9861, i64 1
+ %tmp9863 = getelementptr inbounds float* %tmp9862, i64 1
+ %tmp9864 = getelementptr inbounds float* %tmp9863, i64 1
+ %tmp9865 = getelementptr inbounds float* %tmp9864, i64 1
+ %tmp9866 = getelementptr inbounds float* %tmp9865, i64 1
+ %tmp9867 = getelementptr inbounds float* %tmp9866, i64 1
+ %tmp9868 = getelementptr inbounds float* %tmp9867, i64 1
+ %tmp9869 = getelementptr inbounds float* %tmp9868, i64 1
+ %tmp9870 = getelementptr inbounds float* %tmp9869, i64 1
+ %tmp9871 = getelementptr inbounds float* %tmp9870, i64 1
+ %tmp9872 = getelementptr inbounds float* %tmp9871, i64 1
+ %tmp9873 = getelementptr inbounds float* %tmp9872, i64 1
+ %tmp9874 = getelementptr inbounds float* %tmp9873, i64 1
+ %tmp9875 = getelementptr inbounds float* %tmp9874, i64 1
+ %tmp9876 = getelementptr inbounds float* %tmp9875, i64 1
+ %tmp9877 = getelementptr inbounds float* %tmp9876, i64 1
+ %tmp9878 = getelementptr inbounds float* %tmp9877, i64 1
+ %tmp9879 = getelementptr inbounds float* %tmp9878, i64 1
+ %tmp9880 = getelementptr inbounds float* %tmp9879, i64 1
+ %tmp9881 = getelementptr inbounds float* %tmp9880, i64 1
+ %tmp9882 = getelementptr inbounds float* %tmp9881, i64 1
+ %tmp9883 = getelementptr inbounds float* %tmp9882, i64 1
+ %tmp9884 = getelementptr inbounds float* %tmp9883, i64 1
+ %tmp9885 = getelementptr inbounds float* %tmp9884, i64 1
+ %tmp9886 = getelementptr inbounds float* %tmp9885, i64 1
+ %tmp9887 = getelementptr inbounds float* %tmp9886, i64 1
+ %tmp9888 = getelementptr inbounds float* %tmp9887, i64 1
+ %tmp9889 = getelementptr inbounds float* %tmp9888, i64 1
+ %tmp9890 = getelementptr inbounds float* %tmp9889, i64 1
+ %tmp9891 = getelementptr inbounds float* %tmp9890, i64 1
+ %tmp9892 = getelementptr inbounds float* %tmp9891, i64 1
+ %tmp9893 = getelementptr inbounds float* %tmp9892, i64 1
+ %tmp9894 = getelementptr inbounds float* %tmp9893, i64 1
+ %tmp9895 = getelementptr inbounds float* %tmp9894, i64 1
+ %tmp9896 = getelementptr inbounds float* %tmp9895, i64 1
+ %tmp9897 = getelementptr inbounds float* %tmp9896, i64 1
+ %tmp9898 = getelementptr inbounds float* %tmp9897, i64 1
+ %tmp9899 = getelementptr inbounds float* %tmp9898, i64 1
+ %tmp9900 = getelementptr inbounds float* %tmp9899, i64 1
+ %tmp9901 = getelementptr inbounds float* %tmp9900, i64 1
+ %tmp9902 = getelementptr inbounds float* %tmp9901, i64 1
+ %tmp9903 = getelementptr inbounds float* %tmp9902, i64 1
+ %tmp9904 = getelementptr inbounds float* %tmp9903, i64 1
+ %tmp9905 = getelementptr inbounds float* %tmp9904, i64 1
+ %tmp9906 = getelementptr inbounds float* %tmp9905, i64 1
+ %tmp9907 = getelementptr inbounds float* %tmp9906, i64 1
+ %tmp9908 = getelementptr inbounds float* %tmp9907, i64 1
+ %tmp9909 = getelementptr inbounds float* %tmp9908, i64 1
+ %tmp9910 = getelementptr inbounds float* %tmp9909, i64 1
+ %tmp9911 = getelementptr inbounds float* %tmp9910, i64 1
+ %tmp9912 = getelementptr inbounds float* %tmp9911, i64 1
+ %tmp9913 = getelementptr inbounds float* %tmp9912, i64 1
+ %tmp9914 = getelementptr inbounds float* %tmp9913, i64 1
+ %tmp9915 = getelementptr inbounds float* %tmp9914, i64 1
+ %tmp9916 = getelementptr inbounds float* %tmp9915, i64 1
+ %tmp9917 = getelementptr inbounds float* %tmp9916, i64 1
+ %tmp9918 = getelementptr inbounds float* %tmp9917, i64 1
+ %tmp9919 = getelementptr inbounds float* %tmp9918, i64 1
+ %tmp9920 = getelementptr inbounds float* %tmp9919, i64 1
+ %tmp9921 = getelementptr inbounds float* %tmp9920, i64 1
+ %tmp9922 = getelementptr inbounds float* %tmp9921, i64 1
+ %tmp9923 = getelementptr inbounds float* %tmp9922, i64 1
+ %tmp9924 = getelementptr inbounds float* %tmp9923, i64 1
+ %tmp9925 = getelementptr inbounds float* %tmp9924, i64 1
+ %tmp9926 = getelementptr inbounds float* %tmp9925, i64 1
+ %tmp9927 = getelementptr inbounds float* %tmp9926, i64 1
+ %tmp9928 = getelementptr inbounds float* %tmp9927, i64 1
+ %tmp9929 = getelementptr inbounds float* %tmp9928, i64 1
+ %tmp9930 = getelementptr inbounds float* %tmp9929, i64 1
+ %tmp9931 = getelementptr inbounds float* %tmp9930, i64 1
+ %tmp9932 = getelementptr inbounds float* %tmp9931, i64 1
+ %tmp9933 = getelementptr inbounds float* %tmp9932, i64 1
+ %tmp9934 = getelementptr inbounds float* %tmp9933, i64 1
+ %tmp9935 = getelementptr inbounds float* %tmp9934, i64 1
+ %tmp9936 = getelementptr inbounds float* %tmp9935, i64 1
+ %tmp9937 = getelementptr inbounds float* %tmp9936, i64 1
+ %tmp9938 = getelementptr inbounds float* %tmp9937, i64 1
+ %tmp9939 = getelementptr inbounds float* %tmp9938, i64 1
+ %tmp9940 = getelementptr inbounds float* %tmp9939, i64 1
+ %tmp9941 = getelementptr inbounds float* %tmp9940, i64 1
+ %tmp9942 = getelementptr inbounds float* %tmp9941, i64 1
+ %tmp9943 = getelementptr inbounds float* %tmp9942, i64 1
+ %tmp9944 = getelementptr inbounds float* %tmp9943, i64 1
+ %tmp9945 = getelementptr inbounds float* %tmp9944, i64 1
+ %tmp9946 = getelementptr inbounds float* %tmp9945, i64 1
+ %tmp9947 = getelementptr inbounds float* %tmp9946, i64 1
+ %tmp9948 = getelementptr inbounds float* %tmp9947, i64 1
+ %tmp9949 = getelementptr inbounds float* %tmp9948, i64 1
+ %tmp9950 = getelementptr inbounds float* %tmp9949, i64 1
+ %tmp9951 = getelementptr inbounds float* %tmp9950, i64 1
+ %tmp9952 = getelementptr inbounds float* %tmp9951, i64 1
+ %tmp9953 = getelementptr inbounds float* %tmp9952, i64 1
+ %tmp9954 = getelementptr inbounds float* %tmp9953, i64 1
+ %tmp9955 = getelementptr inbounds float* %tmp9954, i64 1
+ %tmp9956 = getelementptr inbounds float* %tmp9955, i64 1
+ %tmp9957 = getelementptr inbounds float* %tmp9956, i64 1
+ %tmp9958 = getelementptr inbounds float* %tmp9957, i64 1
+ %tmp9959 = getelementptr inbounds float* %tmp9958, i64 1
+ %tmp9960 = getelementptr inbounds float* %tmp9959, i64 1
+ %tmp9961 = getelementptr inbounds float* %tmp9960, i64 1
+ %tmp9962 = getelementptr inbounds float* %tmp9961, i64 1
+ %tmp9963 = getelementptr inbounds float* %tmp9962, i64 1
+ %tmp9964 = getelementptr inbounds float* %tmp9963, i64 1
+ %tmp9965 = getelementptr inbounds float* %tmp9964, i64 1
+ %tmp9966 = getelementptr inbounds float* %tmp9965, i64 1
+ %tmp9967 = getelementptr inbounds float* %tmp9966, i64 1
+ %tmp9968 = getelementptr inbounds float* %tmp9967, i64 1
+ %tmp9969 = getelementptr inbounds float* %tmp9968, i64 1
+ %tmp9970 = getelementptr inbounds float* %tmp9969, i64 1
+ %tmp9971 = getelementptr inbounds float* %tmp9970, i64 1
+ %tmp9972 = getelementptr inbounds float* %tmp9971, i64 1
+ %tmp9973 = getelementptr inbounds float* %tmp9972, i64 1
+ %tmp9974 = getelementptr inbounds float* %tmp9973, i64 1
+ %tmp9975 = getelementptr inbounds float* %tmp9974, i64 1
+ %tmp9976 = getelementptr inbounds float* %tmp9975, i64 1
+ %tmp9977 = getelementptr inbounds float* %tmp9976, i64 1
+ %tmp9978 = getelementptr inbounds float* %tmp9977, i64 1
+ %tmp9979 = getelementptr inbounds float* %tmp9978, i64 1
+ %tmp9980 = getelementptr inbounds float* %tmp9979, i64 1
+ %tmp9981 = getelementptr inbounds float* %tmp9980, i64 1
+ %tmp9982 = getelementptr inbounds float* %tmp9981, i64 1
+ %tmp9983 = getelementptr inbounds float* %tmp9982, i64 1
+ %tmp9984 = getelementptr inbounds float* %tmp9983, i64 1
+ %tmp9985 = getelementptr inbounds float* %tmp9984, i64 1
+ %tmp9986 = getelementptr inbounds float* %tmp9985, i64 1
+ %tmp9987 = getelementptr inbounds float* %tmp9986, i64 1
+ %tmp9988 = getelementptr inbounds float* %tmp9987, i64 1
+ %tmp9989 = getelementptr inbounds float* %tmp9988, i64 1
+ %tmp9990 = getelementptr inbounds float* %tmp9989, i64 1
+ %tmp9991 = getelementptr inbounds float* %tmp9990, i64 1
+ %tmp9992 = getelementptr inbounds float* %tmp9991, i64 1
+ %tmp9993 = getelementptr inbounds float* %tmp9992, i64 1
+ %tmp9994 = getelementptr inbounds float* %tmp9993, i64 1
+ %tmp9995 = getelementptr inbounds float* %tmp9994, i64 1
+ %tmp9996 = getelementptr inbounds float* %tmp9995, i64 1
+ %tmp9997 = getelementptr inbounds float* %tmp9996, i64 1
+ %tmp9998 = getelementptr inbounds float* %tmp9997, i64 1
+ %tmp9999 = getelementptr inbounds float* %tmp9998, i64 1
+ %tmp10000 = getelementptr inbounds float* %tmp9999, i64 1
+ %tmp10001 = getelementptr inbounds float* %tmp10000, i64 1
+ %tmp10002 = getelementptr inbounds float* %tmp10001, i64 1
+ %tmp10003 = getelementptr inbounds float* %tmp10002, i64 1
+ %tmp10004 = getelementptr inbounds float* %tmp10003, i64 1
+ %tmp10005 = getelementptr inbounds float* %tmp10004, i64 1
+ %tmp10006 = getelementptr inbounds float* %tmp10005, i64 1
+ %tmp10007 = getelementptr inbounds float* %tmp10006, i64 1
+ %tmp10008 = getelementptr inbounds float* %tmp10007, i64 1
+ %tmp10009 = getelementptr inbounds float* %tmp10008, i64 1
+ %tmp10010 = getelementptr inbounds float* %tmp10009, i64 1
+ %tmp10011 = getelementptr inbounds float* %tmp10010, i64 1
+ %tmp10012 = getelementptr inbounds float* %tmp10011, i64 1
+ %tmp10013 = getelementptr inbounds float* %tmp10012, i64 1
+ %tmp10014 = getelementptr inbounds float* %tmp10013, i64 1
+ %tmp10015 = getelementptr inbounds float* %tmp10014, i64 1
+ %tmp10016 = getelementptr inbounds float* %tmp10015, i64 1
+ %tmp10017 = getelementptr inbounds float* %tmp10016, i64 1
+ %tmp10018 = getelementptr inbounds float* %tmp10017, i64 1
+ %tmp10019 = getelementptr inbounds float* %tmp10018, i64 1
+ %tmp10020 = getelementptr inbounds float* %tmp10019, i64 1
+ %tmp10021 = getelementptr inbounds float* %tmp10020, i64 1
+ %tmp10022 = getelementptr inbounds float* %tmp10021, i64 1
+ %tmp10023 = getelementptr inbounds float* %tmp10022, i64 1
+ %tmp10024 = getelementptr inbounds float* %tmp10023, i64 1
+ %tmp10025 = getelementptr inbounds float* %tmp10024, i64 1
+ %tmp10026 = getelementptr inbounds float* %tmp10025, i64 1
+ %tmp10027 = getelementptr inbounds float* %tmp10026, i64 1
+ %tmp10028 = getelementptr inbounds float* %tmp10027, i64 1
+ %tmp10029 = getelementptr inbounds float* %tmp10028, i64 1
+ %tmp10030 = getelementptr inbounds float* %tmp10029, i64 1
+ %tmp10031 = getelementptr inbounds float* %tmp10030, i64 1
+ %tmp10032 = getelementptr inbounds float* %tmp10031, i64 1
+ %tmp10033 = getelementptr inbounds float* %tmp10032, i64 1
+ %tmp10034 = getelementptr inbounds float* %tmp10033, i64 1
+ %tmp10035 = getelementptr inbounds float* %tmp10034, i64 1
+ %tmp10036 = getelementptr inbounds float* %tmp10035, i64 1
+ %tmp10037 = getelementptr inbounds float* %tmp10036, i64 1
+ %tmp10038 = getelementptr inbounds float* %tmp10037, i64 1
+ %tmp10039 = getelementptr inbounds float* %tmp10038, i64 1
+ %tmp10040 = getelementptr inbounds float* %tmp10039, i64 1
+ %tmp10041 = getelementptr inbounds float* %tmp10040, i64 1
+ %tmp10042 = getelementptr inbounds float* %tmp10041, i64 1
+ %tmp10043 = getelementptr inbounds float* %tmp10042, i64 1
+ %tmp10044 = getelementptr inbounds float* %tmp10043, i64 1
+ %tmp10045 = getelementptr inbounds float* %tmp10044, i64 1
+ %tmp10046 = getelementptr inbounds float* %tmp10045, i64 1
+ %tmp10047 = getelementptr inbounds float* %tmp10046, i64 1
+ %tmp10048 = getelementptr inbounds float* %tmp10047, i64 1
+ %tmp10049 = getelementptr inbounds float* %tmp10048, i64 1
+ %tmp10050 = getelementptr inbounds float* %tmp10049, i64 1
+ %tmp10051 = getelementptr inbounds float* %tmp10050, i64 1
+ %tmp10052 = getelementptr inbounds float* %tmp10051, i64 1
+ %tmp10053 = getelementptr inbounds float* %tmp10052, i64 1
+ %tmp10054 = getelementptr inbounds float* %tmp10053, i64 1
+ %tmp10055 = getelementptr inbounds float* %tmp10054, i64 1
+ %tmp10056 = getelementptr inbounds float* %tmp10055, i64 1
+ %tmp10057 = getelementptr inbounds float* %tmp10056, i64 1
+ %tmp10058 = getelementptr inbounds float* %tmp10057, i64 1
+ %tmp10059 = getelementptr inbounds float* %tmp10058, i64 1
+ %tmp10060 = getelementptr inbounds float* %tmp10059, i64 1
+ %tmp10061 = getelementptr inbounds float* %tmp10060, i64 1
+ %tmp10062 = getelementptr inbounds float* %tmp10061, i64 1
+ %tmp10063 = getelementptr inbounds float* %tmp10062, i64 1
+ %tmp10064 = getelementptr inbounds float* %tmp10063, i64 1
+ %tmp10065 = getelementptr inbounds float* %tmp10064, i64 1
+ %tmp10066 = getelementptr inbounds float* %tmp10065, i64 1
+ %tmp10067 = getelementptr inbounds float* %tmp10066, i64 1
+ %tmp10068 = getelementptr inbounds float* %tmp10067, i64 1
+ %tmp10069 = getelementptr inbounds float* %tmp10068, i64 1
+ %tmp10070 = getelementptr inbounds float* %tmp10069, i64 1
+ %tmp10071 = getelementptr inbounds float* %tmp10070, i64 1
+ %tmp10072 = getelementptr inbounds float* %tmp10071, i64 1
+ %tmp10073 = getelementptr inbounds float* %tmp10072, i64 1
+ %tmp10074 = getelementptr inbounds float* %tmp10073, i64 1
+ %tmp10075 = getelementptr inbounds float* %tmp10074, i64 1
+ %tmp10076 = getelementptr inbounds float* %tmp10075, i64 1
+ %tmp10077 = getelementptr inbounds float* %tmp10076, i64 1
+ %tmp10078 = getelementptr inbounds float* %tmp10077, i64 1
+ %tmp10079 = getelementptr inbounds float* %tmp10078, i64 1
+ %tmp10080 = getelementptr inbounds float* %tmp10079, i64 1
+ %tmp10081 = getelementptr inbounds float* %tmp10080, i64 1
+ %tmp10082 = getelementptr inbounds float* %tmp10081, i64 1
+ %tmp10083 = getelementptr inbounds float* %tmp10082, i64 1
+ %tmp10084 = getelementptr inbounds float* %tmp10083, i64 1
+ %tmp10085 = getelementptr inbounds float* %tmp10084, i64 1
+ %tmp10086 = getelementptr inbounds float* %tmp10085, i64 1
+ %tmp10087 = getelementptr inbounds float* %tmp10086, i64 1
+ %tmp10088 = getelementptr inbounds float* %tmp10087, i64 1
+ %tmp10089 = getelementptr inbounds float* %tmp10088, i64 1
+ %tmp10090 = getelementptr inbounds float* %tmp10089, i64 1
+ %tmp10091 = getelementptr inbounds float* %tmp10090, i64 1
+ %tmp10092 = getelementptr inbounds float* %tmp10091, i64 1
+ %tmp10093 = getelementptr inbounds float* %tmp10092, i64 1
+ %tmp10094 = getelementptr inbounds float* %tmp10093, i64 1
+ %tmp10095 = getelementptr inbounds float* %tmp10094, i64 1
+ %tmp10096 = getelementptr inbounds float* %tmp10095, i64 1
+ %tmp10097 = getelementptr inbounds float* %tmp10096, i64 1
+ %tmp10098 = getelementptr inbounds float* %tmp10097, i64 1
+ %tmp10099 = getelementptr inbounds float* %tmp10098, i64 1
+ %tmp10100 = getelementptr inbounds float* %tmp10099, i64 1
+ %tmp10101 = getelementptr inbounds float* %tmp10100, i64 1
+ %tmp10102 = getelementptr inbounds float* %tmp10101, i64 1
+ %tmp10103 = getelementptr inbounds float* %tmp10102, i64 1
+ %tmp10104 = getelementptr inbounds float* %tmp10103, i64 1
+ %tmp10105 = getelementptr inbounds float* %tmp10104, i64 1
+ %tmp10106 = getelementptr inbounds float* %tmp10105, i64 1
+ %tmp10107 = getelementptr inbounds float* %tmp10106, i64 1
+ %tmp10108 = getelementptr inbounds float* %tmp10107, i64 1
+ %tmp10109 = getelementptr inbounds float* %tmp10108, i64 1
+ %tmp10110 = getelementptr inbounds float* %tmp10109, i64 1
+ %tmp10111 = getelementptr inbounds float* %tmp10110, i64 1
+ %tmp10112 = getelementptr inbounds float* %tmp10111, i64 1
+ %tmp10113 = getelementptr inbounds float* %tmp10112, i64 1
+ %tmp10114 = getelementptr inbounds float* %tmp10113, i64 1
+ %tmp10115 = getelementptr inbounds float* %tmp10114, i64 1
+ %tmp10116 = getelementptr inbounds float* %tmp10115, i64 1
+ %tmp10117 = getelementptr inbounds float* %tmp10116, i64 1
+ %tmp10118 = getelementptr inbounds float* %tmp10117, i64 1
+ %tmp10119 = getelementptr inbounds float* %tmp10118, i64 1
+ %tmp10120 = getelementptr inbounds float* %tmp10119, i64 1
+ %tmp10121 = getelementptr inbounds float* %tmp10120, i64 1
+ %tmp10122 = getelementptr inbounds float* %tmp10121, i64 1
+ %tmp10123 = getelementptr inbounds float* %tmp10122, i64 1
+ %tmp10124 = getelementptr inbounds float* %tmp10123, i64 1
+ %tmp10125 = getelementptr inbounds float* %tmp10124, i64 1
+ %tmp10126 = getelementptr inbounds float* %tmp10125, i64 1
+ %tmp10127 = getelementptr inbounds float* %tmp10126, i64 1
+ %tmp10128 = getelementptr inbounds float* %tmp10127, i64 1
+ %tmp10129 = getelementptr inbounds float* %tmp10128, i64 1
+ %tmp10130 = getelementptr inbounds float* %tmp10129, i64 1
+ %tmp10131 = getelementptr inbounds float* %tmp10130, i64 1
+ %tmp10132 = getelementptr inbounds float* %tmp10131, i64 1
+ %tmp10133 = getelementptr inbounds float* %tmp10132, i64 1
+ %tmp10134 = getelementptr inbounds float* %tmp10133, i64 1
+ %tmp10135 = getelementptr inbounds float* %tmp10134, i64 1
+ %tmp10136 = getelementptr inbounds float* %tmp10135, i64 1
+ %tmp10137 = getelementptr inbounds float* %tmp10136, i64 1
+ %tmp10138 = getelementptr inbounds float* %tmp10137, i64 1
+ %tmp10139 = getelementptr inbounds float* %tmp10138, i64 1
+ %tmp10140 = getelementptr inbounds float* %tmp10139, i64 1
+ %tmp10141 = getelementptr inbounds float* %tmp10140, i64 1
+ %tmp10142 = getelementptr inbounds float* %tmp10141, i64 1
+ %tmp10143 = getelementptr inbounds float* %tmp10142, i64 1
+ %tmp10144 = getelementptr inbounds float* %tmp10143, i64 1
+ %tmp10145 = getelementptr inbounds float* %tmp10144, i64 1
+ %tmp10146 = getelementptr inbounds float* %tmp10145, i64 1
+ %tmp10147 = getelementptr inbounds float* %tmp10146, i64 1
+ %tmp10148 = getelementptr inbounds float* %tmp10147, i64 1
+ %tmp10149 = getelementptr inbounds float* %tmp10148, i64 1
+ %tmp10150 = getelementptr inbounds float* %tmp10149, i64 1
+ %tmp10151 = getelementptr inbounds float* %tmp10150, i64 1
+ %tmp10152 = getelementptr inbounds float* %tmp10151, i64 1
+ %tmp10153 = getelementptr inbounds float* %tmp10152, i64 1
+ %tmp10154 = getelementptr inbounds float* %tmp10153, i64 1
+ %tmp10155 = getelementptr inbounds float* %tmp10154, i64 1
+ %tmp10156 = getelementptr inbounds float* %tmp10155, i64 1
+ %tmp10157 = getelementptr inbounds float* %tmp10156, i64 1
+ %tmp10158 = getelementptr inbounds float* %tmp10157, i64 1
+ %tmp10159 = getelementptr inbounds float* %tmp10158, i64 1
+ %tmp10160 = getelementptr inbounds float* %tmp10159, i64 1
+ %tmp10161 = getelementptr inbounds float* %tmp10160, i64 1
+ %tmp10162 = getelementptr inbounds float* %tmp10161, i64 1
+ %tmp10163 = getelementptr inbounds float* %tmp10162, i64 1
+ %tmp10164 = getelementptr inbounds float* %tmp10163, i64 1
+ %tmp10165 = getelementptr inbounds float* %tmp10164, i64 1
+ %tmp10166 = getelementptr inbounds float* %tmp10165, i64 1
+ %tmp10167 = getelementptr inbounds float* %tmp10166, i64 1
+ %tmp10168 = getelementptr inbounds float* %tmp10167, i64 1
+ %tmp10169 = getelementptr inbounds float* %tmp10168, i64 1
+ %tmp10170 = getelementptr inbounds float* %tmp10169, i64 1
+ %tmp10171 = getelementptr inbounds float* %tmp10170, i64 1
+ %tmp10172 = getelementptr inbounds float* %tmp10171, i64 1
+ %tmp10173 = getelementptr inbounds float* %tmp10172, i64 1
+ %tmp10174 = getelementptr inbounds float* %tmp10173, i64 1
+ %tmp10175 = getelementptr inbounds float* %tmp10174, i64 1
+ %tmp10176 = getelementptr inbounds float* %tmp10175, i64 1
+ %tmp10177 = getelementptr inbounds float* %tmp10176, i64 1
+ %tmp10178 = getelementptr inbounds float* %tmp10177, i64 1
+ %tmp10179 = getelementptr inbounds float* %tmp10178, i64 1
+ %tmp10180 = getelementptr inbounds float* %tmp10179, i64 1
+ %tmp10181 = getelementptr inbounds float* %tmp10180, i64 1
+ %tmp10182 = getelementptr inbounds float* %tmp10181, i64 1
+ %tmp10183 = getelementptr inbounds float* %tmp10182, i64 1
+ %tmp10184 = getelementptr inbounds float* %tmp10183, i64 1
+ %tmp10185 = getelementptr inbounds float* %tmp10184, i64 1
+ %tmp10186 = getelementptr inbounds float* %tmp10185, i64 1
+ %tmp10187 = getelementptr inbounds float* %tmp10186, i64 1
+ %tmp10188 = getelementptr inbounds float* %tmp10187, i64 1
+ %tmp10189 = getelementptr inbounds float* %tmp10188, i64 1
+ %tmp10190 = getelementptr inbounds float* %tmp10189, i64 1
+ %tmp10191 = getelementptr inbounds float* %tmp10190, i64 1
+ %tmp10192 = getelementptr inbounds float* %tmp10191, i64 1
+ %tmp10193 = getelementptr inbounds float* %tmp10192, i64 1
+ %tmp10194 = getelementptr inbounds float* %tmp10193, i64 1
+ %tmp10195 = getelementptr inbounds float* %tmp10194, i64 1
+ %tmp10196 = getelementptr inbounds float* %tmp10195, i64 1
+ %tmp10197 = getelementptr inbounds float* %tmp10196, i64 1
+ %tmp10198 = getelementptr inbounds float* %tmp10197, i64 1
+ %tmp10199 = getelementptr inbounds float* %tmp10198, i64 1
+ %tmp10200 = getelementptr inbounds float* %tmp10199, i64 1
+ %tmp10201 = getelementptr inbounds float* %tmp10200, i64 1
+ %tmp10202 = getelementptr inbounds float* %tmp10201, i64 1
+ %tmp10203 = getelementptr inbounds float* %tmp10202, i64 1
+ %tmp10204 = getelementptr inbounds float* %tmp10203, i64 1
+ %tmp10205 = getelementptr inbounds float* %tmp10204, i64 1
+ %tmp10206 = getelementptr inbounds float* %tmp10205, i64 1
+ %tmp10207 = getelementptr inbounds float* %tmp10206, i64 1
+ %tmp10208 = getelementptr inbounds float* %tmp10207, i64 1
+ %tmp10209 = getelementptr inbounds float* %tmp10208, i64 1
+ %tmp10210 = getelementptr inbounds float* %tmp10209, i64 1
+ %tmp10211 = getelementptr inbounds float* %tmp10210, i64 1
+ %tmp10212 = getelementptr inbounds float* %tmp10211, i64 1
+ %tmp10213 = getelementptr inbounds float* %tmp10212, i64 1
+ %tmp10214 = getelementptr inbounds float* %tmp10213, i64 1
+ %tmp10215 = getelementptr inbounds float* %tmp10214, i64 1
+ %tmp10216 = getelementptr inbounds float* %tmp10215, i64 1
+ %tmp10217 = getelementptr inbounds float* %tmp10216, i64 1
+ %tmp10218 = getelementptr inbounds float* %tmp10217, i64 1
+ %tmp10219 = getelementptr inbounds float* %tmp10218, i64 1
+ %tmp10220 = getelementptr inbounds float* %tmp10219, i64 1
+ %tmp10221 = getelementptr inbounds float* %tmp10220, i64 1
+ %tmp10222 = getelementptr inbounds float* %tmp10221, i64 1
+ %tmp10223 = getelementptr inbounds float* %tmp10222, i64 1
+ %tmp10224 = getelementptr inbounds float* %tmp10223, i64 1
+ %tmp10225 = getelementptr inbounds float* %tmp10224, i64 1
+ %tmp10226 = getelementptr inbounds float* %tmp10225, i64 1
+ %tmp10227 = getelementptr inbounds float* %tmp10226, i64 1
+ %tmp10228 = getelementptr inbounds float* %tmp10227, i64 1
+ %tmp10229 = getelementptr inbounds float* %tmp10228, i64 1
+ %tmp10230 = getelementptr inbounds float* %tmp10229, i64 1
+ %tmp10231 = getelementptr inbounds float* %tmp10230, i64 1
+ %tmp10232 = getelementptr inbounds float* %tmp10231, i64 1
+ %tmp10233 = getelementptr inbounds float* %tmp10232, i64 1
+ %tmp10234 = getelementptr inbounds float* %tmp10233, i64 1
+ %tmp10235 = getelementptr inbounds float* %tmp10234, i64 1
+ %tmp10236 = getelementptr inbounds float* %tmp10235, i64 1
+ %tmp10237 = getelementptr inbounds float* %tmp10236, i64 1
+ %tmp10238 = getelementptr inbounds float* %tmp10237, i64 1
+ %tmp10239 = getelementptr inbounds float* %tmp10238, i64 1
+ %tmp10240 = getelementptr inbounds float* %tmp10239, i64 1
+ %tmp10241 = getelementptr inbounds float* %tmp10240, i64 1
+ %tmp10242 = getelementptr inbounds float* %tmp10241, i64 1
+ %tmp10243 = getelementptr inbounds float* %tmp10242, i64 1
+ %tmp10244 = getelementptr inbounds float* %tmp10243, i64 1
+ %tmp10245 = getelementptr inbounds float* %tmp10244, i64 1
+ %tmp10246 = getelementptr inbounds float* %tmp10245, i64 1
+ %tmp10247 = getelementptr inbounds float* %tmp10246, i64 1
+ %tmp10248 = getelementptr inbounds float* %tmp10247, i64 1
+ %tmp10249 = getelementptr inbounds float* %tmp10248, i64 1
+ %tmp10250 = getelementptr inbounds float* %tmp10249, i64 1
+ %tmp10251 = getelementptr inbounds float* %tmp10250, i64 1
+ %tmp10252 = getelementptr inbounds float* %tmp10251, i64 1
+ %tmp10253 = getelementptr inbounds float* %tmp10252, i64 1
+ %tmp10254 = getelementptr inbounds float* %tmp10253, i64 1
+ %tmp10255 = getelementptr inbounds float* %tmp10254, i64 1
+ %tmp10256 = getelementptr inbounds float* %tmp10255, i64 1
+ %tmp10257 = getelementptr inbounds float* %tmp10256, i64 1
+ %tmp10258 = getelementptr inbounds float* %tmp10257, i64 1
+ %tmp10259 = getelementptr inbounds float* %tmp10258, i64 1
+ %tmp10260 = getelementptr inbounds float* %tmp10259, i64 1
+ %tmp10261 = getelementptr inbounds float* %tmp10260, i64 1
+ %tmp10262 = getelementptr inbounds float* %tmp10261, i64 1
+ %tmp10263 = getelementptr inbounds float* %tmp10262, i64 1
+ %tmp10264 = getelementptr inbounds float* %tmp10263, i64 1
+ %tmp10265 = getelementptr inbounds float* %tmp10264, i64 1
+ %tmp10266 = getelementptr inbounds float* %tmp10265, i64 1
+ %tmp10267 = getelementptr inbounds float* %tmp10266, i64 1
+ %tmp10268 = getelementptr inbounds float* %tmp10267, i64 1
+ %tmp10269 = getelementptr inbounds float* %tmp10268, i64 1
+ %tmp10270 = getelementptr inbounds float* %tmp10269, i64 1
+ %tmp10271 = getelementptr inbounds float* %tmp10270, i64 1
+ %tmp10272 = getelementptr inbounds float* %tmp10271, i64 1
+ %tmp10273 = getelementptr inbounds float* %tmp10272, i64 1
+ %tmp10274 = getelementptr inbounds float* %tmp10273, i64 1
+ %tmp10275 = getelementptr inbounds float* %tmp10274, i64 1
+ %tmp10276 = getelementptr inbounds float* %tmp10275, i64 1
+ %tmp10277 = getelementptr inbounds float* %tmp10276, i64 1
+ %tmp10278 = getelementptr inbounds float* %tmp10277, i64 1
+ %tmp10279 = getelementptr inbounds float* %tmp10278, i64 1
+ %tmp10280 = getelementptr inbounds float* %tmp10279, i64 1
+ %tmp10281 = getelementptr inbounds float* %tmp10280, i64 1
+ %tmp10282 = getelementptr inbounds float* %tmp10281, i64 1
+ %tmp10283 = getelementptr inbounds float* %tmp10282, i64 1
+ %tmp10284 = getelementptr inbounds float* %tmp10283, i64 1
+ %tmp10285 = getelementptr inbounds float* %tmp10284, i64 1
+ %tmp10286 = getelementptr inbounds float* %tmp10285, i64 1
+ %tmp10287 = getelementptr inbounds float* %tmp10286, i64 1
+ %tmp10288 = getelementptr inbounds float* %tmp10287, i64 1
+ %tmp10289 = getelementptr inbounds float* %tmp10288, i64 1
+ %tmp10290 = getelementptr inbounds float* %tmp10289, i64 1
+ %tmp10291 = getelementptr inbounds float* %tmp10290, i64 1
+ %tmp10292 = getelementptr inbounds float* %tmp10291, i64 1
+ %tmp10293 = getelementptr inbounds float* %tmp10292, i64 1
+ %tmp10294 = getelementptr inbounds float* %tmp10293, i64 1
+ %tmp10295 = getelementptr inbounds float* %tmp10294, i64 1
+ %tmp10296 = getelementptr inbounds float* %tmp10295, i64 1
+ %tmp10297 = getelementptr inbounds float* %tmp10296, i64 1
+ %tmp10298 = getelementptr inbounds float* %tmp10297, i64 1
+ %tmp10299 = getelementptr inbounds float* %tmp10298, i64 1
+ %tmp10300 = getelementptr inbounds float* %tmp10299, i64 1
+ %tmp10301 = getelementptr inbounds float* %tmp10300, i64 1
+ %tmp10302 = getelementptr inbounds float* %tmp10301, i64 1
+ %tmp10303 = getelementptr inbounds float* %tmp10302, i64 1
+ %tmp10304 = getelementptr inbounds float* %tmp10303, i64 1
+ %tmp10305 = getelementptr inbounds float* %tmp10304, i64 1
+ %tmp10306 = getelementptr inbounds float* %tmp10305, i64 1
+ %tmp10307 = getelementptr inbounds float* %tmp10306, i64 1
+ %tmp10308 = getelementptr inbounds float* %tmp10307, i64 1
+ %tmp10309 = getelementptr inbounds float* %tmp10308, i64 1
+ %tmp10310 = getelementptr inbounds float* %tmp10309, i64 1
+ %tmp10311 = getelementptr inbounds float* %tmp10310, i64 1
+ %tmp10312 = getelementptr inbounds float* %tmp10311, i64 1
+ %tmp10313 = getelementptr inbounds float* %tmp10312, i64 1
+ %tmp10314 = getelementptr inbounds float* %tmp10313, i64 1
+ %tmp10315 = getelementptr inbounds float* %tmp10314, i64 1
+ %tmp10316 = getelementptr inbounds float* %tmp10315, i64 1
+ %tmp10317 = getelementptr inbounds float* %tmp10316, i64 1
+ %tmp10318 = getelementptr inbounds float* %tmp10317, i64 1
+ %tmp10319 = getelementptr inbounds float* %tmp10318, i64 1
+ %tmp10320 = getelementptr inbounds float* %tmp10319, i64 1
+ %tmp10321 = getelementptr inbounds float* %tmp10320, i64 1
+ %tmp10322 = getelementptr inbounds float* %tmp10321, i64 1
+ %tmp10323 = getelementptr inbounds float* %tmp10322, i64 1
+ %tmp10324 = getelementptr inbounds float* %tmp10323, i64 1
+ %tmp10325 = getelementptr inbounds float* %tmp10324, i64 1
+ %tmp10326 = getelementptr inbounds float* %tmp10325, i64 1
+ %tmp10327 = getelementptr inbounds float* %tmp10326, i64 1
+ %tmp10328 = getelementptr inbounds float* %tmp10327, i64 1
+ %tmp10329 = getelementptr inbounds float* %tmp10328, i64 1
+ %tmp10330 = getelementptr inbounds float* %tmp10329, i64 1
+ %tmp10331 = getelementptr inbounds float* %tmp10330, i64 1
+ %tmp10332 = getelementptr inbounds float* %tmp10331, i64 1
+ %tmp10333 = getelementptr inbounds float* %tmp10332, i64 1
+ %tmp10334 = getelementptr inbounds float* %tmp10333, i64 1
+ %tmp10335 = getelementptr inbounds float* %tmp10334, i64 1
+ %tmp10336 = getelementptr inbounds float* %tmp10335, i64 1
+ %tmp10337 = getelementptr inbounds float* %tmp10336, i64 1
+ %tmp10338 = getelementptr inbounds float* %tmp10337, i64 1
+ %tmp10339 = getelementptr inbounds float* %tmp10338, i64 1
+ %tmp10340 = getelementptr inbounds float* %tmp10339, i64 1
+ %tmp10341 = getelementptr inbounds float* %tmp10340, i64 1
+ %tmp10342 = getelementptr inbounds float* %tmp10341, i64 1
+ %tmp10343 = getelementptr inbounds float* %tmp10342, i64 1
+ %tmp10344 = getelementptr inbounds float* %tmp10343, i64 1
+ %tmp10345 = getelementptr inbounds float* %tmp10344, i64 1
+ %tmp10346 = getelementptr inbounds float* %tmp10345, i64 1
+ %tmp10347 = getelementptr inbounds float* %tmp10346, i64 1
+ %tmp10348 = getelementptr inbounds float* %tmp10347, i64 1
+ %tmp10349 = getelementptr inbounds float* %tmp10348, i64 1
+ %tmp10350 = getelementptr inbounds float* %tmp10349, i64 1
+ %tmp10351 = getelementptr inbounds float* %tmp10350, i64 1
+ %tmp10352 = getelementptr inbounds float* %tmp10351, i64 1
+ %tmp10353 = getelementptr inbounds float* %tmp10352, i64 1
+ %tmp10354 = getelementptr inbounds float* %tmp10353, i64 1
+ %tmp10355 = getelementptr inbounds float* %tmp10354, i64 1
+ %tmp10356 = getelementptr inbounds float* %tmp10355, i64 1
+ %tmp10357 = getelementptr inbounds float* %tmp10356, i64 1
+ %tmp10358 = getelementptr inbounds float* %tmp10357, i64 1
+ %tmp10359 = getelementptr inbounds float* %tmp10358, i64 1
+ %tmp10360 = getelementptr inbounds float* %tmp10359, i64 1
+ %tmp10361 = getelementptr inbounds float* %tmp10360, i64 1
+ %tmp10362 = getelementptr inbounds float* %tmp10361, i64 1
+ %tmp10363 = getelementptr inbounds float* %tmp10362, i64 1
+ %tmp10364 = getelementptr inbounds float* %tmp10363, i64 1
+ %tmp10365 = getelementptr inbounds float* %tmp10364, i64 1
+ %tmp10366 = getelementptr inbounds float* %tmp10365, i64 1
+ %tmp10367 = getelementptr inbounds float* %tmp10366, i64 1
+ %tmp10368 = getelementptr inbounds float* %tmp10367, i64 1
+ %tmp10369 = getelementptr inbounds float* %tmp10368, i64 1
+ %tmp10370 = getelementptr inbounds float* %tmp10369, i64 1
+ %tmp10371 = getelementptr inbounds float* %tmp10370, i64 1
+ %tmp10372 = getelementptr inbounds float* %tmp10371, i64 1
+ %tmp10373 = getelementptr inbounds float* %tmp10372, i64 1
+ %tmp10374 = getelementptr inbounds float* %tmp10373, i64 1
+ %tmp10375 = getelementptr inbounds float* %tmp10374, i64 1
+ %tmp10376 = getelementptr inbounds float* %tmp10375, i64 1
+ %tmp10377 = getelementptr inbounds float* %tmp10376, i64 1
+ %tmp10378 = getelementptr inbounds float* %tmp10377, i64 1
+ %tmp10379 = getelementptr inbounds float* %tmp10378, i64 1
+ %tmp10380 = getelementptr inbounds float* %tmp10379, i64 1
+ %tmp10381 = getelementptr inbounds float* %tmp10380, i64 1
+ %tmp10382 = getelementptr inbounds float* %tmp10381, i64 1
+ %tmp10383 = getelementptr inbounds float* %tmp10382, i64 1
+ %tmp10384 = getelementptr inbounds float* %tmp10383, i64 1
+ %tmp10385 = getelementptr inbounds float* %tmp10384, i64 1
+ %tmp10386 = getelementptr inbounds float* %tmp10385, i64 1
+ %tmp10387 = getelementptr inbounds float* %tmp10386, i64 1
+ %tmp10388 = getelementptr inbounds float* %tmp10387, i64 1
+ %tmp10389 = getelementptr inbounds float* %tmp10388, i64 1
+ %tmp10390 = getelementptr inbounds float* %tmp10389, i64 1
+ %tmp10391 = getelementptr inbounds float* %tmp10390, i64 1
+ %tmp10392 = getelementptr inbounds float* %tmp10391, i64 1
+ %tmp10393 = getelementptr inbounds float* %tmp10392, i64 1
+ %tmp10394 = getelementptr inbounds float* %tmp10393, i64 1
+ %tmp10395 = getelementptr inbounds float* %tmp10394, i64 1
+ %tmp10396 = getelementptr inbounds float* %tmp10395, i64 1
+ %tmp10397 = getelementptr inbounds float* %tmp10396, i64 1
+ %tmp10398 = getelementptr inbounds float* %tmp10397, i64 1
+ %tmp10399 = getelementptr inbounds float* %tmp10398, i64 1
+ %tmp10400 = getelementptr inbounds float* %tmp10399, i64 1
+ %tmp10401 = getelementptr inbounds float* %tmp10400, i64 1
+ %tmp10402 = getelementptr inbounds float* %tmp10401, i64 1
+ %tmp10403 = getelementptr inbounds float* %tmp10402, i64 1
+ %tmp10404 = getelementptr inbounds float* %tmp10403, i64 1
+ %tmp10405 = getelementptr inbounds float* %tmp10404, i64 1
+ %tmp10406 = getelementptr inbounds float* %tmp10405, i64 1
+ %tmp10407 = getelementptr inbounds float* %tmp10406, i64 1
+ %tmp10408 = getelementptr inbounds float* %tmp10407, i64 1
+ %tmp10409 = getelementptr inbounds float* %tmp10408, i64 1
+ %tmp10410 = getelementptr inbounds float* %tmp10409, i64 1
+ %tmp10411 = getelementptr inbounds float* %tmp10410, i64 1
+ %tmp10412 = getelementptr inbounds float* %tmp10411, i64 1
+ %tmp10413 = getelementptr inbounds float* %tmp10412, i64 1
+ %tmp10414 = getelementptr inbounds float* %tmp10413, i64 1
+ %tmp10415 = getelementptr inbounds float* %tmp10414, i64 1
+ %tmp10416 = getelementptr inbounds float* %tmp10415, i64 1
+ %tmp10417 = getelementptr inbounds float* %tmp10416, i64 1
+ %tmp10418 = getelementptr inbounds float* %tmp10417, i64 1
+ %tmp10419 = getelementptr inbounds float* %tmp10418, i64 1
+ %tmp10420 = getelementptr inbounds float* %tmp10419, i64 1
+ %tmp10421 = getelementptr inbounds float* %tmp10420, i64 1
+ %tmp10422 = getelementptr inbounds float* %tmp10421, i64 1
+ %tmp10423 = getelementptr inbounds float* %tmp10422, i64 1
+ %tmp10424 = getelementptr inbounds float* %tmp10423, i64 1
+ %tmp10425 = getelementptr inbounds float* %tmp10424, i64 1
+ %tmp10426 = getelementptr inbounds float* %tmp10425, i64 1
+ %tmp10427 = getelementptr inbounds float* %tmp10426, i64 1
+ %tmp10428 = getelementptr inbounds float* %tmp10427, i64 1
+ %tmp10429 = getelementptr inbounds float* %tmp10428, i64 1
+ %tmp10430 = getelementptr inbounds float* %tmp10429, i64 1
+ %tmp10431 = getelementptr inbounds float* %tmp10430, i64 1
+ %tmp10432 = getelementptr inbounds float* %tmp10431, i64 1
+ %tmp10433 = getelementptr inbounds float* %tmp10432, i64 1
+ %tmp10434 = getelementptr inbounds float* %tmp10433, i64 1
+ %tmp10435 = getelementptr inbounds float* %tmp10434, i64 1
+ %tmp10436 = getelementptr inbounds float* %tmp10435, i64 1
+ %tmp10437 = getelementptr inbounds float* %tmp10436, i64 1
+ %tmp10438 = getelementptr inbounds float* %tmp10437, i64 1
+ %tmp10439 = getelementptr inbounds float* %tmp10438, i64 1
+ %tmp10440 = getelementptr inbounds float* %tmp10439, i64 1
+ %tmp10441 = getelementptr inbounds float* %tmp10440, i64 1
+ %tmp10442 = getelementptr inbounds float* %tmp10441, i64 1
+ %tmp10443 = getelementptr inbounds float* %tmp10442, i64 1
+ %tmp10444 = getelementptr inbounds float* %tmp10443, i64 1
+ %tmp10445 = getelementptr inbounds float* %tmp10444, i64 1
+ %tmp10446 = getelementptr inbounds float* %tmp10445, i64 1
+ %tmp10447 = getelementptr inbounds float* %tmp10446, i64 1
+ %tmp10448 = getelementptr inbounds float* %tmp10447, i64 1
+ %tmp10449 = getelementptr inbounds float* %tmp10448, i64 1
+ %tmp10450 = getelementptr inbounds float* %tmp10449, i64 1
+ %tmp10451 = getelementptr inbounds float* %tmp10450, i64 1
+ %tmp10452 = getelementptr inbounds float* %tmp10451, i64 1
+ %tmp10453 = getelementptr inbounds float* %tmp10452, i64 1
+ %tmp10454 = getelementptr inbounds float* %tmp10453, i64 1
+ %tmp10455 = getelementptr inbounds float* %tmp10454, i64 1
+ %tmp10456 = getelementptr inbounds float* %tmp10455, i64 1
+ %tmp10457 = getelementptr inbounds float* %tmp10456, i64 1
+ %tmp10458 = getelementptr inbounds float* %tmp10457, i64 1
+ %tmp10459 = getelementptr inbounds float* %tmp10458, i64 1
+ %tmp10460 = getelementptr inbounds float* %tmp10459, i64 1
+ %tmp10461 = getelementptr inbounds float* %tmp10460, i64 1
+ %tmp10462 = getelementptr inbounds float* %tmp10461, i64 1
+ %tmp10463 = getelementptr inbounds float* %tmp10462, i64 1
+ %tmp10464 = getelementptr inbounds float* %tmp10463, i64 1
+ %tmp10465 = getelementptr inbounds float* %tmp10464, i64 1
+ %tmp10466 = getelementptr inbounds float* %tmp10465, i64 1
+ %tmp10467 = getelementptr inbounds float* %tmp10466, i64 1
+ %tmp10468 = getelementptr inbounds float* %tmp10467, i64 1
+ %tmp10469 = getelementptr inbounds float* %tmp10468, i64 1
+ %tmp10470 = getelementptr inbounds float* %tmp10469, i64 1
+ %tmp10471 = getelementptr inbounds float* %tmp10470, i64 1
+ %tmp10472 = getelementptr inbounds float* %tmp10471, i64 1
+ %tmp10473 = getelementptr inbounds float* %tmp10472, i64 1
+ %tmp10474 = getelementptr inbounds float* %tmp10473, i64 1
+ %tmp10475 = getelementptr inbounds float* %tmp10474, i64 1
+ %tmp10476 = getelementptr inbounds float* %tmp10475, i64 1
+ %tmp10477 = getelementptr inbounds float* %tmp10476, i64 1
+ %tmp10478 = getelementptr inbounds float* %tmp10477, i64 1
+ %tmp10479 = getelementptr inbounds float* %tmp10478, i64 1
+ %tmp10480 = getelementptr inbounds float* %tmp10479, i64 1
+ %tmp10481 = getelementptr inbounds float* %tmp10480, i64 1
+ %tmp10482 = getelementptr inbounds float* %tmp10481, i64 1
+ %tmp10483 = getelementptr inbounds float* %tmp10482, i64 1
+ %tmp10484 = getelementptr inbounds float* %tmp10483, i64 1
+ %tmp10485 = getelementptr inbounds float* %tmp10484, i64 1
+ %tmp10486 = getelementptr inbounds float* %tmp10485, i64 1
+ %tmp10487 = getelementptr inbounds float* %tmp10486, i64 1
+ %tmp10488 = getelementptr inbounds float* %tmp10487, i64 1
+ %tmp10489 = getelementptr inbounds float* %tmp10488, i64 1
+ %tmp10490 = getelementptr inbounds float* %tmp10489, i64 1
+ %tmp10491 = getelementptr inbounds float* %tmp10490, i64 1
+ %tmp10492 = getelementptr inbounds float* %tmp10491, i64 1
+ %tmp10493 = getelementptr inbounds float* %tmp10492, i64 1
+ %tmp10494 = getelementptr inbounds float* %tmp10493, i64 1
+ %tmp10495 = getelementptr inbounds float* %tmp10494, i64 1
+ %tmp10496 = getelementptr inbounds float* %tmp10495, i64 1
+ %tmp10497 = getelementptr inbounds float* %tmp10496, i64 1
+ %tmp10498 = getelementptr inbounds float* %tmp10497, i64 1
+ %tmp10499 = getelementptr inbounds float* %tmp10498, i64 1
+ %tmp10500 = getelementptr inbounds float* %tmp10499, i64 1
+ %tmp10501 = getelementptr inbounds float* %tmp10500, i64 1
+ %tmp10502 = getelementptr inbounds float* %tmp10501, i64 1
+ %tmp10503 = getelementptr inbounds float* %tmp10502, i64 1
+ %tmp10504 = getelementptr inbounds float* %tmp10503, i64 1
+ %tmp10505 = getelementptr inbounds float* %tmp10504, i64 1
+ %tmp10506 = getelementptr inbounds float* %tmp10505, i64 1
+ %tmp10507 = getelementptr inbounds float* %tmp10506, i64 1
+ %tmp10508 = getelementptr inbounds float* %tmp10507, i64 1
+ %tmp10509 = getelementptr inbounds float* %tmp10508, i64 1
+ %tmp10510 = getelementptr inbounds float* %tmp10509, i64 1
+ %tmp10511 = getelementptr inbounds float* %tmp10510, i64 1
+ %tmp10512 = getelementptr inbounds float* %tmp10511, i64 1
+ %tmp10513 = getelementptr inbounds float* %tmp10512, i64 1
+ %tmp10514 = getelementptr inbounds float* %tmp10513, i64 1
+ %tmp10515 = getelementptr inbounds float* %tmp10514, i64 1
+ %tmp10516 = getelementptr inbounds float* %tmp10515, i64 1
+ %tmp10517 = getelementptr inbounds float* %tmp10516, i64 1
+ %tmp10518 = getelementptr inbounds float* %tmp10517, i64 1
+ %tmp10519 = getelementptr inbounds float* %tmp10518, i64 1
+ %tmp10520 = getelementptr inbounds float* %tmp10519, i64 1
+ %tmp10521 = getelementptr inbounds float* %tmp10520, i64 1
+ %tmp10522 = getelementptr inbounds float* %tmp10521, i64 1
+ %tmp10523 = getelementptr inbounds float* %tmp10522, i64 1
+ %tmp10524 = getelementptr inbounds float* %tmp10523, i64 1
+ %tmp10525 = getelementptr inbounds float* %tmp10524, i64 1
+ %tmp10526 = getelementptr inbounds float* %tmp10525, i64 1
+ %tmp10527 = getelementptr inbounds float* %tmp10526, i64 1
+ %tmp10528 = getelementptr inbounds float* %tmp10527, i64 1
+ %tmp10529 = getelementptr inbounds float* %tmp10528, i64 1
+ %tmp10530 = getelementptr inbounds float* %tmp10529, i64 1
+ %tmp10531 = getelementptr inbounds float* %tmp10530, i64 1
+ %tmp10532 = getelementptr inbounds float* %tmp10531, i64 1
+ %tmp10533 = getelementptr inbounds float* %tmp10532, i64 1
+ %tmp10534 = getelementptr inbounds float* %tmp10533, i64 1
+ %tmp10535 = getelementptr inbounds float* %tmp10534, i64 1
+ %tmp10536 = getelementptr inbounds float* %tmp10535, i64 1
+ %tmp10537 = getelementptr inbounds float* %tmp10536, i64 1
+ %tmp10538 = getelementptr inbounds float* %tmp10537, i64 1
+ %tmp10539 = getelementptr inbounds float* %tmp10538, i64 1
+ %tmp10540 = getelementptr inbounds float* %tmp10539, i64 1
+ %tmp10541 = getelementptr inbounds float* %tmp10540, i64 1
+ %tmp10542 = getelementptr inbounds float* %tmp10541, i64 1
+ %tmp10543 = getelementptr inbounds float* %tmp10542, i64 1
+ %tmp10544 = getelementptr inbounds float* %tmp10543, i64 1
+ %tmp10545 = getelementptr inbounds float* %tmp10544, i64 1
+ %tmp10546 = getelementptr inbounds float* %tmp10545, i64 1
+ %tmp10547 = getelementptr inbounds float* %tmp10546, i64 1
+ %tmp10548 = getelementptr inbounds float* %tmp10547, i64 1
+ %tmp10549 = getelementptr inbounds float* %tmp10548, i64 1
+ %tmp10550 = getelementptr inbounds float* %tmp10549, i64 1
+ %tmp10551 = getelementptr inbounds float* %tmp10550, i64 1
+ %tmp10552 = getelementptr inbounds float* %tmp10551, i64 1
+ %tmp10553 = getelementptr inbounds float* %tmp10552, i64 1
+ %tmp10554 = getelementptr inbounds float* %tmp10553, i64 1
+ %tmp10555 = getelementptr inbounds float* %tmp10554, i64 1
+ %tmp10556 = getelementptr inbounds float* %tmp10555, i64 1
+ %tmp10557 = getelementptr inbounds float* %tmp10556, i64 1
+ %tmp10558 = getelementptr inbounds float* %tmp10557, i64 1
+ %tmp10559 = getelementptr inbounds float* %tmp10558, i64 1
+ %tmp10560 = getelementptr inbounds float* %tmp10559, i64 1
+ %tmp10561 = getelementptr inbounds float* %tmp10560, i64 1
+ %tmp10562 = getelementptr inbounds float* %tmp10561, i64 1
+ %tmp10563 = getelementptr inbounds float* %tmp10562, i64 1
+ %tmp10564 = getelementptr inbounds float* %tmp10563, i64 1
+ %tmp10565 = getelementptr inbounds float* %tmp10564, i64 1
+ %tmp10566 = getelementptr inbounds float* %tmp10565, i64 1
+ %tmp10567 = getelementptr inbounds float* %tmp10566, i64 1
+ %tmp10568 = getelementptr inbounds float* %tmp10567, i64 1
+ %tmp10569 = getelementptr inbounds float* %tmp10568, i64 1
+ %tmp10570 = getelementptr inbounds float* %tmp10569, i64 1
+ %tmp10571 = getelementptr inbounds float* %tmp10570, i64 1
+ %tmp10572 = getelementptr inbounds float* %tmp10571, i64 1
+ %tmp10573 = getelementptr inbounds float* %tmp10572, i64 1
+ %tmp10574 = getelementptr inbounds float* %tmp10573, i64 1
+ %tmp10575 = getelementptr inbounds float* %tmp10574, i64 1
+ %tmp10576 = getelementptr inbounds float* %tmp10575, i64 1
+ %tmp10577 = getelementptr inbounds float* %tmp10576, i64 1
+ %tmp10578 = getelementptr inbounds float* %tmp10577, i64 1
+ %tmp10579 = getelementptr inbounds float* %tmp10578, i64 1
+ %tmp10580 = getelementptr inbounds float* %tmp10579, i64 1
+ %tmp10581 = getelementptr inbounds float* %tmp10580, i64 1
+ %tmp10582 = getelementptr inbounds float* %tmp10581, i64 1
+ %tmp10583 = getelementptr inbounds float* %tmp10582, i64 1
+ %tmp10584 = getelementptr inbounds float* %tmp10583, i64 1
+ %tmp10585 = getelementptr inbounds float* %tmp10584, i64 1
+ %tmp10586 = getelementptr inbounds float* %tmp10585, i64 1
+ %tmp10587 = getelementptr inbounds float* %tmp10586, i64 1
+ %tmp10588 = getelementptr inbounds float* %tmp10587, i64 1
+ %tmp10589 = getelementptr inbounds float* %tmp10588, i64 1
+ %tmp10590 = getelementptr inbounds float* %tmp10589, i64 1
+ %tmp10591 = getelementptr inbounds float* %tmp10590, i64 1
+ %tmp10592 = getelementptr inbounds float* %tmp10591, i64 1
+ %tmp10593 = getelementptr inbounds float* %tmp10592, i64 1
+ %tmp10594 = getelementptr inbounds float* %tmp10593, i64 1
+ %tmp10595 = getelementptr inbounds float* %tmp10594, i64 1
+ %tmp10596 = getelementptr inbounds float* %tmp10595, i64 1
+ %tmp10597 = getelementptr inbounds float* %tmp10596, i64 1
+ %tmp10598 = getelementptr inbounds float* %tmp10597, i64 1
+ %tmp10599 = getelementptr inbounds float* %tmp10598, i64 1
+ %tmp10600 = getelementptr inbounds float* %tmp10599, i64 1
+ %tmp10601 = getelementptr inbounds float* %tmp10600, i64 1
+ %tmp10602 = getelementptr inbounds float* %tmp10601, i64 1
+ %tmp10603 = getelementptr inbounds float* %tmp10602, i64 1
+ %tmp10604 = getelementptr inbounds float* %tmp10603, i64 1
+ %tmp10605 = getelementptr inbounds float* %tmp10604, i64 1
+ %tmp10606 = getelementptr inbounds float* %tmp10605, i64 1
+ %tmp10607 = getelementptr inbounds float* %tmp10606, i64 1
+ %tmp10608 = getelementptr inbounds float* %tmp10607, i64 1
+ %tmp10609 = getelementptr inbounds float* %tmp10608, i64 1
+ %tmp10610 = getelementptr inbounds float* %tmp10609, i64 1
+ %tmp10611 = getelementptr inbounds float* %tmp10610, i64 1
+ %tmp10612 = getelementptr inbounds float* %tmp10611, i64 1
+ %tmp10613 = getelementptr inbounds float* %tmp10612, i64 1
+ %tmp10614 = getelementptr inbounds float* %tmp10613, i64 1
+ %tmp10615 = getelementptr inbounds float* %tmp10614, i64 1
+ %tmp10616 = getelementptr inbounds float* %tmp10615, i64 1
+ %tmp10617 = getelementptr inbounds float* %tmp10616, i64 1
+ %tmp10618 = getelementptr inbounds float* %tmp10617, i64 1
+ %tmp10619 = getelementptr inbounds float* %tmp10618, i64 1
+ %tmp10620 = getelementptr inbounds float* %tmp10619, i64 1
+ %tmp10621 = getelementptr inbounds float* %tmp10620, i64 1
+ %tmp10622 = getelementptr inbounds float* %tmp10621, i64 1
+ %tmp10623 = getelementptr inbounds float* %tmp10622, i64 1
+ %tmp10624 = getelementptr inbounds float* %tmp10623, i64 1
+ %tmp10625 = getelementptr inbounds float* %tmp10624, i64 1
+ %tmp10626 = getelementptr inbounds float* %tmp10625, i64 1
+ %tmp10627 = getelementptr inbounds float* %tmp10626, i64 1
+ %tmp10628 = getelementptr inbounds float* %tmp10627, i64 1
+ %tmp10629 = getelementptr inbounds float* %tmp10628, i64 1
+ %tmp10630 = getelementptr inbounds float* %tmp10629, i64 1
+ %tmp10631 = getelementptr inbounds float* %tmp10630, i64 1
+ %tmp10632 = getelementptr inbounds float* %tmp10631, i64 1
+ %tmp10633 = getelementptr inbounds float* %tmp10632, i64 1
+ %tmp10634 = getelementptr inbounds float* %tmp10633, i64 1
+ %tmp10635 = getelementptr inbounds float* %tmp10634, i64 1
+ %tmp10636 = getelementptr inbounds float* %tmp10635, i64 1
+ %tmp10637 = getelementptr inbounds float* %tmp10636, i64 1
+ %tmp10638 = getelementptr inbounds float* %tmp10637, i64 1
+ %tmp10639 = getelementptr inbounds float* %tmp10638, i64 1
+ %tmp10640 = getelementptr inbounds float* %tmp10639, i64 1
+ %tmp10641 = getelementptr inbounds float* %tmp10640, i64 1
+ %tmp10642 = getelementptr inbounds float* %tmp10641, i64 1
+ %tmp10643 = getelementptr inbounds float* %tmp10642, i64 1
+ %tmp10644 = getelementptr inbounds float* %tmp10643, i64 1
+ %tmp10645 = getelementptr inbounds float* %tmp10644, i64 1
+ %tmp10646 = getelementptr inbounds float* %tmp10645, i64 1
+ %tmp10647 = getelementptr inbounds float* %tmp10646, i64 1
+ %tmp10648 = getelementptr inbounds float* %tmp10647, i64 1
+ %tmp10649 = getelementptr inbounds float* %tmp10648, i64 1
+ %tmp10650 = getelementptr inbounds float* %tmp10649, i64 1
+ %tmp10651 = getelementptr inbounds float* %tmp10650, i64 1
+ %tmp10652 = getelementptr inbounds float* %tmp10651, i64 1
+ %tmp10653 = getelementptr inbounds float* %tmp10652, i64 1
+ %tmp10654 = getelementptr inbounds float* %tmp10653, i64 1
+ %tmp10655 = getelementptr inbounds float* %tmp10654, i64 1
+ %tmp10656 = getelementptr inbounds float* %tmp10655, i64 1
+ %tmp10657 = getelementptr inbounds float* %tmp10656, i64 1
+ %tmp10658 = getelementptr inbounds float* %tmp10657, i64 1
+ %tmp10659 = getelementptr inbounds float* %tmp10658, i64 1
+ %tmp10660 = getelementptr inbounds float* %tmp10659, i64 1
+ %tmp10661 = getelementptr inbounds float* %tmp10660, i64 1
+ %tmp10662 = getelementptr inbounds float* %tmp10661, i64 1
+ %tmp10663 = getelementptr inbounds float* %tmp10662, i64 1
+ %tmp10664 = getelementptr inbounds float* %tmp10663, i64 1
+ %tmp10665 = getelementptr inbounds float* %tmp10664, i64 1
+ %tmp10666 = getelementptr inbounds float* %tmp10665, i64 1
+ %tmp10667 = getelementptr inbounds float* %tmp10666, i64 1
+ %tmp10668 = getelementptr inbounds float* %tmp10667, i64 1
+ %tmp10669 = getelementptr inbounds float* %tmp10668, i64 1
+ %tmp10670 = getelementptr inbounds float* %tmp10669, i64 1
+ %tmp10671 = getelementptr inbounds float* %tmp10670, i64 1
+ %tmp10672 = getelementptr inbounds float* %tmp10671, i64 1
+ %tmp10673 = getelementptr inbounds float* %tmp10672, i64 1
+ %tmp10674 = getelementptr inbounds float* %tmp10673, i64 1
+ %tmp10675 = getelementptr inbounds float* %tmp10674, i64 1
+ %tmp10676 = getelementptr inbounds float* %tmp10675, i64 1
+ %tmp10677 = getelementptr inbounds float* %tmp10676, i64 1
+ %tmp10678 = getelementptr inbounds float* %tmp10677, i64 1
+ %tmp10679 = getelementptr inbounds float* %tmp10678, i64 1
+ %tmp10680 = getelementptr inbounds float* %tmp10679, i64 1
+ %tmp10681 = getelementptr inbounds float* %tmp10680, i64 1
+ %tmp10682 = getelementptr inbounds float* %tmp10681, i64 1
+ %tmp10683 = getelementptr inbounds float* %tmp10682, i64 1
+ %tmp10684 = getelementptr inbounds float* %tmp10683, i64 1
+ %tmp10685 = getelementptr inbounds float* %tmp10684, i64 1
+ %tmp10686 = getelementptr inbounds float* %tmp10685, i64 1
+ %tmp10687 = getelementptr inbounds float* %tmp10686, i64 1
+ %tmp10688 = getelementptr inbounds float* %tmp10687, i64 1
+ %tmp10689 = getelementptr inbounds float* %tmp10688, i64 1
+ %tmp10690 = getelementptr inbounds float* %tmp10689, i64 1
+ %tmp10691 = getelementptr inbounds float* %tmp10690, i64 1
+ %tmp10692 = getelementptr inbounds float* %tmp10691, i64 1
+ %tmp10693 = getelementptr inbounds float* %tmp10692, i64 1
+ %tmp10694 = getelementptr inbounds float* %tmp10693, i64 1
+ %tmp10695 = getelementptr inbounds float* %tmp10694, i64 1
+ %tmp10696 = getelementptr inbounds float* %tmp10695, i64 1
+ %tmp10697 = getelementptr inbounds float* %tmp10696, i64 1
+ %tmp10698 = getelementptr inbounds float* %tmp10697, i64 1
+ %tmp10699 = getelementptr inbounds float* %tmp10698, i64 1
+ %tmp10700 = getelementptr inbounds float* %tmp10699, i64 1
+ %tmp10701 = getelementptr inbounds float* %tmp10700, i64 1
+ %tmp10702 = getelementptr inbounds float* %tmp10701, i64 1
+ %tmp10703 = getelementptr inbounds float* %tmp10702, i64 1
+ %tmp10704 = getelementptr inbounds float* %tmp10703, i64 1
+ %tmp10705 = getelementptr inbounds float* %tmp10704, i64 1
+ %tmp10706 = getelementptr inbounds float* %tmp10705, i64 1
+ %tmp10707 = getelementptr inbounds float* %tmp10706, i64 1
+ %tmp10708 = getelementptr inbounds float* %tmp10707, i64 1
+ %tmp10709 = getelementptr inbounds float* %tmp10708, i64 1
+ %tmp10710 = getelementptr inbounds float* %tmp10709, i64 1
+ %tmp10711 = getelementptr inbounds float* %tmp10710, i64 1
+ %tmp10712 = getelementptr inbounds float* %tmp10711, i64 1
+ %tmp10713 = getelementptr inbounds float* %tmp10712, i64 1
+ %tmp10714 = getelementptr inbounds float* %tmp10713, i64 1
+ %tmp10715 = getelementptr inbounds float* %tmp10714, i64 1
+ %tmp10716 = getelementptr inbounds float* %tmp10715, i64 1
+ %tmp10717 = getelementptr inbounds float* %tmp10716, i64 1
+ %tmp10718 = getelementptr inbounds float* %tmp10717, i64 1
+ %tmp10719 = getelementptr inbounds float* %tmp10718, i64 1
+ %tmp10720 = getelementptr inbounds float* %tmp10719, i64 1
+ %tmp10721 = getelementptr inbounds float* %tmp10720, i64 1
+ %tmp10722 = getelementptr inbounds float* %tmp10721, i64 1
+ %tmp10723 = getelementptr inbounds float* %tmp10722, i64 1
+ %tmp10724 = getelementptr inbounds float* %tmp10723, i64 1
+ %tmp10725 = getelementptr inbounds float* %tmp10724, i64 1
+ %tmp10726 = getelementptr inbounds float* %tmp10725, i64 1
+ %tmp10727 = getelementptr inbounds float* %tmp10726, i64 1
+ %tmp10728 = getelementptr inbounds float* %tmp10727, i64 1
+ %tmp10729 = getelementptr inbounds float* %tmp10728, i64 1
+ %tmp10730 = getelementptr inbounds float* %tmp10729, i64 1
+ %tmp10731 = getelementptr inbounds float* %tmp10730, i64 1
+ %tmp10732 = getelementptr inbounds float* %tmp10731, i64 1
+ %tmp10733 = getelementptr inbounds float* %tmp10732, i64 1
+ %tmp10734 = getelementptr inbounds float* %tmp10733, i64 1
+ %tmp10735 = getelementptr inbounds float* %tmp10734, i64 1
+ %tmp10736 = getelementptr inbounds float* %tmp10735, i64 1
+ %tmp10737 = getelementptr inbounds float* %tmp10736, i64 1
+ %tmp10738 = getelementptr inbounds float* %tmp10737, i64 1
+ %tmp10739 = getelementptr inbounds float* %tmp10738, i64 1
+ %tmp10740 = getelementptr inbounds float* %tmp10739, i64 1
+ %tmp10741 = getelementptr inbounds float* %tmp10740, i64 1
+ %tmp10742 = getelementptr inbounds float* %tmp10741, i64 1
+ %tmp10743 = getelementptr inbounds float* %tmp10742, i64 1
+ %tmp10744 = getelementptr inbounds float* %tmp10743, i64 1
+ %tmp10745 = getelementptr inbounds float* %tmp10744, i64 1
+ %tmp10746 = getelementptr inbounds float* %tmp10745, i64 1
+ %tmp10747 = getelementptr inbounds float* %tmp10746, i64 1
+ %tmp10748 = getelementptr inbounds float* %tmp10747, i64 1
+ %tmp10749 = getelementptr inbounds float* %tmp10748, i64 1
+ %tmp10750 = getelementptr inbounds float* %tmp10749, i64 1
+ %tmp10751 = getelementptr inbounds float* %tmp10750, i64 1
+ %tmp10752 = getelementptr inbounds float* %tmp10751, i64 1
+ %tmp10753 = getelementptr inbounds float* %tmp10752, i64 1
+ %tmp10754 = getelementptr inbounds float* %tmp10753, i64 1
+ %tmp10755 = getelementptr inbounds float* %tmp10754, i64 1
+ %tmp10756 = getelementptr inbounds float* %tmp10755, i64 1
+ %tmp10757 = getelementptr inbounds float* %tmp10756, i64 1
+ %tmp10758 = getelementptr inbounds float* %tmp10757, i64 1
+ %tmp10759 = getelementptr inbounds float* %tmp10758, i64 1
+ %tmp10760 = getelementptr inbounds float* %tmp10759, i64 1
+ %tmp10761 = getelementptr inbounds float* %tmp10760, i64 1
+ %tmp10762 = getelementptr inbounds float* %tmp10761, i64 1
+ %tmp10763 = getelementptr inbounds float* %tmp10762, i64 1
+ %tmp10764 = getelementptr inbounds float* %tmp10763, i64 1
+ %tmp10765 = getelementptr inbounds float* %tmp10764, i64 1
+ %tmp10766 = getelementptr inbounds float* %tmp10765, i64 1
+ %tmp10767 = getelementptr inbounds float* %tmp10766, i64 1
+ %tmp10768 = getelementptr inbounds float* %tmp10767, i64 1
+ %tmp10769 = getelementptr inbounds float* %tmp10768, i64 1
+ %tmp10770 = getelementptr inbounds float* %tmp10769, i64 1
+ %tmp10771 = getelementptr inbounds float* %tmp10770, i64 1
+ %tmp10772 = getelementptr inbounds float* %tmp10771, i64 1
+ %tmp10773 = getelementptr inbounds float* %tmp10772, i64 1
+ %tmp10774 = getelementptr inbounds float* %tmp10773, i64 1
+ %tmp10775 = getelementptr inbounds float* %tmp10774, i64 1
+ %tmp10776 = getelementptr inbounds float* %tmp10775, i64 1
+ %tmp10777 = getelementptr inbounds float* %tmp10776, i64 1
+ %tmp10778 = getelementptr inbounds float* %tmp10777, i64 1
+ %tmp10779 = getelementptr inbounds float* %tmp10778, i64 1
+ %tmp10780 = getelementptr inbounds float* %tmp10779, i64 1
+ %tmp10781 = getelementptr inbounds float* %tmp10780, i64 1
+ %tmp10782 = getelementptr inbounds float* %tmp10781, i64 1
+ %tmp10783 = getelementptr inbounds float* %tmp10782, i64 1
+ %tmp10784 = getelementptr inbounds float* %tmp10783, i64 1
+ %tmp10785 = getelementptr inbounds float* %tmp10784, i64 1
+ %tmp10786 = getelementptr inbounds float* %tmp10785, i64 1
+ %tmp10787 = getelementptr inbounds float* %tmp10786, i64 1
+ %tmp10788 = getelementptr inbounds float* %tmp10787, i64 1
+ %tmp10789 = getelementptr inbounds float* %tmp10788, i64 1
+ %tmp10790 = getelementptr inbounds float* %tmp10789, i64 1
+ %tmp10791 = getelementptr inbounds float* %tmp10790, i64 1
+ %tmp10792 = getelementptr inbounds float* %tmp10791, i64 1
+ %tmp10793 = getelementptr inbounds float* %tmp10792, i64 1
+ %tmp10794 = getelementptr inbounds float* %tmp10793, i64 1
+ %tmp10795 = getelementptr inbounds float* %tmp10794, i64 1
+ %tmp10796 = getelementptr inbounds float* %tmp10795, i64 1
+ %tmp10797 = getelementptr inbounds float* %tmp10796, i64 1
+ %tmp10798 = getelementptr inbounds float* %tmp10797, i64 1
+ %tmp10799 = getelementptr inbounds float* %tmp10798, i64 1
+ %tmp10800 = getelementptr inbounds float* %tmp10799, i64 1
+ %tmp10801 = getelementptr inbounds float* %tmp10800, i64 1
+ %tmp10802 = getelementptr inbounds float* %tmp10801, i64 1
+ %tmp10803 = getelementptr inbounds float* %tmp10802, i64 1
+ %tmp10804 = getelementptr inbounds float* %tmp10803, i64 1
+ %tmp10805 = getelementptr inbounds float* %tmp10804, i64 1
+ %tmp10806 = getelementptr inbounds float* %tmp10805, i64 1
+ %tmp10807 = getelementptr inbounds float* %tmp10806, i64 1
+ %tmp10808 = getelementptr inbounds float* %tmp10807, i64 1
+ %tmp10809 = getelementptr inbounds float* %tmp10808, i64 1
+ %tmp10810 = getelementptr inbounds float* %tmp10809, i64 1
+ %tmp10811 = getelementptr inbounds float* %tmp10810, i64 1
+ %tmp10812 = getelementptr inbounds float* %tmp10811, i64 1
+ %tmp10813 = getelementptr inbounds float* %tmp10812, i64 1
+ %tmp10814 = getelementptr inbounds float* %tmp10813, i64 1
+ %tmp10815 = getelementptr inbounds float* %tmp10814, i64 1
+ %tmp10816 = getelementptr inbounds float* %tmp10815, i64 1
+ %tmp10817 = getelementptr inbounds float* %tmp10816, i64 1
+ %tmp10818 = getelementptr inbounds float* %tmp10817, i64 1
+ %tmp10819 = getelementptr inbounds float* %tmp10818, i64 1
+ %tmp10820 = getelementptr inbounds float* %tmp10819, i64 1
+ %tmp10821 = getelementptr inbounds float* %tmp10820, i64 1
+ %tmp10822 = getelementptr inbounds float* %tmp10821, i64 1
+ %tmp10823 = getelementptr inbounds float* %tmp10822, i64 1
+ %tmp10824 = getelementptr inbounds float* %tmp10823, i64 1
+ %tmp10825 = getelementptr inbounds float* %tmp10824, i64 1
+ %tmp10826 = getelementptr inbounds float* %tmp10825, i64 1
+ %tmp10827 = getelementptr inbounds float* %tmp10826, i64 1
+ %tmp10828 = getelementptr inbounds float* %tmp10827, i64 1
+ %tmp10829 = getelementptr inbounds float* %tmp10828, i64 1
+ %tmp10830 = getelementptr inbounds float* %tmp10829, i64 1
+ %tmp10831 = getelementptr inbounds float* %tmp10830, i64 1
+ %tmp10832 = getelementptr inbounds float* %tmp10831, i64 1
+ %tmp10833 = getelementptr inbounds float* %tmp10832, i64 1
+ %tmp10834 = getelementptr inbounds float* %tmp10833, i64 1
+ %tmp10835 = getelementptr inbounds float* %tmp10834, i64 1
+ %tmp10836 = getelementptr inbounds float* %tmp10835, i64 1
+ %tmp10837 = getelementptr inbounds float* %tmp10836, i64 1
+ %tmp10838 = getelementptr inbounds float* %tmp10837, i64 1
+ %tmp10839 = getelementptr inbounds float* %tmp10838, i64 1
+ %tmp10840 = getelementptr inbounds float* %tmp10839, i64 1
+ %tmp10841 = getelementptr inbounds float* %tmp10840, i64 1
+ %tmp10842 = getelementptr inbounds float* %tmp10841, i64 1
+ %tmp10843 = getelementptr inbounds float* %tmp10842, i64 1
+ %tmp10844 = getelementptr inbounds float* %tmp10843, i64 1
+ %tmp10845 = getelementptr inbounds float* %tmp10844, i64 1
+ %tmp10846 = getelementptr inbounds float* %tmp10845, i64 1
+ %tmp10847 = getelementptr inbounds float* %tmp10846, i64 1
+ %tmp10848 = getelementptr inbounds float* %tmp10847, i64 1
+ %tmp10849 = getelementptr inbounds float* %tmp10848, i64 1
+ %tmp10850 = getelementptr inbounds float* %tmp10849, i64 1
+ %tmp10851 = getelementptr inbounds float* %tmp10850, i64 1
+ %tmp10852 = getelementptr inbounds float* %tmp10851, i64 1
+ %tmp10853 = getelementptr inbounds float* %tmp10852, i64 1
+ %tmp10854 = getelementptr inbounds float* %tmp10853, i64 1
+ %tmp10855 = getelementptr inbounds float* %tmp10854, i64 1
+ %tmp10856 = getelementptr inbounds float* %tmp10855, i64 1
+ %tmp10857 = getelementptr inbounds float* %tmp10856, i64 1
+ %tmp10858 = getelementptr inbounds float* %tmp10857, i64 1
+ %tmp10859 = getelementptr inbounds float* %tmp10858, i64 1
+ %tmp10860 = getelementptr inbounds float* %tmp10859, i64 1
+ %tmp10861 = getelementptr inbounds float* %tmp10860, i64 1
+ %tmp10862 = getelementptr inbounds float* %tmp10861, i64 1
+ %tmp10863 = getelementptr inbounds float* %tmp10862, i64 1
+ %tmp10864 = getelementptr inbounds float* %tmp10863, i64 1
+ %tmp10865 = getelementptr inbounds float* %tmp10864, i64 1
+ %tmp10866 = getelementptr inbounds float* %tmp10865, i64 1
+ %tmp10867 = getelementptr inbounds float* %tmp10866, i64 1
+ %tmp10868 = getelementptr inbounds float* %tmp10867, i64 1
+ %tmp10869 = getelementptr inbounds float* %tmp10868, i64 1
+ %tmp10870 = getelementptr inbounds float* %tmp10869, i64 1
+ %tmp10871 = getelementptr inbounds float* %tmp10870, i64 1
+ %tmp10872 = getelementptr inbounds float* %tmp10871, i64 1
+ %tmp10873 = getelementptr inbounds float* %tmp10872, i64 1
+ %tmp10874 = getelementptr inbounds float* %tmp10873, i64 1
+ %tmp10875 = getelementptr inbounds float* %tmp10874, i64 1
+ %tmp10876 = getelementptr inbounds float* %tmp10875, i64 1
+ %tmp10877 = getelementptr inbounds float* %tmp10876, i64 1
+ %tmp10878 = getelementptr inbounds float* %tmp10877, i64 1
+ %tmp10879 = getelementptr inbounds float* %tmp10878, i64 1
+ %tmp10880 = getelementptr inbounds float* %tmp10879, i64 1
+ %tmp10881 = getelementptr inbounds float* %tmp10880, i64 1
+ %tmp10882 = getelementptr inbounds float* %tmp10881, i64 1
+ %tmp10883 = getelementptr inbounds float* %tmp10882, i64 1
+ %tmp10884 = getelementptr inbounds float* %tmp10883, i64 1
+ %tmp10885 = getelementptr inbounds float* %tmp10884, i64 1
+ %tmp10886 = getelementptr inbounds float* %tmp10885, i64 1
+ %tmp10887 = getelementptr inbounds float* %tmp10886, i64 1
+ %tmp10888 = getelementptr inbounds float* %tmp10887, i64 1
+ %tmp10889 = getelementptr inbounds float* %tmp10888, i64 1
+ %tmp10890 = getelementptr inbounds float* %tmp10889, i64 1
+ %tmp10891 = getelementptr inbounds float* %tmp10890, i64 1
+ %tmp10892 = getelementptr inbounds float* %tmp10891, i64 1
+ %tmp10893 = getelementptr inbounds float* %tmp10892, i64 1
+ %tmp10894 = getelementptr inbounds float* %tmp10893, i64 1
+ %tmp10895 = getelementptr inbounds float* %tmp10894, i64 1
+ %tmp10896 = getelementptr inbounds float* %tmp10895, i64 1
+ %tmp10897 = getelementptr inbounds float* %tmp10896, i64 1
+ %tmp10898 = getelementptr inbounds float* %tmp10897, i64 1
+ %tmp10899 = getelementptr inbounds float* %tmp10898, i64 1
+ %tmp10900 = getelementptr inbounds float* %tmp10899, i64 1
+ %tmp10901 = getelementptr inbounds float* %tmp10900, i64 1
+ %tmp10902 = getelementptr inbounds float* %tmp10901, i64 1
+ %tmp10903 = getelementptr inbounds float* %tmp10902, i64 1
+ %tmp10904 = getelementptr inbounds float* %tmp10903, i64 1
+ %tmp10905 = getelementptr inbounds float* %tmp10904, i64 1
+ %tmp10906 = getelementptr inbounds float* %tmp10905, i64 1
+ %tmp10907 = getelementptr inbounds float* %tmp10906, i64 1
+ %tmp10908 = getelementptr inbounds float* %tmp10907, i64 1
+ %tmp10909 = getelementptr inbounds float* %tmp10908, i64 1
+ %tmp10910 = getelementptr inbounds float* %tmp10909, i64 1
+ %tmp10911 = getelementptr inbounds float* %tmp10910, i64 1
+ %tmp10912 = getelementptr inbounds float* %tmp10911, i64 1
+ %tmp10913 = getelementptr inbounds float* %tmp10912, i64 1
+ %tmp10914 = getelementptr inbounds float* %tmp10913, i64 1
+ %tmp10915 = getelementptr inbounds float* %tmp10914, i64 1
+ %tmp10916 = getelementptr inbounds float* %tmp10915, i64 1
+ %tmp10917 = getelementptr inbounds float* %tmp10916, i64 1
+ %tmp10918 = getelementptr inbounds float* %tmp10917, i64 1
+ %tmp10919 = getelementptr inbounds float* %tmp10918, i64 1
+ %tmp10920 = getelementptr inbounds float* %tmp10919, i64 1
+ %tmp10921 = getelementptr inbounds float* %tmp10920, i64 1
+ %tmp10922 = getelementptr inbounds float* %tmp10921, i64 1
+ %tmp10923 = getelementptr inbounds float* %tmp10922, i64 1
+ %tmp10924 = getelementptr inbounds float* %tmp10923, i64 1
+ %tmp10925 = getelementptr inbounds float* %tmp10924, i64 1
+ %tmp10926 = getelementptr inbounds float* %tmp10925, i64 1
+ %tmp10927 = getelementptr inbounds float* %tmp10926, i64 1
+ %tmp10928 = getelementptr inbounds float* %tmp10927, i64 1
+ %tmp10929 = getelementptr inbounds float* %tmp10928, i64 1
+ %tmp10930 = getelementptr inbounds float* %tmp10929, i64 1
+ %tmp10931 = getelementptr inbounds float* %tmp10930, i64 1
+ %tmp10932 = getelementptr inbounds float* %tmp10931, i64 1
+ %tmp10933 = getelementptr inbounds float* %tmp10932, i64 1
+ %tmp10934 = getelementptr inbounds float* %tmp10933, i64 1
+ %tmp10935 = getelementptr inbounds float* %tmp10934, i64 1
+ %tmp10936 = getelementptr inbounds float* %tmp10935, i64 1
+ %tmp10937 = getelementptr inbounds float* %tmp10936, i64 1
+ %tmp10938 = getelementptr inbounds float* %tmp10937, i64 1
+ %tmp10939 = getelementptr inbounds float* %tmp10938, i64 1
+ %tmp10940 = getelementptr inbounds float* %tmp10939, i64 1
+ %tmp10941 = getelementptr inbounds float* %tmp10940, i64 1
+ %tmp10942 = getelementptr inbounds float* %tmp10941, i64 1
+ %tmp10943 = getelementptr inbounds float* %tmp10942, i64 1
+ %tmp10944 = getelementptr inbounds float* %tmp10943, i64 1
+ %tmp10945 = getelementptr inbounds float* %tmp10944, i64 1
+ %tmp10946 = getelementptr inbounds float* %tmp10945, i64 1
+ %tmp10947 = getelementptr inbounds float* %tmp10946, i64 1
+ %tmp10948 = getelementptr inbounds float* %tmp10947, i64 1
+ %tmp10949 = getelementptr inbounds float* %tmp10948, i64 1
+ %tmp10950 = getelementptr inbounds float* %tmp10949, i64 1
+ %tmp10951 = getelementptr inbounds float* %tmp10950, i64 1
+ %tmp10952 = getelementptr inbounds float* %tmp10951, i64 1
+ %tmp10953 = getelementptr inbounds float* %tmp10952, i64 1
+ %tmp10954 = getelementptr inbounds float* %tmp10953, i64 1
+ %tmp10955 = getelementptr inbounds float* %tmp10954, i64 1
+ %tmp10956 = getelementptr inbounds float* %tmp10955, i64 1
+ %tmp10957 = getelementptr inbounds float* %tmp10956, i64 1
+ %tmp10958 = getelementptr inbounds float* %tmp10957, i64 1
+ %tmp10959 = getelementptr inbounds float* %tmp10958, i64 1
+ %tmp10960 = getelementptr inbounds float* %tmp10959, i64 1
+ %tmp10961 = getelementptr inbounds float* %tmp10960, i64 1
+ %tmp10962 = getelementptr inbounds float* %tmp10961, i64 1
+ %tmp10963 = getelementptr inbounds float* %tmp10962, i64 1
+ %tmp10964 = getelementptr inbounds float* %tmp10963, i64 1
+ %tmp10965 = getelementptr inbounds float* %tmp10964, i64 1
+ %tmp10966 = getelementptr inbounds float* %tmp10965, i64 1
+ %tmp10967 = getelementptr inbounds float* %tmp10966, i64 1
+ %tmp10968 = getelementptr inbounds float* %tmp10967, i64 1
+ %tmp10969 = getelementptr inbounds float* %tmp10968, i64 1
+ %tmp10970 = getelementptr inbounds float* %tmp10969, i64 1
+ %tmp10971 = getelementptr inbounds float* %tmp10970, i64 1
+ %tmp10972 = getelementptr inbounds float* %tmp10971, i64 1
+ %tmp10973 = getelementptr inbounds float* %tmp10972, i64 1
+ %tmp10974 = getelementptr inbounds float* %tmp10973, i64 1
+ %tmp10975 = getelementptr inbounds float* %tmp10974, i64 1
+ %tmp10976 = getelementptr inbounds float* %tmp10975, i64 1
+ %tmp10977 = getelementptr inbounds float* %tmp10976, i64 1
+ %tmp10978 = getelementptr inbounds float* %tmp10977, i64 1
+ %tmp10979 = getelementptr inbounds float* %tmp10978, i64 1
+ %tmp10980 = getelementptr inbounds float* %tmp10979, i64 1
+ %tmp10981 = getelementptr inbounds float* %tmp10980, i64 1
+ %tmp10982 = getelementptr inbounds float* %tmp10981, i64 1
+ %tmp10983 = getelementptr inbounds float* %tmp10982, i64 1
+ %tmp10984 = getelementptr inbounds float* %tmp10983, i64 1
+ %tmp10985 = getelementptr inbounds float* %tmp10984, i64 1
+ %tmp10986 = getelementptr inbounds float* %tmp10985, i64 1
+ %tmp10987 = getelementptr inbounds float* %tmp10986, i64 1
+ %tmp10988 = getelementptr inbounds float* %tmp10987, i64 1
+ %tmp10989 = getelementptr inbounds float* %tmp10988, i64 1
+ %tmp10990 = getelementptr inbounds float* %tmp10989, i64 1
+ %tmp10991 = getelementptr inbounds float* %tmp10990, i64 1
+ %tmp10992 = getelementptr inbounds float* %tmp10991, i64 1
+ %tmp10993 = getelementptr inbounds float* %tmp10992, i64 1
+ %tmp10994 = getelementptr inbounds float* %tmp10993, i64 1
+ %tmp10995 = getelementptr inbounds float* %tmp10994, i64 1
+ %tmp10996 = getelementptr inbounds float* %tmp10995, i64 1
+ %tmp10997 = getelementptr inbounds float* %tmp10996, i64 1
+ %tmp10998 = getelementptr inbounds float* %tmp10997, i64 1
+ %tmp10999 = getelementptr inbounds float* %tmp10998, i64 1
+ %tmp11000 = getelementptr inbounds float* %tmp10999, i64 1
+ %tmp11001 = getelementptr inbounds float* %tmp11000, i64 1
+ %tmp11002 = getelementptr inbounds float* %tmp11001, i64 1
+ %tmp11003 = getelementptr inbounds float* %tmp11002, i64 1
+ %tmp11004 = getelementptr inbounds float* %tmp11003, i64 1
+ %tmp11005 = getelementptr inbounds float* %tmp11004, i64 1
+ %tmp11006 = getelementptr inbounds float* %tmp11005, i64 1
+ %tmp11007 = getelementptr inbounds float* %tmp11006, i64 1
+ %tmp11008 = getelementptr inbounds float* %tmp11007, i64 1
+ %tmp11009 = getelementptr inbounds float* %tmp11008, i64 1
+ %tmp11010 = getelementptr inbounds float* %tmp11009, i64 1
+ %tmp11011 = getelementptr inbounds float* %tmp11010, i64 1
+ %tmp11012 = getelementptr inbounds float* %tmp11011, i64 1
+ %tmp11013 = getelementptr inbounds float* %tmp11012, i64 1
+ %tmp11014 = getelementptr inbounds float* %tmp11013, i64 1
+ %tmp11015 = getelementptr inbounds float* %tmp11014, i64 1
+ %tmp11016 = getelementptr inbounds float* %tmp11015, i64 1
+ %tmp11017 = getelementptr inbounds float* %tmp11016, i64 1
+ %tmp11018 = getelementptr inbounds float* %tmp11017, i64 1
+ %tmp11019 = getelementptr inbounds float* %tmp11018, i64 1
+ %tmp11020 = getelementptr inbounds float* %tmp11019, i64 1
+ %tmp11021 = getelementptr inbounds float* %tmp11020, i64 1
+ %tmp11022 = getelementptr inbounds float* %tmp11021, i64 1
+ %tmp11023 = getelementptr inbounds float* %tmp11022, i64 1
+ %tmp11024 = getelementptr inbounds float* %tmp11023, i64 1
+ %tmp11025 = getelementptr inbounds float* %tmp11024, i64 1
+ %tmp11026 = getelementptr inbounds float* %tmp11025, i64 1
+ %tmp11027 = getelementptr inbounds float* %tmp11026, i64 1
+ %tmp11028 = getelementptr inbounds float* %tmp11027, i64 1
+ %tmp11029 = getelementptr inbounds float* %tmp11028, i64 1
+ %tmp11030 = getelementptr inbounds float* %tmp11029, i64 1
+ %tmp11031 = getelementptr inbounds float* %tmp11030, i64 1
+ %tmp11032 = getelementptr inbounds float* %tmp11031, i64 1
+ %tmp11033 = getelementptr inbounds float* %tmp11032, i64 1
+ %tmp11034 = getelementptr inbounds float* %tmp11033, i64 1
+ %tmp11035 = getelementptr inbounds float* %tmp11034, i64 1
+ %tmp11036 = getelementptr inbounds float* %tmp11035, i64 1
+ %tmp11037 = getelementptr inbounds float* %tmp11036, i64 1
+ %tmp11038 = getelementptr inbounds float* %tmp11037, i64 1
+ %tmp11039 = getelementptr inbounds float* %tmp11038, i64 1
+ %tmp11040 = getelementptr inbounds float* %tmp11039, i64 1
+ %tmp11041 = getelementptr inbounds float* %tmp11040, i64 1
+ %tmp11042 = getelementptr inbounds float* %tmp11041, i64 1
+ %tmp11043 = getelementptr inbounds float* %tmp11042, i64 1
+ %tmp11044 = getelementptr inbounds float* %tmp11043, i64 1
+ %tmp11045 = getelementptr inbounds float* %tmp11044, i64 1
+ %tmp11046 = getelementptr inbounds float* %tmp11045, i64 1
+ %tmp11047 = getelementptr inbounds float* %tmp11046, i64 1
+ %tmp11048 = getelementptr inbounds float* %tmp11047, i64 1
+ %tmp11049 = getelementptr inbounds float* %tmp11048, i64 1
+ %tmp11050 = getelementptr inbounds float* %tmp11049, i64 1
+ %tmp11051 = getelementptr inbounds float* %tmp11050, i64 1
+ %tmp11052 = getelementptr inbounds float* %tmp11051, i64 1
+ %tmp11053 = getelementptr inbounds float* %tmp11052, i64 1
+ %tmp11054 = getelementptr inbounds float* %tmp11053, i64 1
+ %tmp11055 = getelementptr inbounds float* %tmp11054, i64 1
+ %tmp11056 = getelementptr inbounds float* %tmp11055, i64 1
+ %tmp11057 = getelementptr inbounds float* %tmp11056, i64 1
+ %tmp11058 = getelementptr inbounds float* %tmp11057, i64 1
+ %tmp11059 = getelementptr inbounds float* %tmp11058, i64 1
+ %tmp11060 = getelementptr inbounds float* %tmp11059, i64 1
+ %tmp11061 = getelementptr inbounds float* %tmp11060, i64 1
+ %tmp11062 = getelementptr inbounds float* %tmp11061, i64 1
+ %tmp11063 = getelementptr inbounds float* %tmp11062, i64 1
+ %tmp11064 = getelementptr inbounds float* %tmp11063, i64 1
+ %tmp11065 = getelementptr inbounds float* %tmp11064, i64 1
+ %tmp11066 = getelementptr inbounds float* %tmp11065, i64 1
+ %tmp11067 = getelementptr inbounds float* %tmp11066, i64 1
+ %tmp11068 = getelementptr inbounds float* %tmp11067, i64 1
+ %tmp11069 = getelementptr inbounds float* %tmp11068, i64 1
+ %tmp11070 = getelementptr inbounds float* %tmp11069, i64 1
+ %tmp11071 = getelementptr inbounds float* %tmp11070, i64 1
+ %tmp11072 = getelementptr inbounds float* %tmp11071, i64 1
+ %tmp11073 = getelementptr inbounds float* %tmp11072, i64 1
+ %tmp11074 = getelementptr inbounds float* %tmp11073, i64 1
+ %tmp11075 = getelementptr inbounds float* %tmp11074, i64 1
+ %tmp11076 = getelementptr inbounds float* %tmp11075, i64 1
+ %tmp11077 = getelementptr inbounds float* %tmp11076, i64 1
+ %tmp11078 = getelementptr inbounds float* %tmp11077, i64 1
+ %tmp11079 = getelementptr inbounds float* %tmp11078, i64 1
+ %tmp11080 = getelementptr inbounds float* %tmp11079, i64 1
+ %tmp11081 = getelementptr inbounds float* %tmp11080, i64 1
+ %tmp11082 = getelementptr inbounds float* %tmp11081, i64 1
+ %tmp11083 = getelementptr inbounds float* %tmp11082, i64 1
+ %tmp11084 = getelementptr inbounds float* %tmp11083, i64 1
+ %tmp11085 = getelementptr inbounds float* %tmp11084, i64 1
+ %tmp11086 = getelementptr inbounds float* %tmp11085, i64 1
+ %tmp11087 = getelementptr inbounds float* %tmp11086, i64 1
+ %tmp11088 = getelementptr inbounds float* %tmp11087, i64 1
+ %tmp11089 = getelementptr inbounds float* %tmp11088, i64 1
+ %tmp11090 = getelementptr inbounds float* %tmp11089, i64 1
+ %tmp11091 = getelementptr inbounds float* %tmp11090, i64 1
+ %tmp11092 = getelementptr inbounds float* %tmp11091, i64 1
+ %tmp11093 = getelementptr inbounds float* %tmp11092, i64 1
+ %tmp11094 = getelementptr inbounds float* %tmp11093, i64 1
+ %tmp11095 = getelementptr inbounds float* %tmp11094, i64 1
+ %tmp11096 = getelementptr inbounds float* %tmp11095, i64 1
+ %tmp11097 = getelementptr inbounds float* %tmp11096, i64 1
+ %tmp11098 = getelementptr inbounds float* %tmp11097, i64 1
+ %tmp11099 = getelementptr inbounds float* %tmp11098, i64 1
+ %tmp11100 = getelementptr inbounds float* %tmp11099, i64 1
+ %tmp11101 = getelementptr inbounds float* %tmp11100, i64 1
+ %tmp11102 = getelementptr inbounds float* %tmp11101, i64 1
+ %tmp11103 = getelementptr inbounds float* %tmp11102, i64 1
+ %tmp11104 = getelementptr inbounds float* %tmp11103, i64 1
+ %tmp11105 = getelementptr inbounds float* %tmp11104, i64 1
+ %tmp11106 = getelementptr inbounds float* %tmp11105, i64 1
+ %tmp11107 = getelementptr inbounds float* %tmp11106, i64 1
+ %tmp11108 = getelementptr inbounds float* %tmp11107, i64 1
+ %tmp11109 = getelementptr inbounds float* %tmp11108, i64 1
+ %tmp11110 = getelementptr inbounds float* %tmp11109, i64 1
+ %tmp11111 = getelementptr inbounds float* %tmp11110, i64 1
+ %tmp11112 = getelementptr inbounds float* %tmp11111, i64 1
+ %tmp11113 = getelementptr inbounds float* %tmp11112, i64 1
+ %tmp11114 = getelementptr inbounds float* %tmp11113, i64 1
+ %tmp11115 = getelementptr inbounds float* %tmp11114, i64 1
+ %tmp11116 = getelementptr inbounds float* %tmp11115, i64 1
+ %tmp11117 = getelementptr inbounds float* %tmp11116, i64 1
+ %tmp11118 = getelementptr inbounds float* %tmp11117, i64 1
+ %tmp11119 = getelementptr inbounds float* %tmp11118, i64 1
+ %tmp11120 = getelementptr inbounds float* %tmp11119, i64 1
+ %tmp11121 = getelementptr inbounds float* %tmp11120, i64 1
+ %tmp11122 = getelementptr inbounds float* %tmp11121, i64 1
+ %tmp11123 = getelementptr inbounds float* %tmp11122, i64 1
+ %tmp11124 = getelementptr inbounds float* %tmp11123, i64 1
+ %tmp11125 = getelementptr inbounds float* %tmp11124, i64 1
+ %tmp11126 = getelementptr inbounds float* %tmp11125, i64 1
+ %tmp11127 = getelementptr inbounds float* %tmp11126, i64 1
+ %tmp11128 = getelementptr inbounds float* %tmp11127, i64 1
+ %tmp11129 = getelementptr inbounds float* %tmp11128, i64 1
+ %tmp11130 = getelementptr inbounds float* %tmp11129, i64 1
+ %tmp11131 = getelementptr inbounds float* %tmp11130, i64 1
+ %tmp11132 = getelementptr inbounds float* %tmp11131, i64 1
+ %tmp11133 = getelementptr inbounds float* %tmp11132, i64 1
+ %tmp11134 = getelementptr inbounds float* %tmp11133, i64 1
+ %tmp11135 = getelementptr inbounds float* %tmp11134, i64 1
+ %tmp11136 = getelementptr inbounds float* %tmp11135, i64 1
+ %tmp11137 = getelementptr inbounds float* %tmp11136, i64 1
+ %tmp11138 = getelementptr inbounds float* %tmp11137, i64 1
+ %tmp11139 = getelementptr inbounds float* %tmp11138, i64 1
+ %tmp11140 = getelementptr inbounds float* %tmp11139, i64 1
+ %tmp11141 = getelementptr inbounds float* %tmp11140, i64 1
+ %tmp11142 = getelementptr inbounds float* %tmp11141, i64 1
+ %tmp11143 = getelementptr inbounds float* %tmp11142, i64 1
+ %tmp11144 = getelementptr inbounds float* %tmp11143, i64 1
+ %tmp11145 = getelementptr inbounds float* %tmp11144, i64 1
+ %tmp11146 = getelementptr inbounds float* %tmp11145, i64 1
+ %tmp11147 = getelementptr inbounds float* %tmp11146, i64 1
+ %tmp11148 = getelementptr inbounds float* %tmp11147, i64 1
+ %tmp11149 = getelementptr inbounds float* %tmp11148, i64 1
+ %tmp11150 = getelementptr inbounds float* %tmp11149, i64 1
+ %tmp11151 = getelementptr inbounds float* %tmp11150, i64 1
+ %tmp11152 = getelementptr inbounds float* %tmp11151, i64 1
+ %tmp11153 = getelementptr inbounds float* %tmp11152, i64 1
+ %tmp11154 = getelementptr inbounds float* %tmp11153, i64 1
+ %tmp11155 = getelementptr inbounds float* %tmp11154, i64 1
+ %tmp11156 = getelementptr inbounds float* %tmp11155, i64 1
+ %tmp11157 = getelementptr inbounds float* %tmp11156, i64 1
+ %tmp11158 = getelementptr inbounds float* %tmp11157, i64 1
+ %tmp11159 = getelementptr inbounds float* %tmp11158, i64 1
+ %tmp11160 = getelementptr inbounds float* %tmp11159, i64 1
+ %tmp11161 = getelementptr inbounds float* %tmp11160, i64 1
+ %tmp11162 = getelementptr inbounds float* %tmp11161, i64 1
+ %tmp11163 = getelementptr inbounds float* %tmp11162, i64 1
+ %tmp11164 = getelementptr inbounds float* %tmp11163, i64 1
+ %tmp11165 = getelementptr inbounds float* %tmp11164, i64 1
+ %tmp11166 = getelementptr inbounds float* %tmp11165, i64 1
+ %tmp11167 = getelementptr inbounds float* %tmp11166, i64 1
+ %tmp11168 = getelementptr inbounds float* %tmp11167, i64 1
+ %tmp11169 = getelementptr inbounds float* %tmp11168, i64 1
+ %tmp11170 = getelementptr inbounds float* %tmp11169, i64 1
+ %tmp11171 = getelementptr inbounds float* %tmp11170, i64 1
+ %tmp11172 = getelementptr inbounds float* %tmp11171, i64 1
+ %tmp11173 = getelementptr inbounds float* %tmp11172, i64 1
+ %tmp11174 = getelementptr inbounds float* %tmp11173, i64 1
+ %tmp11175 = getelementptr inbounds float* %tmp11174, i64 1
+ %tmp11176 = getelementptr inbounds float* %tmp11175, i64 1
+ %tmp11177 = getelementptr inbounds float* %tmp11176, i64 1
+ %tmp11178 = getelementptr inbounds float* %tmp11177, i64 1
+ %tmp11179 = getelementptr inbounds float* %tmp11178, i64 1
+ %tmp11180 = getelementptr inbounds float* %tmp11179, i64 1
+ %tmp11181 = getelementptr inbounds float* %tmp11180, i64 1
+ %tmp11182 = getelementptr inbounds float* %tmp11181, i64 1
+ %tmp11183 = getelementptr inbounds float* %tmp11182, i64 1
+ %tmp11184 = getelementptr inbounds float* %tmp11183, i64 1
+ %tmp11185 = getelementptr inbounds float* %tmp11184, i64 1
+ %tmp11186 = getelementptr inbounds float* %tmp11185, i64 1
+ %tmp11187 = getelementptr inbounds float* %tmp11186, i64 1
+ %tmp11188 = getelementptr inbounds float* %tmp11187, i64 1
+ %tmp11189 = getelementptr inbounds float* %tmp11188, i64 1
+ %tmp11190 = getelementptr inbounds float* %tmp11189, i64 1
+ %tmp11191 = getelementptr inbounds float* %tmp11190, i64 1
+ %tmp11192 = getelementptr inbounds float* %tmp11191, i64 1
+ %tmp11193 = getelementptr inbounds float* %tmp11192, i64 1
+ %tmp11194 = getelementptr inbounds float* %tmp11193, i64 1
+ %tmp11195 = getelementptr inbounds float* %tmp11194, i64 1
+ %tmp11196 = getelementptr inbounds float* %tmp11195, i64 1
+ %tmp11197 = getelementptr inbounds float* %tmp11196, i64 1
+ %tmp11198 = getelementptr inbounds float* %tmp11197, i64 1
+ %tmp11199 = getelementptr inbounds float* %tmp11198, i64 1
+ %tmp11200 = getelementptr inbounds float* %tmp11199, i64 1
+ %tmp11201 = getelementptr inbounds float* %tmp11200, i64 1
+ %tmp11202 = getelementptr inbounds float* %tmp11201, i64 1
+ %tmp11203 = getelementptr inbounds float* %tmp11202, i64 1
+ %tmp11204 = getelementptr inbounds float* %tmp11203, i64 1
+ %tmp11205 = getelementptr inbounds float* %tmp11204, i64 1
+ %tmp11206 = getelementptr inbounds float* %tmp11205, i64 1
+ %tmp11207 = getelementptr inbounds float* %tmp11206, i64 1
+ %tmp11208 = getelementptr inbounds float* %tmp11207, i64 1
+ %tmp11209 = getelementptr inbounds float* %tmp11208, i64 1
+ %tmp11210 = getelementptr inbounds float* %tmp11209, i64 1
+ %tmp11211 = getelementptr inbounds float* %tmp11210, i64 1
+ %tmp11212 = getelementptr inbounds float* %tmp11211, i64 1
+ %tmp11213 = getelementptr inbounds float* %tmp11212, i64 1
+ %tmp11214 = getelementptr inbounds float* %tmp11213, i64 1
+ %tmp11215 = getelementptr inbounds float* %tmp11214, i64 1
+ %tmp11216 = getelementptr inbounds float* %tmp11215, i64 1
+ %tmp11217 = getelementptr inbounds float* %tmp11216, i64 1
+ %tmp11218 = getelementptr inbounds float* %tmp11217, i64 1
+ %tmp11219 = getelementptr inbounds float* %tmp11218, i64 1
+ %tmp11220 = getelementptr inbounds float* %tmp11219, i64 1
+ %tmp11221 = getelementptr inbounds float* %tmp11220, i64 1
+ %tmp11222 = getelementptr inbounds float* %tmp11221, i64 1
+ %tmp11223 = getelementptr inbounds float* %tmp11222, i64 1
+ %tmp11224 = getelementptr inbounds float* %tmp11223, i64 1
+ %tmp11225 = getelementptr inbounds float* %tmp11224, i64 1
+ %tmp11226 = getelementptr inbounds float* %tmp11225, i64 1
+ %tmp11227 = getelementptr inbounds float* %tmp11226, i64 1
+ %tmp11228 = getelementptr inbounds float* %tmp11227, i64 1
+ %tmp11229 = getelementptr inbounds float* %tmp11228, i64 1
+ %tmp11230 = getelementptr inbounds float* %tmp11229, i64 1
+ %tmp11231 = getelementptr inbounds float* %tmp11230, i64 1
+ %tmp11232 = getelementptr inbounds float* %tmp11231, i64 1
+ %tmp11233 = getelementptr inbounds float* %tmp11232, i64 1
+ %tmp11234 = getelementptr inbounds float* %tmp11233, i64 1
+ %tmp11235 = getelementptr inbounds float* %tmp11234, i64 1
+ %tmp11236 = getelementptr inbounds float* %tmp11235, i64 1
+ %tmp11237 = getelementptr inbounds float* %tmp11236, i64 1
+ %tmp11238 = getelementptr inbounds float* %tmp11237, i64 1
+ %tmp11239 = getelementptr inbounds float* %tmp11238, i64 1
+ %tmp11240 = getelementptr inbounds float* %tmp11239, i64 1
+ %tmp11241 = getelementptr inbounds float* %tmp11240, i64 1
+ %tmp11242 = getelementptr inbounds float* %tmp11241, i64 1
+ %tmp11243 = getelementptr inbounds float* %tmp11242, i64 1
+ %tmp11244 = getelementptr inbounds float* %tmp11243, i64 1
+ %tmp11245 = getelementptr inbounds float* %tmp11244, i64 1
+ %tmp11246 = getelementptr inbounds float* %tmp11245, i64 1
+ %tmp11247 = getelementptr inbounds float* %tmp11246, i64 1
+ %tmp11248 = getelementptr inbounds float* %tmp11247, i64 1
+ %tmp11249 = getelementptr inbounds float* %tmp11248, i64 1
+ %tmp11250 = getelementptr inbounds float* %tmp11249, i64 1
+ %tmp11251 = getelementptr inbounds float* %tmp11250, i64 1
+ %tmp11252 = getelementptr inbounds float* %tmp11251, i64 1
+ %tmp11253 = getelementptr inbounds float* %tmp11252, i64 1
+ %tmp11254 = getelementptr inbounds float* %tmp11253, i64 1
+ %tmp11255 = getelementptr inbounds float* %tmp11254, i64 1
+ %tmp11256 = getelementptr inbounds float* %tmp11255, i64 1
+ %tmp11257 = getelementptr inbounds float* %tmp11256, i64 1
+ %tmp11258 = getelementptr inbounds float* %tmp11257, i64 1
+ %tmp11259 = getelementptr inbounds float* %tmp11258, i64 1
+ %tmp11260 = getelementptr inbounds float* %tmp11259, i64 1
+ %tmp11261 = getelementptr inbounds float* %tmp11260, i64 1
+ %tmp11262 = getelementptr inbounds float* %tmp11261, i64 1
+ %tmp11263 = getelementptr inbounds float* %tmp11262, i64 1
+ %tmp11264 = getelementptr inbounds float* %tmp11263, i64 1
+ %tmp11265 = getelementptr inbounds float* %tmp11264, i64 1
+ %tmp11266 = getelementptr inbounds float* %tmp11265, i64 1
+ %tmp11267 = getelementptr inbounds float* %tmp11266, i64 1
+ %tmp11268 = getelementptr inbounds float* %tmp11267, i64 1
+ %tmp11269 = getelementptr inbounds float* %tmp11268, i64 1
+ %tmp11270 = getelementptr inbounds float* %tmp11269, i64 1
+ %tmp11271 = getelementptr inbounds float* %tmp11270, i64 1
+ %tmp11272 = getelementptr inbounds float* %tmp11271, i64 1
+ %tmp11273 = getelementptr inbounds float* %tmp11272, i64 1
+ %tmp11274 = getelementptr inbounds float* %tmp11273, i64 1
+ %tmp11275 = getelementptr inbounds float* %tmp11274, i64 1
+ %tmp11276 = getelementptr inbounds float* %tmp11275, i64 1
+ %tmp11277 = getelementptr inbounds float* %tmp11276, i64 1
+ %tmp11278 = getelementptr inbounds float* %tmp11277, i64 1
+ %tmp11279 = getelementptr inbounds float* %tmp11278, i64 1
+ %tmp11280 = getelementptr inbounds float* %tmp11279, i64 1
+ %tmp11281 = getelementptr inbounds float* %tmp11280, i64 1
+ %tmp11282 = getelementptr inbounds float* %tmp11281, i64 1
+ %tmp11283 = getelementptr inbounds float* %tmp11282, i64 1
+ %tmp11284 = getelementptr inbounds float* %tmp11283, i64 1
+ %tmp11285 = getelementptr inbounds float* %tmp11284, i64 1
+ %tmp11286 = getelementptr inbounds float* %tmp11285, i64 1
+ %tmp11287 = getelementptr inbounds float* %tmp11286, i64 1
+ %tmp11288 = getelementptr inbounds float* %tmp11287, i64 1
+ %tmp11289 = getelementptr inbounds float* %tmp11288, i64 1
+ %tmp11290 = getelementptr inbounds float* %tmp11289, i64 1
+ %tmp11291 = getelementptr inbounds float* %tmp11290, i64 1
+ %tmp11292 = getelementptr inbounds float* %tmp11291, i64 1
+ %tmp11293 = getelementptr inbounds float* %tmp11292, i64 1
+ %tmp11294 = getelementptr inbounds float* %tmp11293, i64 1
+ %tmp11295 = getelementptr inbounds float* %tmp11294, i64 1
+ %tmp11296 = getelementptr inbounds float* %tmp11295, i64 1
+ %tmp11297 = getelementptr inbounds float* %tmp11296, i64 1
+ %tmp11298 = getelementptr inbounds float* %tmp11297, i64 1
+ %tmp11299 = getelementptr inbounds float* %tmp11298, i64 1
+ %tmp11300 = getelementptr inbounds float* %tmp11299, i64 1
+ %tmp11301 = getelementptr inbounds float* %tmp11300, i64 1
+ %tmp11302 = getelementptr inbounds float* %tmp11301, i64 1
+ %tmp11303 = getelementptr inbounds float* %tmp11302, i64 1
+ %tmp11304 = getelementptr inbounds float* %tmp11303, i64 1
+ %tmp11305 = getelementptr inbounds float* %tmp11304, i64 1
+ %tmp11306 = getelementptr inbounds float* %tmp11305, i64 1
+ %tmp11307 = getelementptr inbounds float* %tmp11306, i64 1
+ %tmp11308 = getelementptr inbounds float* %tmp11307, i64 1
+ %tmp11309 = getelementptr inbounds float* %tmp11308, i64 1
+ %tmp11310 = getelementptr inbounds float* %tmp11309, i64 1
+ %tmp11311 = getelementptr inbounds float* %tmp11310, i64 1
+ %tmp11312 = getelementptr inbounds float* %tmp11311, i64 1
+ %tmp11313 = getelementptr inbounds float* %tmp11312, i64 1
+ %tmp11314 = getelementptr inbounds float* %tmp11313, i64 1
+ %tmp11315 = getelementptr inbounds float* %tmp11314, i64 1
+ %tmp11316 = getelementptr inbounds float* %tmp11315, i64 1
+ %tmp11317 = getelementptr inbounds float* %tmp11316, i64 1
+ %tmp11318 = getelementptr inbounds float* %tmp11317, i64 1
+ %tmp11319 = getelementptr inbounds float* %tmp11318, i64 1
+ %tmp11320 = getelementptr inbounds float* %tmp11319, i64 1
+ %tmp11321 = getelementptr inbounds float* %tmp11320, i64 1
+ %tmp11322 = getelementptr inbounds float* %tmp11321, i64 1
+ %tmp11323 = getelementptr inbounds float* %tmp11322, i64 1
+ %tmp11324 = getelementptr inbounds float* %tmp11323, i64 1
+ %tmp11325 = getelementptr inbounds float* %tmp11324, i64 1
+ %tmp11326 = getelementptr inbounds float* %tmp11325, i64 1
+ %tmp11327 = getelementptr inbounds float* %tmp11326, i64 1
+ %tmp11328 = getelementptr inbounds float* %tmp11327, i64 1
+ %tmp11329 = getelementptr inbounds float* %tmp11328, i64 1
+ %tmp11330 = getelementptr inbounds float* %tmp11329, i64 1
+ %tmp11331 = getelementptr inbounds float* %tmp11330, i64 1
+ %tmp11332 = getelementptr inbounds float* %tmp11331, i64 1
+ %tmp11333 = getelementptr inbounds float* %tmp11332, i64 1
+ %tmp11334 = getelementptr inbounds float* %tmp11333, i64 1
+ %tmp11335 = getelementptr inbounds float* %tmp11334, i64 1
+ %tmp11336 = getelementptr inbounds float* %tmp11335, i64 1
+ %tmp11337 = getelementptr inbounds float* %tmp11336, i64 1
+ %tmp11338 = getelementptr inbounds float* %tmp11337, i64 1
+ %tmp11339 = getelementptr inbounds float* %tmp11338, i64 1
+ %tmp11340 = getelementptr inbounds float* %tmp11339, i64 1
+ %tmp11341 = getelementptr inbounds float* %tmp11340, i64 1
+ %tmp11342 = getelementptr inbounds float* %tmp11341, i64 1
+ %tmp11343 = getelementptr inbounds float* %tmp11342, i64 1
+ %tmp11344 = getelementptr inbounds float* %tmp11343, i64 1
+ %tmp11345 = getelementptr inbounds float* %tmp11344, i64 1
+ %tmp11346 = getelementptr inbounds float* %tmp11345, i64 1
+ %tmp11347 = getelementptr inbounds float* %tmp11346, i64 1
+ %tmp11348 = getelementptr inbounds float* %tmp11347, i64 1
+ %tmp11349 = getelementptr inbounds float* %tmp11348, i64 1
+ %tmp11350 = getelementptr inbounds float* %tmp11349, i64 1
+ %tmp11351 = getelementptr inbounds float* %tmp11350, i64 1
+ %tmp11352 = getelementptr inbounds float* %tmp11351, i64 1
+ %tmp11353 = getelementptr inbounds float* %tmp11352, i64 1
+ %tmp11354 = getelementptr inbounds float* %tmp11353, i64 1
+ %tmp11355 = getelementptr inbounds float* %tmp11354, i64 1
+ %tmp11356 = getelementptr inbounds float* %tmp11355, i64 1
+ %tmp11357 = getelementptr inbounds float* %tmp11356, i64 1
+ %tmp11358 = getelementptr inbounds float* %tmp11357, i64 1
+ %tmp11359 = getelementptr inbounds float* %tmp11358, i64 1
+ %tmp11360 = getelementptr inbounds float* %tmp11359, i64 1
+ %tmp11361 = getelementptr inbounds float* %tmp11360, i64 1
+ %tmp11362 = getelementptr inbounds float* %tmp11361, i64 1
+ %tmp11363 = getelementptr inbounds float* %tmp11362, i64 1
+ %tmp11364 = getelementptr inbounds float* %tmp11363, i64 1
+ %tmp11365 = getelementptr inbounds float* %tmp11364, i64 1
+ %tmp11366 = getelementptr inbounds float* %tmp11365, i64 1
+ %tmp11367 = getelementptr inbounds float* %tmp11366, i64 1
+ %tmp11368 = getelementptr inbounds float* %tmp11367, i64 1
+ %tmp11369 = getelementptr inbounds float* %tmp11368, i64 1
+ %tmp11370 = getelementptr inbounds float* %tmp11369, i64 1
+ %tmp11371 = getelementptr inbounds float* %tmp11370, i64 1
+ %tmp11372 = getelementptr inbounds float* %tmp11371, i64 1
+ %tmp11373 = getelementptr inbounds float* %tmp11372, i64 1
+ %tmp11374 = getelementptr inbounds float* %tmp11373, i64 1
+ %tmp11375 = getelementptr inbounds float* %tmp11374, i64 1
+ %tmp11376 = getelementptr inbounds float* %tmp11375, i64 1
+ %tmp11377 = getelementptr inbounds float* %tmp11376, i64 1
+ %tmp11378 = getelementptr inbounds float* %tmp11377, i64 1
+ %tmp11379 = getelementptr inbounds float* %tmp11378, i64 1
+ %tmp11380 = getelementptr inbounds float* %tmp11379, i64 1
+ %tmp11381 = getelementptr inbounds float* %tmp11380, i64 1
+ %tmp11382 = getelementptr inbounds float* %tmp11381, i64 1
+ %tmp11383 = getelementptr inbounds float* %tmp11382, i64 1
+ %tmp11384 = getelementptr inbounds float* %tmp11383, i64 1
+ %tmp11385 = getelementptr inbounds float* %tmp11384, i64 1
+ %tmp11386 = getelementptr inbounds float* %tmp11385, i64 1
+ %tmp11387 = getelementptr inbounds float* %tmp11386, i64 1
+ %tmp11388 = getelementptr inbounds float* %tmp11387, i64 1
+ %tmp11389 = getelementptr inbounds float* %tmp11388, i64 1
+ %tmp11390 = getelementptr inbounds float* %tmp11389, i64 1
+ %tmp11391 = getelementptr inbounds float* %tmp11390, i64 1
+ %tmp11392 = getelementptr inbounds float* %tmp11391, i64 1
+ %tmp11393 = getelementptr inbounds float* %tmp11392, i64 1
+ %tmp11394 = getelementptr inbounds float* %tmp11393, i64 1
+ %tmp11395 = getelementptr inbounds float* %tmp11394, i64 1
+ %tmp11396 = getelementptr inbounds float* %tmp11395, i64 1
+ %tmp11397 = getelementptr inbounds float* %tmp11396, i64 1
+ %tmp11398 = getelementptr inbounds float* %tmp11397, i64 1
+ %tmp11399 = getelementptr inbounds float* %tmp11398, i64 1
+ %tmp11400 = getelementptr inbounds float* %tmp11399, i64 1
+ %tmp11401 = getelementptr inbounds float* %tmp11400, i64 1
+ %tmp11402 = getelementptr inbounds float* %tmp11401, i64 1
+ %tmp11403 = getelementptr inbounds float* %tmp11402, i64 1
+ %tmp11404 = getelementptr inbounds float* %tmp11403, i64 1
+ %tmp11405 = getelementptr inbounds float* %tmp11404, i64 1
+ %tmp11406 = getelementptr inbounds float* %tmp11405, i64 1
+ %tmp11407 = getelementptr inbounds float* %tmp11406, i64 1
+ %tmp11408 = getelementptr inbounds float* %tmp11407, i64 1
+ %tmp11409 = getelementptr inbounds float* %tmp11408, i64 1
+ %tmp11410 = getelementptr inbounds float* %tmp11409, i64 1
+ %tmp11411 = getelementptr inbounds float* %tmp11410, i64 1
+ %tmp11412 = getelementptr inbounds float* %tmp11411, i64 1
+ %tmp11413 = getelementptr inbounds float* %tmp11412, i64 1
+ %tmp11414 = getelementptr inbounds float* %tmp11413, i64 1
+ %tmp11415 = getelementptr inbounds float* %tmp11414, i64 1
+ %tmp11416 = getelementptr inbounds float* %tmp11415, i64 1
+ %tmp11417 = getelementptr inbounds float* %tmp11416, i64 1
+ %tmp11418 = getelementptr inbounds float* %tmp11417, i64 1
+ %tmp11419 = getelementptr inbounds float* %tmp11418, i64 1
+ %tmp11420 = getelementptr inbounds float* %tmp11419, i64 1
+ %tmp11421 = getelementptr inbounds float* %tmp11420, i64 1
+ %tmp11422 = getelementptr inbounds float* %tmp11421, i64 1
+ %tmp11423 = getelementptr inbounds float* %tmp11422, i64 1
+ %tmp11424 = getelementptr inbounds float* %tmp11423, i64 1
+ %tmp11425 = getelementptr inbounds float* %tmp11424, i64 1
+ %tmp11426 = getelementptr inbounds float* %tmp11425, i64 1
+ %tmp11427 = getelementptr inbounds float* %tmp11426, i64 1
+ %tmp11428 = getelementptr inbounds float* %tmp11427, i64 1
+ %tmp11429 = getelementptr inbounds float* %tmp11428, i64 1
+ %tmp11430 = getelementptr inbounds float* %tmp11429, i64 1
+ %tmp11431 = getelementptr inbounds float* %tmp11430, i64 1
+ %tmp11432 = getelementptr inbounds float* %tmp11431, i64 1
+ %tmp11433 = getelementptr inbounds float* %tmp11432, i64 1
+ %tmp11434 = getelementptr inbounds float* %tmp11433, i64 1
+ %tmp11435 = getelementptr inbounds float* %tmp11434, i64 1
+ %tmp11436 = getelementptr inbounds float* %tmp11435, i64 1
+ %tmp11437 = getelementptr inbounds float* %tmp11436, i64 1
+ %tmp11438 = getelementptr inbounds float* %tmp11437, i64 1
+ %tmp11439 = getelementptr inbounds float* %tmp11438, i64 1
+ %tmp11440 = getelementptr inbounds float* %tmp11439, i64 1
+ %tmp11441 = getelementptr inbounds float* %tmp11440, i64 1
+ %tmp11442 = getelementptr inbounds float* %tmp11441, i64 1
+ %tmp11443 = getelementptr inbounds float* %tmp11442, i64 1
+ %tmp11444 = getelementptr inbounds float* %tmp11443, i64 1
+ %tmp11445 = getelementptr inbounds float* %tmp11444, i64 1
+ %tmp11446 = getelementptr inbounds float* %tmp11445, i64 1
+ %tmp11447 = getelementptr inbounds float* %tmp11446, i64 1
+ %tmp11448 = getelementptr inbounds float* %tmp11447, i64 1
+ %tmp11449 = getelementptr inbounds float* %tmp11448, i64 1
+ %tmp11450 = getelementptr inbounds float* %tmp11449, i64 1
+ %tmp11451 = getelementptr inbounds float* %tmp11450, i64 1
+ %tmp11452 = getelementptr inbounds float* %tmp11451, i64 1
+ %tmp11453 = getelementptr inbounds float* %tmp11452, i64 1
+ %tmp11454 = getelementptr inbounds float* %tmp11453, i64 1
+ %tmp11455 = getelementptr inbounds float* %tmp11454, i64 1
+ %tmp11456 = getelementptr inbounds float* %tmp11455, i64 1
+ %tmp11457 = getelementptr inbounds float* %tmp11456, i64 1
+ %tmp11458 = getelementptr inbounds float* %tmp11457, i64 1
+ %tmp11459 = getelementptr inbounds float* %tmp11458, i64 1
+ %tmp11460 = getelementptr inbounds float* %tmp11459, i64 1
+ %tmp11461 = getelementptr inbounds float* %tmp11460, i64 1
+ %tmp11462 = getelementptr inbounds float* %tmp11461, i64 1
+ %tmp11463 = getelementptr inbounds float* %tmp11462, i64 1
+ %tmp11464 = getelementptr inbounds float* %tmp11463, i64 1
+ %tmp11465 = getelementptr inbounds float* %tmp11464, i64 1
+ %tmp11466 = getelementptr inbounds float* %tmp11465, i64 1
+ %tmp11467 = getelementptr inbounds float* %tmp11466, i64 1
+ %tmp11468 = getelementptr inbounds float* %tmp11467, i64 1
+ %tmp11469 = getelementptr inbounds float* %tmp11468, i64 1
+ %tmp11470 = getelementptr inbounds float* %tmp11469, i64 1
+ %tmp11471 = getelementptr inbounds float* %tmp11470, i64 1
+ %tmp11472 = getelementptr inbounds float* %tmp11471, i64 1
+ %tmp11473 = getelementptr inbounds float* %tmp11472, i64 1
+ %tmp11474 = getelementptr inbounds float* %tmp11473, i64 1
+ %tmp11475 = getelementptr inbounds float* %tmp11474, i64 1
+ %tmp11476 = getelementptr inbounds float* %tmp11475, i64 1
+ %tmp11477 = getelementptr inbounds float* %tmp11476, i64 1
+ %tmp11478 = getelementptr inbounds float* %tmp11477, i64 1
+ %tmp11479 = getelementptr inbounds float* %tmp11478, i64 1
+ %tmp11480 = getelementptr inbounds float* %tmp11479, i64 1
+ %tmp11481 = getelementptr inbounds float* %tmp11480, i64 1
+ %tmp11482 = getelementptr inbounds float* %tmp11481, i64 1
+ %tmp11483 = getelementptr inbounds float* %tmp11482, i64 1
+ %tmp11484 = getelementptr inbounds float* %tmp11483, i64 1
+ %tmp11485 = getelementptr inbounds float* %tmp11484, i64 1
+ %tmp11486 = getelementptr inbounds float* %tmp11485, i64 1
+ %tmp11487 = getelementptr inbounds float* %tmp11486, i64 1
+ %tmp11488 = getelementptr inbounds float* %tmp11487, i64 1
+ %tmp11489 = getelementptr inbounds float* %tmp11488, i64 1
+ %tmp11490 = getelementptr inbounds float* %tmp11489, i64 1
+ %tmp11491 = getelementptr inbounds float* %tmp11490, i64 1
+ %tmp11492 = getelementptr inbounds float* %tmp11491, i64 1
+ %tmp11493 = getelementptr inbounds float* %tmp11492, i64 1
+ %tmp11494 = getelementptr inbounds float* %tmp11493, i64 1
+ %tmp11495 = getelementptr inbounds float* %tmp11494, i64 1
+ %tmp11496 = getelementptr inbounds float* %tmp11495, i64 1
+ %tmp11497 = getelementptr inbounds float* %tmp11496, i64 1
+ %tmp11498 = getelementptr inbounds float* %tmp11497, i64 1
+ %tmp11499 = getelementptr inbounds float* %tmp11498, i64 1
+ %tmp11500 = getelementptr inbounds float* %tmp11499, i64 1
+ %tmp11501 = getelementptr inbounds float* %tmp11500, i64 1
+ %tmp11502 = getelementptr inbounds float* %tmp11501, i64 1
+ %tmp11503 = getelementptr inbounds float* %tmp11502, i64 1
+ %tmp11504 = getelementptr inbounds float* %tmp11503, i64 1
+ %tmp11505 = getelementptr inbounds float* %tmp11504, i64 1
+ %tmp11506 = getelementptr inbounds float* %tmp11505, i64 1
+ %tmp11507 = getelementptr inbounds float* %tmp11506, i64 1
+ %tmp11508 = getelementptr inbounds float* %tmp11507, i64 1
+ %tmp11509 = getelementptr inbounds float* %tmp11508, i64 1
+ %tmp11510 = getelementptr inbounds float* %tmp11509, i64 1
+ %tmp11511 = getelementptr inbounds float* %tmp11510, i64 1
+ %tmp11512 = getelementptr inbounds float* %tmp11511, i64 1
+ %tmp11513 = getelementptr inbounds float* %tmp11512, i64 1
+ %tmp11514 = getelementptr inbounds float* %tmp11513, i64 1
+ %tmp11515 = getelementptr inbounds float* %tmp11514, i64 1
+ %tmp11516 = getelementptr inbounds float* %tmp11515, i64 1
+ %tmp11517 = getelementptr inbounds float* %tmp11516, i64 1
+ %tmp11518 = getelementptr inbounds float* %tmp11517, i64 1
+ %tmp11519 = getelementptr inbounds float* %tmp11518, i64 1
+ %tmp11520 = getelementptr inbounds float* %tmp11519, i64 1
+ %tmp11521 = getelementptr inbounds float* %tmp11520, i64 1
+ %tmp11522 = getelementptr inbounds float* %tmp11521, i64 1
+ %tmp11523 = getelementptr inbounds float* %tmp11522, i64 1
+ %tmp11524 = getelementptr inbounds float* %tmp11523, i64 1
+ %tmp11525 = getelementptr inbounds float* %tmp11524, i64 1
+ %tmp11526 = getelementptr inbounds float* %tmp11525, i64 1
+ %tmp11527 = getelementptr inbounds float* %tmp11526, i64 1
+ %tmp11528 = getelementptr inbounds float* %tmp11527, i64 1
+ %tmp11529 = getelementptr inbounds float* %tmp11528, i64 1
+ %tmp11530 = getelementptr inbounds float* %tmp11529, i64 1
+ %tmp11531 = getelementptr inbounds float* %tmp11530, i64 1
+ %tmp11532 = getelementptr inbounds float* %tmp11531, i64 1
+ %tmp11533 = getelementptr inbounds float* %tmp11532, i64 1
+ %tmp11534 = getelementptr inbounds float* %tmp11533, i64 1
+ %tmp11535 = getelementptr inbounds float* %tmp11534, i64 1
+ %tmp11536 = getelementptr inbounds float* %tmp11535, i64 1
+ %tmp11537 = getelementptr inbounds float* %tmp11536, i64 1
+ %tmp11538 = getelementptr inbounds float* %tmp11537, i64 1
+ %tmp11539 = getelementptr inbounds float* %tmp11538, i64 1
+ %tmp11540 = getelementptr inbounds float* %tmp11539, i64 1
+ %tmp11541 = getelementptr inbounds float* %tmp11540, i64 1
+ %tmp11542 = getelementptr inbounds float* %tmp11541, i64 1
+ %tmp11543 = getelementptr inbounds float* %tmp11542, i64 1
+ %tmp11544 = getelementptr inbounds float* %tmp11543, i64 1
+ %tmp11545 = getelementptr inbounds float* %tmp11544, i64 1
+ %tmp11546 = getelementptr inbounds float* %tmp11545, i64 1
+ %tmp11547 = getelementptr inbounds float* %tmp11546, i64 1
+ %tmp11548 = getelementptr inbounds float* %tmp11547, i64 1
+ %tmp11549 = getelementptr inbounds float* %tmp11548, i64 1
+ %tmp11550 = getelementptr inbounds float* %tmp11549, i64 1
+ %tmp11551 = getelementptr inbounds float* %tmp11550, i64 1
+ %tmp11552 = getelementptr inbounds float* %tmp11551, i64 1
+ %tmp11553 = getelementptr inbounds float* %tmp11552, i64 1
+ %tmp11554 = getelementptr inbounds float* %tmp11553, i64 1
+ %tmp11555 = getelementptr inbounds float* %tmp11554, i64 1
+ %tmp11556 = getelementptr inbounds float* %tmp11555, i64 1
+ %tmp11557 = getelementptr inbounds float* %tmp11556, i64 1
+ %tmp11558 = getelementptr inbounds float* %tmp11557, i64 1
+ %tmp11559 = getelementptr inbounds float* %tmp11558, i64 1
+ %tmp11560 = getelementptr inbounds float* %tmp11559, i64 1
+ %tmp11561 = getelementptr inbounds float* %tmp11560, i64 1
+ %tmp11562 = getelementptr inbounds float* %tmp11561, i64 1
+ %tmp11563 = getelementptr inbounds float* %tmp11562, i64 1
+ %tmp11564 = getelementptr inbounds float* %tmp11563, i64 1
+ %tmp11565 = getelementptr inbounds float* %tmp11564, i64 1
+ %tmp11566 = getelementptr inbounds float* %tmp11565, i64 1
+ %tmp11567 = getelementptr inbounds float* %tmp11566, i64 1
+ %tmp11568 = getelementptr inbounds float* %tmp11567, i64 1
+ %tmp11569 = getelementptr inbounds float* %tmp11568, i64 1
+ %tmp11570 = getelementptr inbounds float* %tmp11569, i64 1
+ %tmp11571 = getelementptr inbounds float* %tmp11570, i64 1
+ %tmp11572 = getelementptr inbounds float* %tmp11571, i64 1
+ %tmp11573 = getelementptr inbounds float* %tmp11572, i64 1
+ %tmp11574 = getelementptr inbounds float* %tmp11573, i64 1
+ %tmp11575 = getelementptr inbounds float* %tmp11574, i64 1
+ %tmp11576 = getelementptr inbounds float* %tmp11575, i64 1
+ %tmp11577 = getelementptr inbounds float* %tmp11576, i64 1
+ %tmp11578 = getelementptr inbounds float* %tmp11577, i64 1
+ %tmp11579 = getelementptr inbounds float* %tmp11578, i64 1
+ %tmp11580 = getelementptr inbounds float* %tmp11579, i64 1
+ %tmp11581 = getelementptr inbounds float* %tmp11580, i64 1
+ %tmp11582 = getelementptr inbounds float* %tmp11581, i64 1
+ %tmp11583 = getelementptr inbounds float* %tmp11582, i64 1
+ %tmp11584 = getelementptr inbounds float* %tmp11583, i64 1
+ %tmp11585 = getelementptr inbounds float* %tmp11584, i64 1
+ %tmp11586 = getelementptr inbounds float* %tmp11585, i64 1
+ %tmp11587 = getelementptr inbounds float* %tmp11586, i64 1
+ %tmp11588 = getelementptr inbounds float* %tmp11587, i64 1
+ %tmp11589 = getelementptr inbounds float* %tmp11588, i64 1
+ %tmp11590 = getelementptr inbounds float* %tmp11589, i64 1
+ %tmp11591 = getelementptr inbounds float* %tmp11590, i64 1
+ %tmp11592 = getelementptr inbounds float* %tmp11591, i64 1
+ %tmp11593 = getelementptr inbounds float* %tmp11592, i64 1
+ %tmp11594 = getelementptr inbounds float* %tmp11593, i64 1
+ %tmp11595 = getelementptr inbounds float* %tmp11594, i64 1
+ %tmp11596 = getelementptr inbounds float* %tmp11595, i64 1
+ %tmp11597 = getelementptr inbounds float* %tmp11596, i64 1
+ %tmp11598 = getelementptr inbounds float* %tmp11597, i64 1
+ %tmp11599 = getelementptr inbounds float* %tmp11598, i64 1
+ %tmp11600 = getelementptr inbounds float* %tmp11599, i64 1
+ %tmp11601 = getelementptr inbounds float* %tmp11600, i64 1
+ %tmp11602 = getelementptr inbounds float* %tmp11601, i64 1
+ %tmp11603 = getelementptr inbounds float* %tmp11602, i64 1
+ %tmp11604 = getelementptr inbounds float* %tmp11603, i64 1
+ %tmp11605 = getelementptr inbounds float* %tmp11604, i64 1
+ %tmp11606 = getelementptr inbounds float* %tmp11605, i64 1
+ %tmp11607 = getelementptr inbounds float* %tmp11606, i64 1
+ %tmp11608 = getelementptr inbounds float* %tmp11607, i64 1
+ %tmp11609 = getelementptr inbounds float* %tmp11608, i64 1
+ %tmp11610 = getelementptr inbounds float* %tmp11609, i64 1
+ %tmp11611 = getelementptr inbounds float* %tmp11610, i64 1
+ %tmp11612 = getelementptr inbounds float* %tmp11611, i64 1
+ %tmp11613 = getelementptr inbounds float* %tmp11612, i64 1
+ %tmp11614 = getelementptr inbounds float* %tmp11613, i64 1
+ %tmp11615 = getelementptr inbounds float* %tmp11614, i64 1
+ %tmp11616 = getelementptr inbounds float* %tmp11615, i64 1
+ %tmp11617 = getelementptr inbounds float* %tmp11616, i64 1
+ %tmp11618 = getelementptr inbounds float* %tmp11617, i64 1
+ %tmp11619 = getelementptr inbounds float* %tmp11618, i64 1
+ %tmp11620 = getelementptr inbounds float* %tmp11619, i64 1
+ %tmp11621 = getelementptr inbounds float* %tmp11620, i64 1
+ %tmp11622 = getelementptr inbounds float* %tmp11621, i64 1
+ %tmp11623 = getelementptr inbounds float* %tmp11622, i64 1
+ %tmp11624 = getelementptr inbounds float* %tmp11623, i64 1
+ %tmp11625 = getelementptr inbounds float* %tmp11624, i64 1
+ %tmp11626 = getelementptr inbounds float* %tmp11625, i64 1
+ %tmp11627 = getelementptr inbounds float* %tmp11626, i64 1
+ %tmp11628 = getelementptr inbounds float* %tmp11627, i64 1
+ %tmp11629 = getelementptr inbounds float* %tmp11628, i64 1
+ %tmp11630 = getelementptr inbounds float* %tmp11629, i64 1
+ %tmp11631 = getelementptr inbounds float* %tmp11630, i64 1
+ %tmp11632 = getelementptr inbounds float* %tmp11631, i64 1
+ %tmp11633 = getelementptr inbounds float* %tmp11632, i64 1
+ %tmp11634 = getelementptr inbounds float* %tmp11633, i64 1
+ %tmp11635 = getelementptr inbounds float* %tmp11634, i64 1
+ %tmp11636 = getelementptr inbounds float* %tmp11635, i64 1
+ %tmp11637 = getelementptr inbounds float* %tmp11636, i64 1
+ %tmp11638 = getelementptr inbounds float* %tmp11637, i64 1
+ %tmp11639 = getelementptr inbounds float* %tmp11638, i64 1
+ %tmp11640 = getelementptr inbounds float* %tmp11639, i64 1
+ %tmp11641 = getelementptr inbounds float* %tmp11640, i64 1
+ %tmp11642 = getelementptr inbounds float* %tmp11641, i64 1
+ %tmp11643 = getelementptr inbounds float* %tmp11642, i64 1
+ %tmp11644 = getelementptr inbounds float* %tmp11643, i64 1
+ %tmp11645 = getelementptr inbounds float* %tmp11644, i64 1
+ %tmp11646 = getelementptr inbounds float* %tmp11645, i64 1
+ %tmp11647 = getelementptr inbounds float* %tmp11646, i64 1
+ %tmp11648 = getelementptr inbounds float* %tmp11647, i64 1
+ %tmp11649 = getelementptr inbounds float* %tmp11648, i64 1
+ %tmp11650 = getelementptr inbounds float* %tmp11649, i64 1
+ %tmp11651 = getelementptr inbounds float* %tmp11650, i64 1
+ %tmp11652 = getelementptr inbounds float* %tmp11651, i64 1
+ %tmp11653 = getelementptr inbounds float* %tmp11652, i64 1
+ %tmp11654 = getelementptr inbounds float* %tmp11653, i64 1
+ %tmp11655 = getelementptr inbounds float* %tmp11654, i64 1
+ %tmp11656 = getelementptr inbounds float* %tmp11655, i64 1
+ %tmp11657 = getelementptr inbounds float* %tmp11656, i64 1
+ %tmp11658 = getelementptr inbounds float* %tmp11657, i64 1
+ %tmp11659 = getelementptr inbounds float* %tmp11658, i64 1
+ %tmp11660 = getelementptr inbounds float* %tmp11659, i64 1
+ %tmp11661 = getelementptr inbounds float* %tmp11660, i64 1
+ %tmp11662 = getelementptr inbounds float* %tmp11661, i64 1
+ %tmp11663 = getelementptr inbounds float* %tmp11662, i64 1
+ %tmp11664 = getelementptr inbounds float* %tmp11663, i64 1
+ %tmp11665 = getelementptr inbounds float* %tmp11664, i64 1
+ %tmp11666 = getelementptr inbounds float* %tmp11665, i64 1
+ %tmp11667 = getelementptr inbounds float* %tmp11666, i64 1
+ %tmp11668 = getelementptr inbounds float* %tmp11667, i64 1
+ %tmp11669 = getelementptr inbounds float* %tmp11668, i64 1
+ %tmp11670 = getelementptr inbounds float* %tmp11669, i64 1
+ %tmp11671 = getelementptr inbounds float* %tmp11670, i64 1
+ %tmp11672 = getelementptr inbounds float* %tmp11671, i64 1
+ %tmp11673 = getelementptr inbounds float* %tmp11672, i64 1
+ %tmp11674 = getelementptr inbounds float* %tmp11673, i64 1
+ %tmp11675 = getelementptr inbounds float* %tmp11674, i64 1
+ %tmp11676 = getelementptr inbounds float* %tmp11675, i64 1
+ %tmp11677 = getelementptr inbounds float* %tmp11676, i64 1
+ %tmp11678 = getelementptr inbounds float* %tmp11677, i64 1
+ %tmp11679 = getelementptr inbounds float* %tmp11678, i64 1
+ %tmp11680 = getelementptr inbounds float* %tmp11679, i64 1
+ %tmp11681 = getelementptr inbounds float* %tmp11680, i64 1
+ %tmp11682 = getelementptr inbounds float* %tmp11681, i64 1
+ %tmp11683 = getelementptr inbounds float* %tmp11682, i64 1
+ %tmp11684 = getelementptr inbounds float* %tmp11683, i64 1
+ %tmp11685 = getelementptr inbounds float* %tmp11684, i64 1
+ %tmp11686 = getelementptr inbounds float* %tmp11685, i64 1
+ %tmp11687 = getelementptr inbounds float* %tmp11686, i64 1
+ %tmp11688 = getelementptr inbounds float* %tmp11687, i64 1
+ %tmp11689 = getelementptr inbounds float* %tmp11688, i64 1
+ %tmp11690 = getelementptr inbounds float* %tmp11689, i64 1
+ %tmp11691 = getelementptr inbounds float* %tmp11690, i64 1
+ %tmp11692 = getelementptr inbounds float* %tmp11691, i64 1
+ %tmp11693 = getelementptr inbounds float* %tmp11692, i64 1
+ %tmp11694 = getelementptr inbounds float* %tmp11693, i64 1
+ %tmp11695 = getelementptr inbounds float* %tmp11694, i64 1
+ %tmp11696 = getelementptr inbounds float* %tmp11695, i64 1
+ %tmp11697 = getelementptr inbounds float* %tmp11696, i64 1
+ %tmp11698 = getelementptr inbounds float* %tmp11697, i64 1
+ %tmp11699 = getelementptr inbounds float* %tmp11698, i64 1
+ %tmp11700 = getelementptr inbounds float* %tmp11699, i64 1
+ %tmp11701 = getelementptr inbounds float* %tmp11700, i64 1
+ %tmp11702 = getelementptr inbounds float* %tmp11701, i64 1
+ %tmp11703 = getelementptr inbounds float* %tmp11702, i64 1
+ %tmp11704 = getelementptr inbounds float* %tmp11703, i64 1
+ %tmp11705 = getelementptr inbounds float* %tmp11704, i64 1
+ %tmp11706 = getelementptr inbounds float* %tmp11705, i64 1
+ %tmp11707 = getelementptr inbounds float* %tmp11706, i64 1
+ %tmp11708 = getelementptr inbounds float* %tmp11707, i64 1
+ %tmp11709 = getelementptr inbounds float* %tmp11708, i64 1
+ %tmp11710 = getelementptr inbounds float* %tmp11709, i64 1
+ %tmp11711 = getelementptr inbounds float* %tmp11710, i64 1
+ %tmp11712 = getelementptr inbounds float* %tmp11711, i64 1
+ %tmp11713 = getelementptr inbounds float* %tmp11712, i64 1
+ %tmp11714 = getelementptr inbounds float* %tmp11713, i64 1
+ %tmp11715 = getelementptr inbounds float* %tmp11714, i64 1
+ %tmp11716 = getelementptr inbounds float* %tmp11715, i64 1
+ %tmp11717 = getelementptr inbounds float* %tmp11716, i64 1
+ %tmp11718 = getelementptr inbounds float* %tmp11717, i64 1
+ %tmp11719 = getelementptr inbounds float* %tmp11718, i64 1
+ %tmp11720 = getelementptr inbounds float* %tmp11719, i64 1
+ %tmp11721 = getelementptr inbounds float* %tmp11720, i64 1
+ %tmp11722 = getelementptr inbounds float* %tmp11721, i64 1
+ %tmp11723 = getelementptr inbounds float* %tmp11722, i64 1
+ %tmp11724 = getelementptr inbounds float* %tmp11723, i64 1
+ %tmp11725 = getelementptr inbounds float* %tmp11724, i64 1
+ %tmp11726 = getelementptr inbounds float* %tmp11725, i64 1
+ %tmp11727 = getelementptr inbounds float* %tmp11726, i64 1
+ %tmp11728 = getelementptr inbounds float* %tmp11727, i64 1
+ %tmp11729 = getelementptr inbounds float* %tmp11728, i64 1
+ %tmp11730 = getelementptr inbounds float* %tmp11729, i64 1
+ %tmp11731 = getelementptr inbounds float* %tmp11730, i64 1
+ %tmp11732 = getelementptr inbounds float* %tmp11731, i64 1
+ %tmp11733 = getelementptr inbounds float* %tmp11732, i64 1
+ %tmp11734 = getelementptr inbounds float* %tmp11733, i64 1
+ %tmp11735 = getelementptr inbounds float* %tmp11734, i64 1
+ %tmp11736 = getelementptr inbounds float* %tmp11735, i64 1
+ %tmp11737 = getelementptr inbounds float* %tmp11736, i64 1
+ %tmp11738 = getelementptr inbounds float* %tmp11737, i64 1
+ %tmp11739 = getelementptr inbounds float* %tmp11738, i64 1
+ %tmp11740 = getelementptr inbounds float* %tmp11739, i64 1
+ %tmp11741 = getelementptr inbounds float* %tmp11740, i64 1
+ %tmp11742 = getelementptr inbounds float* %tmp11741, i64 1
+ %tmp11743 = getelementptr inbounds float* %tmp11742, i64 1
+ %tmp11744 = getelementptr inbounds float* %tmp11743, i64 1
+ %tmp11745 = getelementptr inbounds float* %tmp11744, i64 1
+ %tmp11746 = getelementptr inbounds float* %tmp11745, i64 1
+ %tmp11747 = getelementptr inbounds float* %tmp11746, i64 1
+ %tmp11748 = getelementptr inbounds float* %tmp11747, i64 1
+ %tmp11749 = getelementptr inbounds float* %tmp11748, i64 1
+ %tmp11750 = getelementptr inbounds float* %tmp11749, i64 1
+ %tmp11751 = getelementptr inbounds float* %tmp11750, i64 1
+ %tmp11752 = getelementptr inbounds float* %tmp11751, i64 1
+ %tmp11753 = getelementptr inbounds float* %tmp11752, i64 1
+ %tmp11754 = getelementptr inbounds float* %tmp11753, i64 1
+ %tmp11755 = getelementptr inbounds float* %tmp11754, i64 1
+ %tmp11756 = getelementptr inbounds float* %tmp11755, i64 1
+ %tmp11757 = getelementptr inbounds float* %tmp11756, i64 1
+ %tmp11758 = getelementptr inbounds float* %tmp11757, i64 1
+ %tmp11759 = getelementptr inbounds float* %tmp11758, i64 1
+ %tmp11760 = getelementptr inbounds float* %tmp11759, i64 1
+ %tmp11761 = getelementptr inbounds float* %tmp11760, i64 1
+ %tmp11762 = getelementptr inbounds float* %tmp11761, i64 1
+ %tmp11763 = getelementptr inbounds float* %tmp11762, i64 1
+ %tmp11764 = getelementptr inbounds float* %tmp11763, i64 1
+ %tmp11765 = getelementptr inbounds float* %tmp11764, i64 1
+ %tmp11766 = getelementptr inbounds float* %tmp11765, i64 1
+ %tmp11767 = getelementptr inbounds float* %tmp11766, i64 1
+ %tmp11768 = getelementptr inbounds float* %tmp11767, i64 1
+ %tmp11769 = getelementptr inbounds float* %tmp11768, i64 1
+ %tmp11770 = getelementptr inbounds float* %tmp11769, i64 1
+ %tmp11771 = getelementptr inbounds float* %tmp11770, i64 1
+ %tmp11772 = getelementptr inbounds float* %tmp11771, i64 1
+ %tmp11773 = getelementptr inbounds float* %tmp11772, i64 1
+ %tmp11774 = getelementptr inbounds float* %tmp11773, i64 1
+ %tmp11775 = getelementptr inbounds float* %tmp11774, i64 1
+ %tmp11776 = getelementptr inbounds float* %tmp11775, i64 1
+ %tmp11777 = getelementptr inbounds float* %tmp11776, i64 1
+ %tmp11778 = getelementptr inbounds float* %tmp11777, i64 1
+ %tmp11779 = getelementptr inbounds float* %tmp11778, i64 1
+ %tmp11780 = getelementptr inbounds float* %tmp11779, i64 1
+ %tmp11781 = getelementptr inbounds float* %tmp11780, i64 1
+ %tmp11782 = getelementptr inbounds float* %tmp11781, i64 1
+ %tmp11783 = getelementptr inbounds float* %tmp11782, i64 1
+ %tmp11784 = getelementptr inbounds float* %tmp11783, i64 1
+ %tmp11785 = getelementptr inbounds float* %tmp11784, i64 1
+ %tmp11786 = getelementptr inbounds float* %tmp11785, i64 1
+ %tmp11787 = getelementptr inbounds float* %tmp11786, i64 1
+ %tmp11788 = getelementptr inbounds float* %tmp11787, i64 1
+ %tmp11789 = getelementptr inbounds float* %tmp11788, i64 1
+ %tmp11790 = getelementptr inbounds float* %tmp11789, i64 1
+ %tmp11791 = getelementptr inbounds float* %tmp11790, i64 1
+ %tmp11792 = getelementptr inbounds float* %tmp11791, i64 1
+ %tmp11793 = getelementptr inbounds float* %tmp11792, i64 1
+ %tmp11794 = getelementptr inbounds float* %tmp11793, i64 1
+ %tmp11795 = getelementptr inbounds float* %tmp11794, i64 1
+ %tmp11796 = getelementptr inbounds float* %tmp11795, i64 1
+ %tmp11797 = getelementptr inbounds float* %tmp11796, i64 1
+ %tmp11798 = getelementptr inbounds float* %tmp11797, i64 1
+ %tmp11799 = getelementptr inbounds float* %tmp11798, i64 1
+ %tmp11800 = getelementptr inbounds float* %tmp11799, i64 1
+ %tmp11801 = getelementptr inbounds float* %tmp11800, i64 1
+ %tmp11802 = getelementptr inbounds float* %tmp11801, i64 1
+ %tmp11803 = getelementptr inbounds float* %tmp11802, i64 1
+ %tmp11804 = getelementptr inbounds float* %tmp11803, i64 1
+ %tmp11805 = getelementptr inbounds float* %tmp11804, i64 1
+ %tmp11806 = getelementptr inbounds float* %tmp11805, i64 1
+ %tmp11807 = getelementptr inbounds float* %tmp11806, i64 1
+ %tmp11808 = getelementptr inbounds float* %tmp11807, i64 1
+ %tmp11809 = getelementptr inbounds float* %tmp11808, i64 1
+ %tmp11810 = getelementptr inbounds float* %tmp11809, i64 1
+ %tmp11811 = getelementptr inbounds float* %tmp11810, i64 1
+ %tmp11812 = getelementptr inbounds float* %tmp11811, i64 1
+ %tmp11813 = getelementptr inbounds float* %tmp11812, i64 1
+ %tmp11814 = getelementptr inbounds float* %tmp11813, i64 1
+ %tmp11815 = getelementptr inbounds float* %tmp11814, i64 1
+ %tmp11816 = getelementptr inbounds float* %tmp11815, i64 1
+ %tmp11817 = getelementptr inbounds float* %tmp11816, i64 1
+ %tmp11818 = getelementptr inbounds float* %tmp11817, i64 1
+ %tmp11819 = getelementptr inbounds float* %tmp11818, i64 1
+ %tmp11820 = getelementptr inbounds float* %tmp11819, i64 1
+ %tmp11821 = getelementptr inbounds float* %tmp11820, i64 1
+ %tmp11822 = getelementptr inbounds float* %tmp11821, i64 1
+ %tmp11823 = getelementptr inbounds float* %tmp11822, i64 1
+ %tmp11824 = getelementptr inbounds float* %tmp11823, i64 1
+ %tmp11825 = getelementptr inbounds float* %tmp11824, i64 1
+ %tmp11826 = getelementptr inbounds float* %tmp11825, i64 1
+ %tmp11827 = getelementptr inbounds float* %tmp11826, i64 1
+ %tmp11828 = getelementptr inbounds float* %tmp11827, i64 1
+ %tmp11829 = getelementptr inbounds float* %tmp11828, i64 1
+ %tmp11830 = getelementptr inbounds float* %tmp11829, i64 1
+ %tmp11831 = getelementptr inbounds float* %tmp11830, i64 1
+ %tmp11832 = getelementptr inbounds float* %tmp11831, i64 1
+ %tmp11833 = getelementptr inbounds float* %tmp11832, i64 1
+ %tmp11834 = getelementptr inbounds float* %tmp11833, i64 1
+ %tmp11835 = getelementptr inbounds float* %tmp11834, i64 1
+ %tmp11836 = getelementptr inbounds float* %tmp11835, i64 1
+ %tmp11837 = getelementptr inbounds float* %tmp11836, i64 1
+ %tmp11838 = getelementptr inbounds float* %tmp11837, i64 1
+ %tmp11839 = getelementptr inbounds float* %tmp11838, i64 1
+ %tmp11840 = getelementptr inbounds float* %tmp11839, i64 1
+ %tmp11841 = getelementptr inbounds float* %tmp11840, i64 1
+ %tmp11842 = getelementptr inbounds float* %tmp11841, i64 1
+ %tmp11843 = getelementptr inbounds float* %tmp11842, i64 1
+ %tmp11844 = getelementptr inbounds float* %tmp11843, i64 1
+ %tmp11845 = getelementptr inbounds float* %tmp11844, i64 1
+ %tmp11846 = getelementptr inbounds float* %tmp11845, i64 1
+ %tmp11847 = getelementptr inbounds float* %tmp11846, i64 1
+ %tmp11848 = getelementptr inbounds float* %tmp11847, i64 1
+ %tmp11849 = getelementptr inbounds float* %tmp11848, i64 1
+ %tmp11850 = getelementptr inbounds float* %tmp11849, i64 1
+ %tmp11851 = getelementptr inbounds float* %tmp11850, i64 1
+ %tmp11852 = getelementptr inbounds float* %tmp11851, i64 1
+ %tmp11853 = getelementptr inbounds float* %tmp11852, i64 1
+ %tmp11854 = getelementptr inbounds float* %tmp11853, i64 1
+ %tmp11855 = getelementptr inbounds float* %tmp11854, i64 1
+ %tmp11856 = getelementptr inbounds float* %tmp11855, i64 1
+ %tmp11857 = getelementptr inbounds float* %tmp11856, i64 1
+ %tmp11858 = getelementptr inbounds float* %tmp11857, i64 1
+ %tmp11859 = getelementptr inbounds float* %tmp11858, i64 1
+ %tmp11860 = getelementptr inbounds float* %tmp11859, i64 1
+ %tmp11861 = getelementptr inbounds float* %tmp11860, i64 1
+ %tmp11862 = getelementptr inbounds float* %tmp11861, i64 1
+ %tmp11863 = getelementptr inbounds float* %tmp11862, i64 1
+ %tmp11864 = getelementptr inbounds float* %tmp11863, i64 1
+ %tmp11865 = getelementptr inbounds float* %tmp11864, i64 1
+ %tmp11866 = getelementptr inbounds float* %tmp11865, i64 1
+ %tmp11867 = getelementptr inbounds float* %tmp11866, i64 1
+ %tmp11868 = getelementptr inbounds float* %tmp11867, i64 1
+ %tmp11869 = getelementptr inbounds float* %tmp11868, i64 1
+ %tmp11870 = getelementptr inbounds float* %tmp11869, i64 1
+ %tmp11871 = getelementptr inbounds float* %tmp11870, i64 1
+ %tmp11872 = getelementptr inbounds float* %tmp11871, i64 1
+ %tmp11873 = getelementptr inbounds float* %tmp11872, i64 1
+ %tmp11874 = getelementptr inbounds float* %tmp11873, i64 1
+ %tmp11875 = getelementptr inbounds float* %tmp11874, i64 1
+ %tmp11876 = getelementptr inbounds float* %tmp11875, i64 1
+ %tmp11877 = getelementptr inbounds float* %tmp11876, i64 1
+ %tmp11878 = getelementptr inbounds float* %tmp11877, i64 1
+ %tmp11879 = getelementptr inbounds float* %tmp11878, i64 1
+ %tmp11880 = getelementptr inbounds float* %tmp11879, i64 1
+ %tmp11881 = getelementptr inbounds float* %tmp11880, i64 1
+ %tmp11882 = getelementptr inbounds float* %tmp11881, i64 1
+ %tmp11883 = getelementptr inbounds float* %tmp11882, i64 1
+ %tmp11884 = getelementptr inbounds float* %tmp11883, i64 1
+ %tmp11885 = getelementptr inbounds float* %tmp11884, i64 1
+ %tmp11886 = getelementptr inbounds float* %tmp11885, i64 1
+ %tmp11887 = getelementptr inbounds float* %tmp11886, i64 1
+ %tmp11888 = getelementptr inbounds float* %tmp11887, i64 1
+ %tmp11889 = getelementptr inbounds float* %tmp11888, i64 1
+ %tmp11890 = getelementptr inbounds float* %tmp11889, i64 1
+ %tmp11891 = getelementptr inbounds float* %tmp11890, i64 1
+ %tmp11892 = getelementptr inbounds float* %tmp11891, i64 1
+ %tmp11893 = getelementptr inbounds float* %tmp11892, i64 1
+ %tmp11894 = getelementptr inbounds float* %tmp11893, i64 1
+ %tmp11895 = getelementptr inbounds float* %tmp11894, i64 1
+ %tmp11896 = getelementptr inbounds float* %tmp11895, i64 1
+ %tmp11897 = getelementptr inbounds float* %tmp11896, i64 1
+ %tmp11898 = getelementptr inbounds float* %tmp11897, i64 1
+ %tmp11899 = getelementptr inbounds float* %tmp11898, i64 1
+ %tmp11900 = getelementptr inbounds float* %tmp11899, i64 1
+ %tmp11901 = getelementptr inbounds float* %tmp11900, i64 1
+ %tmp11902 = getelementptr inbounds float* %tmp11901, i64 1
+ %tmp11903 = getelementptr inbounds float* %tmp11902, i64 1
+ %tmp11904 = getelementptr inbounds float* %tmp11903, i64 1
+ %tmp11905 = getelementptr inbounds float* %tmp11904, i64 1
+ %tmp11906 = getelementptr inbounds float* %tmp11905, i64 1
+ %tmp11907 = getelementptr inbounds float* %tmp11906, i64 1
+ %tmp11908 = getelementptr inbounds float* %tmp11907, i64 1
+ %tmp11909 = getelementptr inbounds float* %tmp11908, i64 1
+ %tmp11910 = getelementptr inbounds float* %tmp11909, i64 1
+ %tmp11911 = getelementptr inbounds float* %tmp11910, i64 1
+ %tmp11912 = getelementptr inbounds float* %tmp11911, i64 1
+ %tmp11913 = getelementptr inbounds float* %tmp11912, i64 1
+ %tmp11914 = getelementptr inbounds float* %tmp11913, i64 1
+ %tmp11915 = getelementptr inbounds float* %tmp11914, i64 1
+ %tmp11916 = getelementptr inbounds float* %tmp11915, i64 1
+ %tmp11917 = getelementptr inbounds float* %tmp11916, i64 1
+ %tmp11918 = getelementptr inbounds float* %tmp11917, i64 1
+ %tmp11919 = getelementptr inbounds float* %tmp11918, i64 1
+ %tmp11920 = getelementptr inbounds float* %tmp11919, i64 1
+ %tmp11921 = getelementptr inbounds float* %tmp11920, i64 1
+ %tmp11922 = getelementptr inbounds float* %tmp11921, i64 1
+ %tmp11923 = getelementptr inbounds float* %tmp11922, i64 1
+ %tmp11924 = getelementptr inbounds float* %tmp11923, i64 1
+ %tmp11925 = getelementptr inbounds float* %tmp11924, i64 1
+ %tmp11926 = getelementptr inbounds float* %tmp11925, i64 1
+ %tmp11927 = getelementptr inbounds float* %tmp11926, i64 1
+ %tmp11928 = getelementptr inbounds float* %tmp11927, i64 1
+ %tmp11929 = getelementptr inbounds float* %tmp11928, i64 1
+ %tmp11930 = getelementptr inbounds float* %tmp11929, i64 1
+ %tmp11931 = getelementptr inbounds float* %tmp11930, i64 1
+ %tmp11932 = getelementptr inbounds float* %tmp11931, i64 1
+ %tmp11933 = getelementptr inbounds float* %tmp11932, i64 1
+ %tmp11934 = getelementptr inbounds float* %tmp11933, i64 1
+ %tmp11935 = getelementptr inbounds float* %tmp11934, i64 1
+ %tmp11936 = getelementptr inbounds float* %tmp11935, i64 1
+ %tmp11937 = getelementptr inbounds float* %tmp11936, i64 1
+ %tmp11938 = getelementptr inbounds float* %tmp11937, i64 1
+ %tmp11939 = getelementptr inbounds float* %tmp11938, i64 1
+ %tmp11940 = getelementptr inbounds float* %tmp11939, i64 1
+ %tmp11941 = getelementptr inbounds float* %tmp11940, i64 1
+ %tmp11942 = getelementptr inbounds float* %tmp11941, i64 1
+ %tmp11943 = getelementptr inbounds float* %tmp11942, i64 1
+ %tmp11944 = getelementptr inbounds float* %tmp11943, i64 1
+ %tmp11945 = getelementptr inbounds float* %tmp11944, i64 1
+ %tmp11946 = getelementptr inbounds float* %tmp11945, i64 1
+ %tmp11947 = getelementptr inbounds float* %tmp11946, i64 1
+ %tmp11948 = getelementptr inbounds float* %tmp11947, i64 1
+ %tmp11949 = getelementptr inbounds float* %tmp11948, i64 1
+ %tmp11950 = getelementptr inbounds float* %tmp11949, i64 1
+ %tmp11951 = getelementptr inbounds float* %tmp11950, i64 1
+ %tmp11952 = getelementptr inbounds float* %tmp11951, i64 1
+ %tmp11953 = getelementptr inbounds float* %tmp11952, i64 1
+ %tmp11954 = getelementptr inbounds float* %tmp11953, i64 1
+ %tmp11955 = getelementptr inbounds float* %tmp11954, i64 1
+ %tmp11956 = getelementptr inbounds float* %tmp11955, i64 1
+ %tmp11957 = getelementptr inbounds float* %tmp11956, i64 1
+ %tmp11958 = getelementptr inbounds float* %tmp11957, i64 1
+ %tmp11959 = getelementptr inbounds float* %tmp11958, i64 1
+ %tmp11960 = getelementptr inbounds float* %tmp11959, i64 1
+ %tmp11961 = getelementptr inbounds float* %tmp11960, i64 1
+ %tmp11962 = getelementptr inbounds float* %tmp11961, i64 1
+ %tmp11963 = getelementptr inbounds float* %tmp11962, i64 1
+ %tmp11964 = getelementptr inbounds float* %tmp11963, i64 1
+ %tmp11965 = getelementptr inbounds float* %tmp11964, i64 1
+ %tmp11966 = getelementptr inbounds float* %tmp11965, i64 1
+ %tmp11967 = getelementptr inbounds float* %tmp11966, i64 1
+ %tmp11968 = getelementptr inbounds float* %tmp11967, i64 1
+ %tmp11969 = getelementptr inbounds float* %tmp11968, i64 1
+ %tmp11970 = getelementptr inbounds float* %tmp11969, i64 1
+ %tmp11971 = getelementptr inbounds float* %tmp11970, i64 1
+ %tmp11972 = getelementptr inbounds float* %tmp11971, i64 1
+ %tmp11973 = getelementptr inbounds float* %tmp11972, i64 1
+ %tmp11974 = getelementptr inbounds float* %tmp11973, i64 1
+ %tmp11975 = getelementptr inbounds float* %tmp11974, i64 1
+ %tmp11976 = getelementptr inbounds float* %tmp11975, i64 1
+ %tmp11977 = getelementptr inbounds float* %tmp11976, i64 1
+ %tmp11978 = getelementptr inbounds float* %tmp11977, i64 1
+ %tmp11979 = getelementptr inbounds float* %tmp11978, i64 1
+ %tmp11980 = getelementptr inbounds float* %tmp11979, i64 1
+ %tmp11981 = getelementptr inbounds float* %tmp11980, i64 1
+ %tmp11982 = getelementptr inbounds float* %tmp11981, i64 1
+ %tmp11983 = getelementptr inbounds float* %tmp11982, i64 1
+ %tmp11984 = getelementptr inbounds float* %tmp11983, i64 1
+ %tmp11985 = getelementptr inbounds float* %tmp11984, i64 1
+ %tmp11986 = getelementptr inbounds float* %tmp11985, i64 1
+ %tmp11987 = getelementptr inbounds float* %tmp11986, i64 1
+ %tmp11988 = getelementptr inbounds float* %tmp11987, i64 1
+ %tmp11989 = getelementptr inbounds float* %tmp11988, i64 1
+ %tmp11990 = getelementptr inbounds float* %tmp11989, i64 1
+ %tmp11991 = getelementptr inbounds float* %tmp11990, i64 1
+ %tmp11992 = getelementptr inbounds float* %tmp11991, i64 1
+ %tmp11993 = getelementptr inbounds float* %tmp11992, i64 1
+ %tmp11994 = getelementptr inbounds float* %tmp11993, i64 1
+ %tmp11995 = getelementptr inbounds float* %tmp11994, i64 1
+ %tmp11996 = getelementptr inbounds float* %tmp11995, i64 1
+ %tmp11997 = getelementptr inbounds float* %tmp11996, i64 1
+ %tmp11998 = getelementptr inbounds float* %tmp11997, i64 1
+ %tmp11999 = getelementptr inbounds float* %tmp11998, i64 1
+ %tmp12000 = getelementptr inbounds float* %tmp11999, i64 1
+ %tmp12001 = getelementptr inbounds float* %tmp12000, i64 1
+ %tmp12002 = getelementptr inbounds float* %tmp12001, i64 1
+ %tmp12003 = getelementptr inbounds float* %tmp12002, i64 1
+ %tmp12004 = getelementptr inbounds float* %tmp12003, i64 1
+ %tmp12005 = getelementptr inbounds float* %tmp12004, i64 1
+ %tmp12006 = getelementptr inbounds float* %tmp12005, i64 1
+ %tmp12007 = getelementptr inbounds float* %tmp12006, i64 1
+ %tmp12008 = getelementptr inbounds float* %tmp12007, i64 1
+ %tmp12009 = getelementptr inbounds float* %tmp12008, i64 1
+ %tmp12010 = getelementptr inbounds float* %tmp12009, i64 1
+ %tmp12011 = getelementptr inbounds float* %tmp12010, i64 1
+ %tmp12012 = getelementptr inbounds float* %tmp12011, i64 1
+ %tmp12013 = getelementptr inbounds float* %tmp12012, i64 1
+ %tmp12014 = getelementptr inbounds float* %tmp12013, i64 1
+ %tmp12015 = getelementptr inbounds float* %tmp12014, i64 1
+ %tmp12016 = getelementptr inbounds float* %tmp12015, i64 1
+ %tmp12017 = getelementptr inbounds float* %tmp12016, i64 1
+ %tmp12018 = getelementptr inbounds float* %tmp12017, i64 1
+ %tmp12019 = getelementptr inbounds float* %tmp12018, i64 1
+ %tmp12020 = getelementptr inbounds float* %tmp12019, i64 1
+ %tmp12021 = getelementptr inbounds float* %tmp12020, i64 1
+ %tmp12022 = getelementptr inbounds float* %tmp12021, i64 1
+ %tmp12023 = getelementptr inbounds float* %tmp12022, i64 1
+ %tmp12024 = getelementptr inbounds float* %tmp12023, i64 1
+ %tmp12025 = getelementptr inbounds float* %tmp12024, i64 1
+ %tmp12026 = getelementptr inbounds float* %tmp12025, i64 1
+ %tmp12027 = getelementptr inbounds float* %tmp12026, i64 1
+ %tmp12028 = getelementptr inbounds float* %tmp12027, i64 1
+ %tmp12029 = getelementptr inbounds float* %tmp12028, i64 1
+ %tmp12030 = getelementptr inbounds float* %tmp12029, i64 1
+ %tmp12031 = getelementptr inbounds float* %tmp12030, i64 1
+ %tmp12032 = getelementptr inbounds float* %tmp12031, i64 1
+ %tmp12033 = getelementptr inbounds float* %tmp12032, i64 1
+ %tmp12034 = getelementptr inbounds float* %tmp12033, i64 1
+ %tmp12035 = getelementptr inbounds float* %tmp12034, i64 1
+ %tmp12036 = getelementptr inbounds float* %tmp12035, i64 1
+ %tmp12037 = getelementptr inbounds float* %tmp12036, i64 1
+ %tmp12038 = getelementptr inbounds float* %tmp12037, i64 1
+ %tmp12039 = getelementptr inbounds float* %tmp12038, i64 1
+ %tmp12040 = getelementptr inbounds float* %tmp12039, i64 1
+ %tmp12041 = getelementptr inbounds float* %tmp12040, i64 1
+ %tmp12042 = getelementptr inbounds float* %tmp12041, i64 1
+ %tmp12043 = getelementptr inbounds float* %tmp12042, i64 1
+ %tmp12044 = getelementptr inbounds float* %tmp12043, i64 1
+ %tmp12045 = getelementptr inbounds float* %tmp12044, i64 1
+ %tmp12046 = getelementptr inbounds float* %tmp12045, i64 1
+ %tmp12047 = getelementptr inbounds float* %tmp12046, i64 1
+ %tmp12048 = getelementptr inbounds float* %tmp12047, i64 1
+ %tmp12049 = getelementptr inbounds float* %tmp12048, i64 1
+ %tmp12050 = getelementptr inbounds float* %tmp12049, i64 1
+ %tmp12051 = getelementptr inbounds float* %tmp12050, i64 1
+ %tmp12052 = getelementptr inbounds float* %tmp12051, i64 1
+ %tmp12053 = getelementptr inbounds float* %tmp12052, i64 1
+ %tmp12054 = getelementptr inbounds float* %tmp12053, i64 1
+ %tmp12055 = getelementptr inbounds float* %tmp12054, i64 1
+ %tmp12056 = getelementptr inbounds float* %tmp12055, i64 1
+ %tmp12057 = getelementptr inbounds float* %tmp12056, i64 1
+ %tmp12058 = getelementptr inbounds float* %tmp12057, i64 1
+ %tmp12059 = getelementptr inbounds float* %tmp12058, i64 1
+ %tmp12060 = getelementptr inbounds float* %tmp12059, i64 1
+ %tmp12061 = getelementptr inbounds float* %tmp12060, i64 1
+ %tmp12062 = getelementptr inbounds float* %tmp12061, i64 1
+ %tmp12063 = getelementptr inbounds float* %tmp12062, i64 1
+ %tmp12064 = getelementptr inbounds float* %tmp12063, i64 1
+ %tmp12065 = getelementptr inbounds float* %tmp12064, i64 1
+ %tmp12066 = getelementptr inbounds float* %tmp12065, i64 1
+ %tmp12067 = getelementptr inbounds float* %tmp12066, i64 1
+ %tmp12068 = getelementptr inbounds float* %tmp12067, i64 1
+ %tmp12069 = getelementptr inbounds float* %tmp12068, i64 1
+ %tmp12070 = getelementptr inbounds float* %tmp12069, i64 1
+ %tmp12071 = getelementptr inbounds float* %tmp12070, i64 1
+ %tmp12072 = getelementptr inbounds float* %tmp12071, i64 1
+ %tmp12073 = getelementptr inbounds float* %tmp12072, i64 1
+ %tmp12074 = getelementptr inbounds float* %tmp12073, i64 1
+ %tmp12075 = getelementptr inbounds float* %tmp12074, i64 1
+ %tmp12076 = getelementptr inbounds float* %tmp12075, i64 1
+ %tmp12077 = getelementptr inbounds float* %tmp12076, i64 1
+ %tmp12078 = getelementptr inbounds float* %tmp12077, i64 1
+ %tmp12079 = getelementptr inbounds float* %tmp12078, i64 1
+ %tmp12080 = getelementptr inbounds float* %tmp12079, i64 1
+ %tmp12081 = getelementptr inbounds float* %tmp12080, i64 1
+ %tmp12082 = getelementptr inbounds float* %tmp12081, i64 1
+ %tmp12083 = getelementptr inbounds float* %tmp12082, i64 1
+ %tmp12084 = getelementptr inbounds float* %tmp12083, i64 1
+ %tmp12085 = getelementptr inbounds float* %tmp12084, i64 1
+ %tmp12086 = getelementptr inbounds float* %tmp12085, i64 1
+ %tmp12087 = getelementptr inbounds float* %tmp12086, i64 1
+ %tmp12088 = getelementptr inbounds float* %tmp12087, i64 1
+ %tmp12089 = getelementptr inbounds float* %tmp12088, i64 1
+ %tmp12090 = getelementptr inbounds float* %tmp12089, i64 1
+ %tmp12091 = getelementptr inbounds float* %tmp12090, i64 1
+ %tmp12092 = getelementptr inbounds float* %tmp12091, i64 1
+ %tmp12093 = getelementptr inbounds float* %tmp12092, i64 1
+ %tmp12094 = getelementptr inbounds float* %tmp12093, i64 1
+ %tmp12095 = getelementptr inbounds float* %tmp12094, i64 1
+ %tmp12096 = getelementptr inbounds float* %tmp12095, i64 1
+ %tmp12097 = getelementptr inbounds float* %tmp12096, i64 1
+ %tmp12098 = getelementptr inbounds float* %tmp12097, i64 1
+ %tmp12099 = getelementptr inbounds float* %tmp12098, i64 1
+ %tmp12100 = getelementptr inbounds float* %tmp12099, i64 1
+ %tmp12101 = getelementptr inbounds float* %tmp12100, i64 1
+ %tmp12102 = getelementptr inbounds float* %tmp12101, i64 1
+ %tmp12103 = getelementptr inbounds float* %tmp12102, i64 1
+ %tmp12104 = getelementptr inbounds float* %tmp12103, i64 1
+ %tmp12105 = getelementptr inbounds float* %tmp12104, i64 1
+ %tmp12106 = getelementptr inbounds float* %tmp12105, i64 1
+ %tmp12107 = getelementptr inbounds float* %tmp12106, i64 1
+ %tmp12108 = getelementptr inbounds float* %tmp12107, i64 1
+ %tmp12109 = getelementptr inbounds float* %tmp12108, i64 1
+ %tmp12110 = getelementptr inbounds float* %tmp12109, i64 1
+ %tmp12111 = getelementptr inbounds float* %tmp12110, i64 1
+ %tmp12112 = getelementptr inbounds float* %tmp12111, i64 1
+ %tmp12113 = getelementptr inbounds float* %tmp12112, i64 1
+ %tmp12114 = getelementptr inbounds float* %tmp12113, i64 1
+ %tmp12115 = getelementptr inbounds float* %tmp12114, i64 1
+ %tmp12116 = getelementptr inbounds float* %tmp12115, i64 1
+ %tmp12117 = getelementptr inbounds float* %tmp12116, i64 1
+ %tmp12118 = getelementptr inbounds float* %tmp12117, i64 1
+ %tmp12119 = getelementptr inbounds float* %tmp12118, i64 1
+ %tmp12120 = getelementptr inbounds float* %tmp12119, i64 1
+ %tmp12121 = getelementptr inbounds float* %tmp12120, i64 1
+ %tmp12122 = getelementptr inbounds float* %tmp12121, i64 1
+ %tmp12123 = getelementptr inbounds float* %tmp12122, i64 1
+ %tmp12124 = getelementptr inbounds float* %tmp12123, i64 1
+ %tmp12125 = getelementptr inbounds float* %tmp12124, i64 1
+ %tmp12126 = getelementptr inbounds float* %tmp12125, i64 1
+ %tmp12127 = getelementptr inbounds float* %tmp12126, i64 1
+ %tmp12128 = getelementptr inbounds float* %tmp12127, i64 1
+ %tmp12129 = getelementptr inbounds float* %tmp12128, i64 1
+ %tmp12130 = getelementptr inbounds float* %tmp12129, i64 1
+ %tmp12131 = getelementptr inbounds float* %tmp12130, i64 1
+ %tmp12132 = getelementptr inbounds float* %tmp12131, i64 1
+ %tmp12133 = getelementptr inbounds float* %tmp12132, i64 1
+ %tmp12134 = getelementptr inbounds float* %tmp12133, i64 1
+ %tmp12135 = getelementptr inbounds float* %tmp12134, i64 1
+ %tmp12136 = getelementptr inbounds float* %tmp12135, i64 1
+ %tmp12137 = getelementptr inbounds float* %tmp12136, i64 1
+ %tmp12138 = getelementptr inbounds float* %tmp12137, i64 1
+ %tmp12139 = getelementptr inbounds float* %tmp12138, i64 1
+ %tmp12140 = getelementptr inbounds float* %tmp12139, i64 1
+ %tmp12141 = getelementptr inbounds float* %tmp12140, i64 1
+ %tmp12142 = getelementptr inbounds float* %tmp12141, i64 1
+ %tmp12143 = getelementptr inbounds float* %tmp12142, i64 1
+ %tmp12144 = getelementptr inbounds float* %tmp12143, i64 1
+ %tmp12145 = getelementptr inbounds float* %tmp12144, i64 1
+ %tmp12146 = getelementptr inbounds float* %tmp12145, i64 1
+ %tmp12147 = getelementptr inbounds float* %tmp12146, i64 1
+ %tmp12148 = getelementptr inbounds float* %tmp12147, i64 1
+ %tmp12149 = getelementptr inbounds float* %tmp12148, i64 1
+ %tmp12150 = getelementptr inbounds float* %tmp12149, i64 1
+ %tmp12151 = getelementptr inbounds float* %tmp12150, i64 1
+ %tmp12152 = getelementptr inbounds float* %tmp12151, i64 1
+ %tmp12153 = getelementptr inbounds float* %tmp12152, i64 1
+ %tmp12154 = getelementptr inbounds float* %tmp12153, i64 1
+ %tmp12155 = getelementptr inbounds float* %tmp12154, i64 1
+ %tmp12156 = getelementptr inbounds float* %tmp12155, i64 1
+ %tmp12157 = getelementptr inbounds float* %tmp12156, i64 1
+ %tmp12158 = getelementptr inbounds float* %tmp12157, i64 1
+ %tmp12159 = getelementptr inbounds float* %tmp12158, i64 1
+ %tmp12160 = getelementptr inbounds float* %tmp12159, i64 1
+ %tmp12161 = getelementptr inbounds float* %tmp12160, i64 1
+ %tmp12162 = getelementptr inbounds float* %tmp12161, i64 1
+ %tmp12163 = getelementptr inbounds float* %tmp12162, i64 1
+ %tmp12164 = getelementptr inbounds float* %tmp12163, i64 1
+ %tmp12165 = getelementptr inbounds float* %tmp12164, i64 1
+ %tmp12166 = getelementptr inbounds float* %tmp12165, i64 1
+ %tmp12167 = getelementptr inbounds float* %tmp12166, i64 1
+ %tmp12168 = getelementptr inbounds float* %tmp12167, i64 1
+ %tmp12169 = getelementptr inbounds float* %tmp12168, i64 1
+ %tmp12170 = getelementptr inbounds float* %tmp12169, i64 1
+ %tmp12171 = getelementptr inbounds float* %tmp12170, i64 1
+ %tmp12172 = getelementptr inbounds float* %tmp12171, i64 1
+ %tmp12173 = getelementptr inbounds float* %tmp12172, i64 1
+ %tmp12174 = getelementptr inbounds float* %tmp12173, i64 1
+ %tmp12175 = getelementptr inbounds float* %tmp12174, i64 1
+ %tmp12176 = getelementptr inbounds float* %tmp12175, i64 1
+ %tmp12177 = getelementptr inbounds float* %tmp12176, i64 1
+ %tmp12178 = getelementptr inbounds float* %tmp12177, i64 1
+ %tmp12179 = getelementptr inbounds float* %tmp12178, i64 1
+ %tmp12180 = getelementptr inbounds float* %tmp12179, i64 1
+ %tmp12181 = getelementptr inbounds float* %tmp12180, i64 1
+ %tmp12182 = getelementptr inbounds float* %tmp12181, i64 1
+ %tmp12183 = getelementptr inbounds float* %tmp12182, i64 1
+ %tmp12184 = getelementptr inbounds float* %tmp12183, i64 1
+ %tmp12185 = getelementptr inbounds float* %tmp12184, i64 1
+ %tmp12186 = getelementptr inbounds float* %tmp12185, i64 1
+ %tmp12187 = getelementptr inbounds float* %tmp12186, i64 1
+ %tmp12188 = getelementptr inbounds float* %tmp12187, i64 1
+ %tmp12189 = getelementptr inbounds float* %tmp12188, i64 1
+ %tmp12190 = getelementptr inbounds float* %tmp12189, i64 1
+ %tmp12191 = getelementptr inbounds float* %tmp12190, i64 1
+ %tmp12192 = getelementptr inbounds float* %tmp12191, i64 1
+ %tmp12193 = getelementptr inbounds float* %tmp12192, i64 1
+ %tmp12194 = getelementptr inbounds float* %tmp12193, i64 1
+ %tmp12195 = getelementptr inbounds float* %tmp12194, i64 1
+ %tmp12196 = getelementptr inbounds float* %tmp12195, i64 1
+ %tmp12197 = getelementptr inbounds float* %tmp12196, i64 1
+ %tmp12198 = getelementptr inbounds float* %tmp12197, i64 1
+ %tmp12199 = getelementptr inbounds float* %tmp12198, i64 1
+ %tmp12200 = getelementptr inbounds float* %tmp12199, i64 1
+ %tmp12201 = getelementptr inbounds float* %tmp12200, i64 1
+ %tmp12202 = getelementptr inbounds float* %tmp12201, i64 1
+ %tmp12203 = getelementptr inbounds float* %tmp12202, i64 1
+ %tmp12204 = getelementptr inbounds float* %tmp12203, i64 1
+ %tmp12205 = getelementptr inbounds float* %tmp12204, i64 1
+ %tmp12206 = getelementptr inbounds float* %tmp12205, i64 1
+ %tmp12207 = getelementptr inbounds float* %tmp12206, i64 1
+ %tmp12208 = getelementptr inbounds float* %tmp12207, i64 1
+ %tmp12209 = getelementptr inbounds float* %tmp12208, i64 1
+ %tmp12210 = getelementptr inbounds float* %tmp12209, i64 1
+ %tmp12211 = getelementptr inbounds float* %tmp12210, i64 1
+ %tmp12212 = getelementptr inbounds float* %tmp12211, i64 1
+ %tmp12213 = getelementptr inbounds float* %tmp12212, i64 1
+ %tmp12214 = getelementptr inbounds float* %tmp12213, i64 1
+ %tmp12215 = getelementptr inbounds float* %tmp12214, i64 1
+ %tmp12216 = getelementptr inbounds float* %tmp12215, i64 1
+ %tmp12217 = getelementptr inbounds float* %tmp12216, i64 1
+ %tmp12218 = getelementptr inbounds float* %tmp12217, i64 1
+ %tmp12219 = getelementptr inbounds float* %tmp12218, i64 1
+ %tmp12220 = getelementptr inbounds float* %tmp12219, i64 1
+ %tmp12221 = getelementptr inbounds float* %tmp12220, i64 1
+ %tmp12222 = getelementptr inbounds float* %tmp12221, i64 1
+ %tmp12223 = getelementptr inbounds float* %tmp12222, i64 1
+ %tmp12224 = getelementptr inbounds float* %tmp12223, i64 1
+ %tmp12225 = getelementptr inbounds float* %tmp12224, i64 1
+ %tmp12226 = getelementptr inbounds float* %tmp12225, i64 1
+ %tmp12227 = getelementptr inbounds float* %tmp12226, i64 1
+ %tmp12228 = getelementptr inbounds float* %tmp12227, i64 1
+ %tmp12229 = getelementptr inbounds float* %tmp12228, i64 1
+ %tmp12230 = getelementptr inbounds float* %tmp12229, i64 1
+ %tmp12231 = getelementptr inbounds float* %tmp12230, i64 1
+ %tmp12232 = getelementptr inbounds float* %tmp12231, i64 1
+ %tmp12233 = getelementptr inbounds float* %tmp12232, i64 1
+ %tmp12234 = getelementptr inbounds float* %tmp12233, i64 1
+ %tmp12235 = getelementptr inbounds float* %tmp12234, i64 1
+ %tmp12236 = getelementptr inbounds float* %tmp12235, i64 1
+ %tmp12237 = getelementptr inbounds float* %tmp12236, i64 1
+ %tmp12238 = getelementptr inbounds float* %tmp12237, i64 1
+ %tmp12239 = getelementptr inbounds float* %tmp12238, i64 1
+ %tmp12240 = getelementptr inbounds float* %tmp12239, i64 1
+ %tmp12241 = getelementptr inbounds float* %tmp12240, i64 1
+ %tmp12242 = getelementptr inbounds float* %tmp12241, i64 1
+ %tmp12243 = getelementptr inbounds float* %tmp12242, i64 1
+ %tmp12244 = getelementptr inbounds float* %tmp12243, i64 1
+ %tmp12245 = getelementptr inbounds float* %tmp12244, i64 1
+ %tmp12246 = getelementptr inbounds float* %tmp12245, i64 1
+ %tmp12247 = getelementptr inbounds float* %tmp12246, i64 1
+ %tmp12248 = getelementptr inbounds float* %tmp12247, i64 1
+ %tmp12249 = getelementptr inbounds float* %tmp12248, i64 1
+ %tmp12250 = getelementptr inbounds float* %tmp12249, i64 1
+ %tmp12251 = getelementptr inbounds float* %tmp12250, i64 1
+ %tmp12252 = getelementptr inbounds float* %tmp12251, i64 1
+ %tmp12253 = getelementptr inbounds float* %tmp12252, i64 1
+ %tmp12254 = getelementptr inbounds float* %tmp12253, i64 1
+ %tmp12255 = getelementptr inbounds float* %tmp12254, i64 1
+ %tmp12256 = getelementptr inbounds float* %tmp12255, i64 1
+ %tmp12257 = getelementptr inbounds float* %tmp12256, i64 1
+ %tmp12258 = getelementptr inbounds float* %tmp12257, i64 1
+ %tmp12259 = getelementptr inbounds float* %tmp12258, i64 1
+ %tmp12260 = getelementptr inbounds float* %tmp12259, i64 1
+ %tmp12261 = getelementptr inbounds float* %tmp12260, i64 1
+ %tmp12262 = getelementptr inbounds float* %tmp12261, i64 1
+ %tmp12263 = getelementptr inbounds float* %tmp12262, i64 1
+ %tmp12264 = getelementptr inbounds float* %tmp12263, i64 1
+ %tmp12265 = getelementptr inbounds float* %tmp12264, i64 1
+ %tmp12266 = getelementptr inbounds float* %tmp12265, i64 1
+ %tmp12267 = getelementptr inbounds float* %tmp12266, i64 1
+ %tmp12268 = getelementptr inbounds float* %tmp12267, i64 1
+ %tmp12269 = getelementptr inbounds float* %tmp12268, i64 1
+ %tmp12270 = getelementptr inbounds float* %tmp12269, i64 1
+ %tmp12271 = getelementptr inbounds float* %tmp12270, i64 1
+ %tmp12272 = getelementptr inbounds float* %tmp12271, i64 1
+ %tmp12273 = getelementptr inbounds float* %tmp12272, i64 1
+ %tmp12274 = getelementptr inbounds float* %tmp12273, i64 1
+ %tmp12275 = getelementptr inbounds float* %tmp12274, i64 1
+ %tmp12276 = getelementptr inbounds float* %tmp12275, i64 1
+ %tmp12277 = getelementptr inbounds float* %tmp12276, i64 1
+ %tmp12278 = getelementptr inbounds float* %tmp12277, i64 1
+ %tmp12279 = getelementptr inbounds float* %tmp12278, i64 1
+ %tmp12280 = getelementptr inbounds float* %tmp12279, i64 1
+ %tmp12281 = getelementptr inbounds float* %tmp12280, i64 1
+ %tmp12282 = getelementptr inbounds float* %tmp12281, i64 1
+ %tmp12283 = getelementptr inbounds float* %tmp12282, i64 1
+ %tmp12284 = getelementptr inbounds float* %tmp12283, i64 1
+ %tmp12285 = getelementptr inbounds float* %tmp12284, i64 1
+ %tmp12286 = getelementptr inbounds float* %tmp12285, i64 1
+ %tmp12287 = getelementptr inbounds float* %tmp12286, i64 1
+ %tmp12288 = getelementptr inbounds float* %tmp12287, i64 1
+ %tmp12289 = getelementptr inbounds float* %tmp12288, i64 1
+ %tmp12290 = getelementptr inbounds float* %tmp12289, i64 1
+ %tmp12291 = getelementptr inbounds float* %tmp12290, i64 1
+ %tmp12292 = getelementptr inbounds float* %tmp12291, i64 1
+ %tmp12293 = getelementptr inbounds float* %tmp12292, i64 1
+ %tmp12294 = getelementptr inbounds float* %tmp12293, i64 1
+ %tmp12295 = getelementptr inbounds float* %tmp12294, i64 1
+ %tmp12296 = getelementptr inbounds float* %tmp12295, i64 1
+ %tmp12297 = getelementptr inbounds float* %tmp12296, i64 1
+ %tmp12298 = getelementptr inbounds float* %tmp12297, i64 1
+ %tmp12299 = getelementptr inbounds float* %tmp12298, i64 1
+ %tmp12300 = getelementptr inbounds float* %tmp12299, i64 1
+ %tmp12301 = getelementptr inbounds float* %tmp12300, i64 1
+ %tmp12302 = getelementptr inbounds float* %tmp12301, i64 1
+ %tmp12303 = getelementptr inbounds float* %tmp12302, i64 1
+ %tmp12304 = getelementptr inbounds float* %tmp12303, i64 1
+ %tmp12305 = getelementptr inbounds float* %tmp12304, i64 1
+ %tmp12306 = getelementptr inbounds float* %tmp12305, i64 1
+ %tmp12307 = getelementptr inbounds float* %tmp12306, i64 1
+ %tmp12308 = getelementptr inbounds float* %tmp12307, i64 1
+ %tmp12309 = getelementptr inbounds float* %tmp12308, i64 1
+ %tmp12310 = getelementptr inbounds float* %tmp12309, i64 1
+ %tmp12311 = getelementptr inbounds float* %tmp12310, i64 1
+ %tmp12312 = getelementptr inbounds float* %tmp12311, i64 1
+ %tmp12313 = getelementptr inbounds float* %tmp12312, i64 1
+ %tmp12314 = getelementptr inbounds float* %tmp12313, i64 1
+ %tmp12315 = getelementptr inbounds float* %tmp12314, i64 1
+ %tmp12316 = getelementptr inbounds float* %tmp12315, i64 1
+ %tmp12317 = getelementptr inbounds float* %tmp12316, i64 1
+ %tmp12318 = getelementptr inbounds float* %tmp12317, i64 1
+ %tmp12319 = getelementptr inbounds float* %tmp12318, i64 1
+ %tmp12320 = getelementptr inbounds float* %tmp12319, i64 1
+ %tmp12321 = getelementptr inbounds float* %tmp12320, i64 1
+ %tmp12322 = getelementptr inbounds float* %tmp12321, i64 1
+ %tmp12323 = getelementptr inbounds float* %tmp12322, i64 1
+ %tmp12324 = getelementptr inbounds float* %tmp12323, i64 1
+ %tmp12325 = getelementptr inbounds float* %tmp12324, i64 1
+ %tmp12326 = getelementptr inbounds float* %tmp12325, i64 1
+ %tmp12327 = getelementptr inbounds float* %tmp12326, i64 1
+ %tmp12328 = getelementptr inbounds float* %tmp12327, i64 1
+ %tmp12329 = getelementptr inbounds float* %tmp12328, i64 1
+ %tmp12330 = getelementptr inbounds float* %tmp12329, i64 1
+ %tmp12331 = getelementptr inbounds float* %tmp12330, i64 1
+ %tmp12332 = getelementptr inbounds float* %tmp12331, i64 1
+ %tmp12333 = getelementptr inbounds float* %tmp12332, i64 1
+ %tmp12334 = getelementptr inbounds float* %tmp12333, i64 1
+ %tmp12335 = getelementptr inbounds float* %tmp12334, i64 1
+ %tmp12336 = getelementptr inbounds float* %tmp12335, i64 1
+ %tmp12337 = getelementptr inbounds float* %tmp12336, i64 1
+ %tmp12338 = getelementptr inbounds float* %tmp12337, i64 1
+ %tmp12339 = getelementptr inbounds float* %tmp12338, i64 1
+ %tmp12340 = getelementptr inbounds float* %tmp12339, i64 1
+ %tmp12341 = getelementptr inbounds float* %tmp12340, i64 1
+ %tmp12342 = getelementptr inbounds float* %tmp12341, i64 1
+ %tmp12343 = getelementptr inbounds float* %tmp12342, i64 1
+ %tmp12344 = getelementptr inbounds float* %tmp12343, i64 1
+ %tmp12345 = getelementptr inbounds float* %tmp12344, i64 1
+ %tmp12346 = getelementptr inbounds float* %tmp12345, i64 1
+ %tmp12347 = getelementptr inbounds float* %tmp12346, i64 1
+ %tmp12348 = getelementptr inbounds float* %tmp12347, i64 1
+ %tmp12349 = getelementptr inbounds float* %tmp12348, i64 1
+ %tmp12350 = getelementptr inbounds float* %tmp12349, i64 1
+ %tmp12351 = getelementptr inbounds float* %tmp12350, i64 1
+ %tmp12352 = getelementptr inbounds float* %tmp12351, i64 1
+ %tmp12353 = getelementptr inbounds float* %tmp12352, i64 1
+ %tmp12354 = getelementptr inbounds float* %tmp12353, i64 1
+ %tmp12355 = getelementptr inbounds float* %tmp12354, i64 1
+ %tmp12356 = getelementptr inbounds float* %tmp12355, i64 1
+ %tmp12357 = getelementptr inbounds float* %tmp12356, i64 1
+ %tmp12358 = getelementptr inbounds float* %tmp12357, i64 1
+ %tmp12359 = getelementptr inbounds float* %tmp12358, i64 1
+ %tmp12360 = getelementptr inbounds float* %tmp12359, i64 1
+ %tmp12361 = getelementptr inbounds float* %tmp12360, i64 1
+ %tmp12362 = getelementptr inbounds float* %tmp12361, i64 1
+ %tmp12363 = getelementptr inbounds float* %tmp12362, i64 1
+ %tmp12364 = getelementptr inbounds float* %tmp12363, i64 1
+ %tmp12365 = getelementptr inbounds float* %tmp12364, i64 1
+ %tmp12366 = getelementptr inbounds float* %tmp12365, i64 1
+ %tmp12367 = getelementptr inbounds float* %tmp12366, i64 1
+ %tmp12368 = getelementptr inbounds float* %tmp12367, i64 1
+ %tmp12369 = getelementptr inbounds float* %tmp12368, i64 1
+ %tmp12370 = getelementptr inbounds float* %tmp12369, i64 1
+ %tmp12371 = getelementptr inbounds float* %tmp12370, i64 1
+ %tmp12372 = getelementptr inbounds float* %tmp12371, i64 1
+ %tmp12373 = getelementptr inbounds float* %tmp12372, i64 1
+ %tmp12374 = getelementptr inbounds float* %tmp12373, i64 1
+ %tmp12375 = getelementptr inbounds float* %tmp12374, i64 1
+ %tmp12376 = getelementptr inbounds float* %tmp12375, i64 1
+ %tmp12377 = getelementptr inbounds float* %tmp12376, i64 1
+ %tmp12378 = getelementptr inbounds float* %tmp12377, i64 1
+ %tmp12379 = getelementptr inbounds float* %tmp12378, i64 1
+ %tmp12380 = getelementptr inbounds float* %tmp12379, i64 1
+ %tmp12381 = getelementptr inbounds float* %tmp12380, i64 1
+ %tmp12382 = getelementptr inbounds float* %tmp12381, i64 1
+ %tmp12383 = getelementptr inbounds float* %tmp12382, i64 1
+ %tmp12384 = getelementptr inbounds float* %tmp12383, i64 1
+ %tmp12385 = getelementptr inbounds float* %tmp12384, i64 1
+ %tmp12386 = getelementptr inbounds float* %tmp12385, i64 1
+ %tmp12387 = getelementptr inbounds float* %tmp12386, i64 1
+ %tmp12388 = getelementptr inbounds float* %tmp12387, i64 1
+ %tmp12389 = getelementptr inbounds float* %tmp12388, i64 1
+ %tmp12390 = getelementptr inbounds float* %tmp12389, i64 1
+ %tmp12391 = getelementptr inbounds float* %tmp12390, i64 1
+ %tmp12392 = getelementptr inbounds float* %tmp12391, i64 1
+ %tmp12393 = getelementptr inbounds float* %tmp12392, i64 1
+ %tmp12394 = getelementptr inbounds float* %tmp12393, i64 1
+ %tmp12395 = getelementptr inbounds float* %tmp12394, i64 1
+ %tmp12396 = getelementptr inbounds float* %tmp12395, i64 1
+ %tmp12397 = getelementptr inbounds float* %tmp12396, i64 1
+ %tmp12398 = getelementptr inbounds float* %tmp12397, i64 1
+ %tmp12399 = getelementptr inbounds float* %tmp12398, i64 1
+ %tmp12400 = getelementptr inbounds float* %tmp12399, i64 1
+ %tmp12401 = getelementptr inbounds float* %tmp12400, i64 1
+ %tmp12402 = getelementptr inbounds float* %tmp12401, i64 1
+ %tmp12403 = getelementptr inbounds float* %tmp12402, i64 1
+ %tmp12404 = getelementptr inbounds float* %tmp12403, i64 1
+ %tmp12405 = getelementptr inbounds float* %tmp12404, i64 1
+ %tmp12406 = getelementptr inbounds float* %tmp12405, i64 1
+ %tmp12407 = getelementptr inbounds float* %tmp12406, i64 1
+ %tmp12408 = getelementptr inbounds float* %tmp12407, i64 1
+ %tmp12409 = getelementptr inbounds float* %tmp12408, i64 1
+ %tmp12410 = getelementptr inbounds float* %tmp12409, i64 1
+ %tmp12411 = getelementptr inbounds float* %tmp12410, i64 1
+ %tmp12412 = getelementptr inbounds float* %tmp12411, i64 1
+ %tmp12413 = getelementptr inbounds float* %tmp12412, i64 1
+ %tmp12414 = getelementptr inbounds float* %tmp12413, i64 1
+ %tmp12415 = getelementptr inbounds float* %tmp12414, i64 1
+ %tmp12416 = getelementptr inbounds float* %tmp12415, i64 1
+ %tmp12417 = getelementptr inbounds float* %tmp12416, i64 1
+ %tmp12418 = getelementptr inbounds float* %tmp12417, i64 1
+ %tmp12419 = getelementptr inbounds float* %tmp12418, i64 1
+ %tmp12420 = getelementptr inbounds float* %tmp12419, i64 1
+ %tmp12421 = getelementptr inbounds float* %tmp12420, i64 1
+ %tmp12422 = getelementptr inbounds float* %tmp12421, i64 1
+ %tmp12423 = getelementptr inbounds float* %tmp12422, i64 1
+ %tmp12424 = getelementptr inbounds float* %tmp12423, i64 1
+ %tmp12425 = getelementptr inbounds float* %tmp12424, i64 1
+ %tmp12426 = getelementptr inbounds float* %tmp12425, i64 1
+ %tmp12427 = getelementptr inbounds float* %tmp12426, i64 1
+ %tmp12428 = getelementptr inbounds float* %tmp12427, i64 1
+ %tmp12429 = getelementptr inbounds float* %tmp12428, i64 1
+ %tmp12430 = getelementptr inbounds float* %tmp12429, i64 1
+ %tmp12431 = getelementptr inbounds float* %tmp12430, i64 1
+ %tmp12432 = getelementptr inbounds float* %tmp12431, i64 1
+ %tmp12433 = getelementptr inbounds float* %tmp12432, i64 1
+ %tmp12434 = getelementptr inbounds float* %tmp12433, i64 1
+ %tmp12435 = getelementptr inbounds float* %tmp12434, i64 1
+ %tmp12436 = getelementptr inbounds float* %tmp12435, i64 1
+ %tmp12437 = getelementptr inbounds float* %tmp12436, i64 1
+ %tmp12438 = getelementptr inbounds float* %tmp12437, i64 1
+ %tmp12439 = getelementptr inbounds float* %tmp12438, i64 1
+ %tmp12440 = getelementptr inbounds float* %tmp12439, i64 1
+ %tmp12441 = getelementptr inbounds float* %tmp12440, i64 1
+ %tmp12442 = getelementptr inbounds float* %tmp12441, i64 1
+ %tmp12443 = getelementptr inbounds float* %tmp12442, i64 1
+ %tmp12444 = getelementptr inbounds float* %tmp12443, i64 1
+ %tmp12445 = getelementptr inbounds float* %tmp12444, i64 1
+ %tmp12446 = getelementptr inbounds float* %tmp12445, i64 1
+ %tmp12447 = getelementptr inbounds float* %tmp12446, i64 1
+ %tmp12448 = getelementptr inbounds float* %tmp12447, i64 1
+ %tmp12449 = getelementptr inbounds float* %tmp12448, i64 1
+ %tmp12450 = getelementptr inbounds float* %tmp12449, i64 1
+ %tmp12451 = getelementptr inbounds float* %tmp12450, i64 1
+ %tmp12452 = getelementptr inbounds float* %tmp12451, i64 1
+ %tmp12453 = getelementptr inbounds float* %tmp12452, i64 1
+ %tmp12454 = getelementptr inbounds float* %tmp12453, i64 1
+ %tmp12455 = getelementptr inbounds float* %tmp12454, i64 1
+ %tmp12456 = getelementptr inbounds float* %tmp12455, i64 1
+ %tmp12457 = getelementptr inbounds float* %tmp12456, i64 1
+ %tmp12458 = getelementptr inbounds float* %tmp12457, i64 1
+ %tmp12459 = getelementptr inbounds float* %tmp12458, i64 1
+ %tmp12460 = getelementptr inbounds float* %tmp12459, i64 1
+ %tmp12461 = getelementptr inbounds float* %tmp12460, i64 1
+ %tmp12462 = getelementptr inbounds float* %tmp12461, i64 1
+ %tmp12463 = getelementptr inbounds float* %tmp12462, i64 1
+ %tmp12464 = getelementptr inbounds float* %tmp12463, i64 1
+ %tmp12465 = getelementptr inbounds float* %tmp12464, i64 1
+ %tmp12466 = getelementptr inbounds float* %tmp12465, i64 1
+ %tmp12467 = getelementptr inbounds float* %tmp12466, i64 1
+ %tmp12468 = getelementptr inbounds float* %tmp12467, i64 1
+ %tmp12469 = getelementptr inbounds float* %tmp12468, i64 1
+ %tmp12470 = getelementptr inbounds float* %tmp12469, i64 1
+ %tmp12471 = getelementptr inbounds float* %tmp12470, i64 1
+ %tmp12472 = getelementptr inbounds float* %tmp12471, i64 1
+ %tmp12473 = getelementptr inbounds float* %tmp12472, i64 1
+ %tmp12474 = getelementptr inbounds float* %tmp12473, i64 1
+ %tmp12475 = getelementptr inbounds float* %tmp12474, i64 1
+ %tmp12476 = getelementptr inbounds float* %tmp12475, i64 1
+ %tmp12477 = getelementptr inbounds float* %tmp12476, i64 1
+ %tmp12478 = getelementptr inbounds float* %tmp12477, i64 1
+ %tmp12479 = getelementptr inbounds float* %tmp12478, i64 1
+ %tmp12480 = getelementptr inbounds float* %tmp12479, i64 1
+ %tmp12481 = getelementptr inbounds float* %tmp12480, i64 1
+ %tmp12482 = getelementptr inbounds float* %tmp12481, i64 1
+ %tmp12483 = getelementptr inbounds float* %tmp12482, i64 1
+ %tmp12484 = getelementptr inbounds float* %tmp12483, i64 1
+ %tmp12485 = getelementptr inbounds float* %tmp12484, i64 1
+ %tmp12486 = getelementptr inbounds float* %tmp12485, i64 1
+ %tmp12487 = getelementptr inbounds float* %tmp12486, i64 1
+ %tmp12488 = getelementptr inbounds float* %tmp12487, i64 1
+ %tmp12489 = getelementptr inbounds float* %tmp12488, i64 1
+ %tmp12490 = getelementptr inbounds float* %tmp12489, i64 1
+ %tmp12491 = getelementptr inbounds float* %tmp12490, i64 1
+ %tmp12492 = getelementptr inbounds float* %tmp12491, i64 1
+ %tmp12493 = getelementptr inbounds float* %tmp12492, i64 1
+ %tmp12494 = getelementptr inbounds float* %tmp12493, i64 1
+ %tmp12495 = getelementptr inbounds float* %tmp12494, i64 1
+ %tmp12496 = getelementptr inbounds float* %tmp12495, i64 1
+ %tmp12497 = getelementptr inbounds float* %tmp12496, i64 1
+ %tmp12498 = getelementptr inbounds float* %tmp12497, i64 1
+ %tmp12499 = getelementptr inbounds float* %tmp12498, i64 1
+ %tmp12500 = getelementptr inbounds float* %tmp12499, i64 1
+ %tmp12501 = getelementptr inbounds float* %tmp12500, i64 1
+ %tmp12502 = getelementptr inbounds float* %tmp12501, i64 1
+ %tmp12503 = getelementptr inbounds float* %tmp12502, i64 1
+ %tmp12504 = getelementptr inbounds float* %tmp12503, i64 1
+ %tmp12505 = getelementptr inbounds float* %tmp12504, i64 1
+ %tmp12506 = getelementptr inbounds float* %tmp12505, i64 1
+ %tmp12507 = getelementptr inbounds float* %tmp12506, i64 1
+ %tmp12508 = getelementptr inbounds float* %tmp12507, i64 1
+ %tmp12509 = getelementptr inbounds float* %tmp12508, i64 1
+ %tmp12510 = getelementptr inbounds float* %tmp12509, i64 1
+ %tmp12511 = getelementptr inbounds float* %tmp12510, i64 1
+ %tmp12512 = getelementptr inbounds float* %tmp12511, i64 1
+ %tmp12513 = getelementptr inbounds float* %tmp12512, i64 1
+ %tmp12514 = getelementptr inbounds float* %tmp12513, i64 1
+ %tmp12515 = getelementptr inbounds float* %tmp12514, i64 1
+ %tmp12516 = getelementptr inbounds float* %tmp12515, i64 1
+ %tmp12517 = getelementptr inbounds float* %tmp12516, i64 1
+ %tmp12518 = getelementptr inbounds float* %tmp12517, i64 1
+ %tmp12519 = getelementptr inbounds float* %tmp12518, i64 1
+ %tmp12520 = getelementptr inbounds float* %tmp12519, i64 1
+ %tmp12521 = getelementptr inbounds float* %tmp12520, i64 1
+ %tmp12522 = getelementptr inbounds float* %tmp12521, i64 1
+ %tmp12523 = getelementptr inbounds float* %tmp12522, i64 1
+ %tmp12524 = getelementptr inbounds float* %tmp12523, i64 1
+ %tmp12525 = getelementptr inbounds float* %tmp12524, i64 1
+ %tmp12526 = getelementptr inbounds float* %tmp12525, i64 1
+ %tmp12527 = getelementptr inbounds float* %tmp12526, i64 1
+ %tmp12528 = getelementptr inbounds float* %tmp12527, i64 1
+ %tmp12529 = getelementptr inbounds float* %tmp12528, i64 1
+ %tmp12530 = getelementptr inbounds float* %tmp12529, i64 1
+ %tmp12531 = getelementptr inbounds float* %tmp12530, i64 1
+ %tmp12532 = getelementptr inbounds float* %tmp12531, i64 1
+ %tmp12533 = getelementptr inbounds float* %tmp12532, i64 1
+ %tmp12534 = getelementptr inbounds float* %tmp12533, i64 1
+ %tmp12535 = getelementptr inbounds float* %tmp12534, i64 1
+ %tmp12536 = getelementptr inbounds float* %tmp12535, i64 1
+ %tmp12537 = getelementptr inbounds float* %tmp12536, i64 1
+ %tmp12538 = getelementptr inbounds float* %tmp12537, i64 1
+ %tmp12539 = getelementptr inbounds float* %tmp12538, i64 1
+ %tmp12540 = getelementptr inbounds float* %tmp12539, i64 1
+ %tmp12541 = getelementptr inbounds float* %tmp12540, i64 1
+ %tmp12542 = getelementptr inbounds float* %tmp12541, i64 1
+ %tmp12543 = getelementptr inbounds float* %tmp12542, i64 1
+ %tmp12544 = getelementptr inbounds float* %tmp12543, i64 1
+ %tmp12545 = getelementptr inbounds float* %tmp12544, i64 1
+ %tmp12546 = getelementptr inbounds float* %tmp12545, i64 1
+ %tmp12547 = getelementptr inbounds float* %tmp12546, i64 1
+ %tmp12548 = getelementptr inbounds float* %tmp12547, i64 1
+ %tmp12549 = getelementptr inbounds float* %tmp12548, i64 1
+ %tmp12550 = getelementptr inbounds float* %tmp12549, i64 1
+ %tmp12551 = getelementptr inbounds float* %tmp12550, i64 1
+ %tmp12552 = getelementptr inbounds float* %tmp12551, i64 1
+ %tmp12553 = getelementptr inbounds float* %tmp12552, i64 1
+ %tmp12554 = getelementptr inbounds float* %tmp12553, i64 1
+ %tmp12555 = getelementptr inbounds float* %tmp12554, i64 1
+ %tmp12556 = getelementptr inbounds float* %tmp12555, i64 1
+ %tmp12557 = getelementptr inbounds float* %tmp12556, i64 1
+ %tmp12558 = getelementptr inbounds float* %tmp12557, i64 1
+ %tmp12559 = getelementptr inbounds float* %tmp12558, i64 1
+ %tmp12560 = getelementptr inbounds float* %tmp12559, i64 1
+ %tmp12561 = getelementptr inbounds float* %tmp12560, i64 1
+ %tmp12562 = getelementptr inbounds float* %tmp12561, i64 1
+ %tmp12563 = getelementptr inbounds float* %tmp12562, i64 1
+ %tmp12564 = getelementptr inbounds float* %tmp12563, i64 1
+ %tmp12565 = getelementptr inbounds float* %tmp12564, i64 1
+ %tmp12566 = getelementptr inbounds float* %tmp12565, i64 1
+ %tmp12567 = getelementptr inbounds float* %tmp12566, i64 1
+ %tmp12568 = getelementptr inbounds float* %tmp12567, i64 1
+ %tmp12569 = getelementptr inbounds float* %tmp12568, i64 1
+ %tmp12570 = getelementptr inbounds float* %tmp12569, i64 1
+ %tmp12571 = getelementptr inbounds float* %tmp12570, i64 1
+ %tmp12572 = getelementptr inbounds float* %tmp12571, i64 1
+ %tmp12573 = getelementptr inbounds float* %tmp12572, i64 1
+ %tmp12574 = getelementptr inbounds float* %tmp12573, i64 1
+ %tmp12575 = getelementptr inbounds float* %tmp12574, i64 1
+ %tmp12576 = getelementptr inbounds float* %tmp12575, i64 1
+ %tmp12577 = getelementptr inbounds float* %tmp12576, i64 1
+ %tmp12578 = getelementptr inbounds float* %tmp12577, i64 1
+ %tmp12579 = getelementptr inbounds float* %tmp12578, i64 1
+ %tmp12580 = getelementptr inbounds float* %tmp12579, i64 1
+ %tmp12581 = getelementptr inbounds float* %tmp12580, i64 1
+ %tmp12582 = getelementptr inbounds float* %tmp12581, i64 1
+ %tmp12583 = getelementptr inbounds float* %tmp12582, i64 1
+ %tmp12584 = getelementptr inbounds float* %tmp12583, i64 1
+ %tmp12585 = getelementptr inbounds float* %tmp12584, i64 1
+ %tmp12586 = getelementptr inbounds float* %tmp12585, i64 1
+ %tmp12587 = getelementptr inbounds float* %tmp12586, i64 1
+ %tmp12588 = getelementptr inbounds float* %tmp12587, i64 1
+ %tmp12589 = getelementptr inbounds float* %tmp12588, i64 1
+ %tmp12590 = getelementptr inbounds float* %tmp12589, i64 1
+ %tmp12591 = getelementptr inbounds float* %tmp12590, i64 1
+ %tmp12592 = getelementptr inbounds float* %tmp12591, i64 1
+ %tmp12593 = getelementptr inbounds float* %tmp12592, i64 1
+ %tmp12594 = getelementptr inbounds float* %tmp12593, i64 1
+ %tmp12595 = getelementptr inbounds float* %tmp12594, i64 1
+ %tmp12596 = getelementptr inbounds float* %tmp12595, i64 1
+ %tmp12597 = getelementptr inbounds float* %tmp12596, i64 1
+ %tmp12598 = getelementptr inbounds float* %tmp12597, i64 1
+ %tmp12599 = getelementptr inbounds float* %tmp12598, i64 1
+ %tmp12600 = getelementptr inbounds float* %tmp12599, i64 1
+ %tmp12601 = getelementptr inbounds float* %tmp12600, i64 1
+ %tmp12602 = getelementptr inbounds float* %tmp12601, i64 1
+ %tmp12603 = getelementptr inbounds float* %tmp12602, i64 1
+ %tmp12604 = getelementptr inbounds float* %tmp12603, i64 1
+ %tmp12605 = getelementptr inbounds float* %tmp12604, i64 1
+ %tmp12606 = getelementptr inbounds float* %tmp12605, i64 1
+ %tmp12607 = getelementptr inbounds float* %tmp12606, i64 1
+ %tmp12608 = getelementptr inbounds float* %tmp12607, i64 1
+ %tmp12609 = getelementptr inbounds float* %tmp12608, i64 1
+ %tmp12610 = getelementptr inbounds float* %tmp12609, i64 1
+ %tmp12611 = getelementptr inbounds float* %tmp12610, i64 1
+ %tmp12612 = getelementptr inbounds float* %tmp12611, i64 1
+ %tmp12613 = getelementptr inbounds float* %tmp12612, i64 1
+ %tmp12614 = getelementptr inbounds float* %tmp12613, i64 1
+ %tmp12615 = getelementptr inbounds float* %tmp12614, i64 1
+ %tmp12616 = getelementptr inbounds float* %tmp12615, i64 1
+ %tmp12617 = getelementptr inbounds float* %tmp12616, i64 1
+ %tmp12618 = getelementptr inbounds float* %tmp12617, i64 1
+ %tmp12619 = getelementptr inbounds float* %tmp12618, i64 1
+ %tmp12620 = getelementptr inbounds float* %tmp12619, i64 1
+ %tmp12621 = getelementptr inbounds float* %tmp12620, i64 1
+ %tmp12622 = getelementptr inbounds float* %tmp12621, i64 1
+ %tmp12623 = getelementptr inbounds float* %tmp12622, i64 1
+ %tmp12624 = getelementptr inbounds float* %tmp12623, i64 1
+ %tmp12625 = getelementptr inbounds float* %tmp12624, i64 1
+ %tmp12626 = getelementptr inbounds float* %tmp12625, i64 1
+ %tmp12627 = getelementptr inbounds float* %tmp12626, i64 1
+ %tmp12628 = getelementptr inbounds float* %tmp12627, i64 1
+ %tmp12629 = getelementptr inbounds float* %tmp12628, i64 1
+ %tmp12630 = getelementptr inbounds float* %tmp12629, i64 1
+ %tmp12631 = getelementptr inbounds float* %tmp12630, i64 1
+ %tmp12632 = getelementptr inbounds float* %tmp12631, i64 1
+ %tmp12633 = getelementptr inbounds float* %tmp12632, i64 1
+ %tmp12634 = getelementptr inbounds float* %tmp12633, i64 1
+ %tmp12635 = getelementptr inbounds float* %tmp12634, i64 1
+ %tmp12636 = getelementptr inbounds float* %tmp12635, i64 1
+ %tmp12637 = getelementptr inbounds float* %tmp12636, i64 1
+ %tmp12638 = getelementptr inbounds float* %tmp12637, i64 1
+ %tmp12639 = getelementptr inbounds float* %tmp12638, i64 1
+ %tmp12640 = getelementptr inbounds float* %tmp12639, i64 1
+ %tmp12641 = getelementptr inbounds float* %tmp12640, i64 1
+ %tmp12642 = getelementptr inbounds float* %tmp12641, i64 1
+ %tmp12643 = getelementptr inbounds float* %tmp12642, i64 1
+ %tmp12644 = getelementptr inbounds float* %tmp12643, i64 1
+ %tmp12645 = getelementptr inbounds float* %tmp12644, i64 1
+ %tmp12646 = getelementptr inbounds float* %tmp12645, i64 1
+ %tmp12647 = getelementptr inbounds float* %tmp12646, i64 1
+ %tmp12648 = getelementptr inbounds float* %tmp12647, i64 1
+ %tmp12649 = getelementptr inbounds float* %tmp12648, i64 1
+ %tmp12650 = getelementptr inbounds float* %tmp12649, i64 1
+ %tmp12651 = getelementptr inbounds float* %tmp12650, i64 1
+ %tmp12652 = getelementptr inbounds float* %tmp12651, i64 1
+ %tmp12653 = getelementptr inbounds float* %tmp12652, i64 1
+ %tmp12654 = getelementptr inbounds float* %tmp12653, i64 1
+ %tmp12655 = getelementptr inbounds float* %tmp12654, i64 1
+ %tmp12656 = getelementptr inbounds float* %tmp12655, i64 1
+ %tmp12657 = getelementptr inbounds float* %tmp12656, i64 1
+ %tmp12658 = getelementptr inbounds float* %tmp12657, i64 1
+ %tmp12659 = getelementptr inbounds float* %tmp12658, i64 1
+ %tmp12660 = getelementptr inbounds float* %tmp12659, i64 1
+ %tmp12661 = getelementptr inbounds float* %tmp12660, i64 1
+ %tmp12662 = getelementptr inbounds float* %tmp12661, i64 1
+ %tmp12663 = getelementptr inbounds float* %tmp12662, i64 1
+ %tmp12664 = getelementptr inbounds float* %tmp12663, i64 1
+ %tmp12665 = getelementptr inbounds float* %tmp12664, i64 1
+ %tmp12666 = getelementptr inbounds float* %tmp12665, i64 1
+ %tmp12667 = getelementptr inbounds float* %tmp12666, i64 1
+ %tmp12668 = getelementptr inbounds float* %tmp12667, i64 1
+ %tmp12669 = getelementptr inbounds float* %tmp12668, i64 1
+ %tmp12670 = getelementptr inbounds float* %tmp12669, i64 1
+ %tmp12671 = getelementptr inbounds float* %tmp12670, i64 1
+ %tmp12672 = getelementptr inbounds float* %tmp12671, i64 1
+ %tmp12673 = getelementptr inbounds float* %tmp12672, i64 1
+ %tmp12674 = getelementptr inbounds float* %tmp12673, i64 1
+ %tmp12675 = getelementptr inbounds float* %tmp12674, i64 1
+ %tmp12676 = getelementptr inbounds float* %tmp12675, i64 1
+ %tmp12677 = getelementptr inbounds float* %tmp12676, i64 1
+ %tmp12678 = getelementptr inbounds float* %tmp12677, i64 1
+ %tmp12679 = getelementptr inbounds float* %tmp12678, i64 1
+ %tmp12680 = getelementptr inbounds float* %tmp12679, i64 1
+ %tmp12681 = getelementptr inbounds float* %tmp12680, i64 1
+ %tmp12682 = getelementptr inbounds float* %tmp12681, i64 1
+ %tmp12683 = getelementptr inbounds float* %tmp12682, i64 1
+ %tmp12684 = getelementptr inbounds float* %tmp12683, i64 1
+ %tmp12685 = getelementptr inbounds float* %tmp12684, i64 1
+ %tmp12686 = getelementptr inbounds float* %tmp12685, i64 1
+ %tmp12687 = getelementptr inbounds float* %tmp12686, i64 1
+ %tmp12688 = getelementptr inbounds float* %tmp12687, i64 1
+ %tmp12689 = getelementptr inbounds float* %tmp12688, i64 1
+ %tmp12690 = getelementptr inbounds float* %tmp12689, i64 1
+ %tmp12691 = getelementptr inbounds float* %tmp12690, i64 1
+ %tmp12692 = getelementptr inbounds float* %tmp12691, i64 1
+ %tmp12693 = getelementptr inbounds float* %tmp12692, i64 1
+ %tmp12694 = getelementptr inbounds float* %tmp12693, i64 1
+ %tmp12695 = getelementptr inbounds float* %tmp12694, i64 1
+ %tmp12696 = getelementptr inbounds float* %tmp12695, i64 1
+ %tmp12697 = getelementptr inbounds float* %tmp12696, i64 1
+ %tmp12698 = getelementptr inbounds float* %tmp12697, i64 1
+ %tmp12699 = getelementptr inbounds float* %tmp12698, i64 1
+ %tmp12700 = getelementptr inbounds float* %tmp12699, i64 1
+ %tmp12701 = getelementptr inbounds float* %tmp12700, i64 1
+ %tmp12702 = getelementptr inbounds float* %tmp12701, i64 1
+ %tmp12703 = getelementptr inbounds float* %tmp12702, i64 1
+ %tmp12704 = getelementptr inbounds float* %tmp12703, i64 1
+ %tmp12705 = getelementptr inbounds float* %tmp12704, i64 1
+ %tmp12706 = getelementptr inbounds float* %tmp12705, i64 1
+ %tmp12707 = getelementptr inbounds float* %tmp12706, i64 1
+ %tmp12708 = getelementptr inbounds float* %tmp12707, i64 1
+ %tmp12709 = getelementptr inbounds float* %tmp12708, i64 1
+ %tmp12710 = getelementptr inbounds float* %tmp12709, i64 1
+ %tmp12711 = getelementptr inbounds float* %tmp12710, i64 1
+ %tmp12712 = getelementptr inbounds float* %tmp12711, i64 1
+ %tmp12713 = getelementptr inbounds float* %tmp12712, i64 1
+ %tmp12714 = getelementptr inbounds float* %tmp12713, i64 1
+ %tmp12715 = getelementptr inbounds float* %tmp12714, i64 1
+ %tmp12716 = getelementptr inbounds float* %tmp12715, i64 1
+ %tmp12717 = getelementptr inbounds float* %tmp12716, i64 1
+ %tmp12718 = getelementptr inbounds float* %tmp12717, i64 1
+ %tmp12719 = getelementptr inbounds float* %tmp12718, i64 1
+ %tmp12720 = getelementptr inbounds float* %tmp12719, i64 1
+ %tmp12721 = getelementptr inbounds float* %tmp12720, i64 1
+ %tmp12722 = getelementptr inbounds float* %tmp12721, i64 1
+ %tmp12723 = getelementptr inbounds float* %tmp12722, i64 1
+ %tmp12724 = getelementptr inbounds float* %tmp12723, i64 1
+ %tmp12725 = getelementptr inbounds float* %tmp12724, i64 1
+ %tmp12726 = getelementptr inbounds float* %tmp12725, i64 1
+ %tmp12727 = getelementptr inbounds float* %tmp12726, i64 1
+ %tmp12728 = getelementptr inbounds float* %tmp12727, i64 1
+ %tmp12729 = getelementptr inbounds float* %tmp12728, i64 1
+ %tmp12730 = getelementptr inbounds float* %tmp12729, i64 1
+ %tmp12731 = getelementptr inbounds float* %tmp12730, i64 1
+ %tmp12732 = getelementptr inbounds float* %tmp12731, i64 1
+ %tmp12733 = getelementptr inbounds float* %tmp12732, i64 1
+ %tmp12734 = getelementptr inbounds float* %tmp12733, i64 1
+ %tmp12735 = getelementptr inbounds float* %tmp12734, i64 1
+ %tmp12736 = getelementptr inbounds float* %tmp12735, i64 1
+ %tmp12737 = getelementptr inbounds float* %tmp12736, i64 1
+ %tmp12738 = getelementptr inbounds float* %tmp12737, i64 1
+ %tmp12739 = getelementptr inbounds float* %tmp12738, i64 1
+ %tmp12740 = getelementptr inbounds float* %tmp12739, i64 1
+ %tmp12741 = getelementptr inbounds float* %tmp12740, i64 1
+ %tmp12742 = getelementptr inbounds float* %tmp12741, i64 1
+ %tmp12743 = getelementptr inbounds float* %tmp12742, i64 1
+ %tmp12744 = getelementptr inbounds float* %tmp12743, i64 1
+ %tmp12745 = getelementptr inbounds float* %tmp12744, i64 1
+ %tmp12746 = getelementptr inbounds float* %tmp12745, i64 1
+ %tmp12747 = getelementptr inbounds float* %tmp12746, i64 1
+ %tmp12748 = getelementptr inbounds float* %tmp12747, i64 1
+ %tmp12749 = getelementptr inbounds float* %tmp12748, i64 1
+ %tmp12750 = getelementptr inbounds float* %tmp12749, i64 1
+ %tmp12751 = getelementptr inbounds float* %tmp12750, i64 1
+ %tmp12752 = getelementptr inbounds float* %tmp12751, i64 1
+ %tmp12753 = getelementptr inbounds float* %tmp12752, i64 1
+ %tmp12754 = getelementptr inbounds float* %tmp12753, i64 1
+ %tmp12755 = getelementptr inbounds float* %tmp12754, i64 1
+ %tmp12756 = getelementptr inbounds float* %tmp12755, i64 1
+ %tmp12757 = getelementptr inbounds float* %tmp12756, i64 1
+ %tmp12758 = getelementptr inbounds float* %tmp12757, i64 1
+ %tmp12759 = getelementptr inbounds float* %tmp12758, i64 1
+ %tmp12760 = getelementptr inbounds float* %tmp12759, i64 1
+ %tmp12761 = getelementptr inbounds float* %tmp12760, i64 1
+ %tmp12762 = getelementptr inbounds float* %tmp12761, i64 1
+ %tmp12763 = getelementptr inbounds float* %tmp12762, i64 1
+ %tmp12764 = getelementptr inbounds float* %tmp12763, i64 1
+ %tmp12765 = getelementptr inbounds float* %tmp12764, i64 1
+ %tmp12766 = getelementptr inbounds float* %tmp12765, i64 1
+ %tmp12767 = getelementptr inbounds float* %tmp12766, i64 1
+ %tmp12768 = getelementptr inbounds float* %tmp12767, i64 1
+ %tmp12769 = getelementptr inbounds float* %tmp12768, i64 1
+ %tmp12770 = getelementptr inbounds float* %tmp12769, i64 1
+ %tmp12771 = getelementptr inbounds float* %tmp12770, i64 1
+ %tmp12772 = getelementptr inbounds float* %tmp12771, i64 1
+ %tmp12773 = getelementptr inbounds float* %tmp12772, i64 1
+ %tmp12774 = getelementptr inbounds float* %tmp12773, i64 1
+ %tmp12775 = getelementptr inbounds float* %tmp12774, i64 1
+ %tmp12776 = getelementptr inbounds float* %tmp12775, i64 1
+ %tmp12777 = getelementptr inbounds float* %tmp12776, i64 1
+ %tmp12778 = getelementptr inbounds float* %tmp12777, i64 1
+ %tmp12779 = getelementptr inbounds float* %tmp12778, i64 1
+ %tmp12780 = getelementptr inbounds float* %tmp12779, i64 1
+ %tmp12781 = getelementptr inbounds float* %tmp12780, i64 1
+ %tmp12782 = getelementptr inbounds float* %tmp12781, i64 1
+ %tmp12783 = getelementptr inbounds float* %tmp12782, i64 1
+ %tmp12784 = getelementptr inbounds float* %tmp12783, i64 1
+ %tmp12785 = getelementptr inbounds float* %tmp12784, i64 1
+ %tmp12786 = getelementptr inbounds float* %tmp12785, i64 1
+ %tmp12787 = getelementptr inbounds float* %tmp12786, i64 1
+ %tmp12788 = getelementptr inbounds float* %tmp12787, i64 1
+ %tmp12789 = getelementptr inbounds float* %tmp12788, i64 1
+ %tmp12790 = getelementptr inbounds float* %tmp12789, i64 1
+ %tmp12791 = getelementptr inbounds float* %tmp12790, i64 1
+ %tmp12792 = getelementptr inbounds float* %tmp12791, i64 1
+ %tmp12793 = getelementptr inbounds float* %tmp12792, i64 1
+ %tmp12794 = getelementptr inbounds float* %tmp12793, i64 1
+ %tmp12795 = getelementptr inbounds float* %tmp12794, i64 1
+ %tmp12796 = getelementptr inbounds float* %tmp12795, i64 1
+ %tmp12797 = getelementptr inbounds float* %tmp12796, i64 1
+ %tmp12798 = getelementptr inbounds float* %tmp12797, i64 1
+ %tmp12799 = getelementptr inbounds float* %tmp12798, i64 1
+ %tmp12800 = getelementptr inbounds float* %tmp12799, i64 1
+ %tmp12801 = getelementptr inbounds float* %tmp12800, i64 1
+ %tmp12802 = getelementptr inbounds float* %tmp12801, i64 1
+ %tmp12803 = getelementptr inbounds float* %tmp12802, i64 1
+ %tmp12804 = getelementptr inbounds float* %tmp12803, i64 1
+ %tmp12805 = getelementptr inbounds float* %tmp12804, i64 1
+ %tmp12806 = getelementptr inbounds float* %tmp12805, i64 1
+ %tmp12807 = getelementptr inbounds float* %tmp12806, i64 1
+ %tmp12808 = getelementptr inbounds float* %tmp12807, i64 1
+ %tmp12809 = getelementptr inbounds float* %tmp12808, i64 1
+ %tmp12810 = getelementptr inbounds float* %tmp12809, i64 1
+ %tmp12811 = getelementptr inbounds float* %tmp12810, i64 1
+ %tmp12812 = getelementptr inbounds float* %tmp12811, i64 1
+ %tmp12813 = getelementptr inbounds float* %tmp12812, i64 1
+ %tmp12814 = getelementptr inbounds float* %tmp12813, i64 1
+ %tmp12815 = getelementptr inbounds float* %tmp12814, i64 1
+ %tmp12816 = getelementptr inbounds float* %tmp12815, i64 1
+ %tmp12817 = getelementptr inbounds float* %tmp12816, i64 1
+ %tmp12818 = getelementptr inbounds float* %tmp12817, i64 1
+ %tmp12819 = getelementptr inbounds float* %tmp12818, i64 1
+ %tmp12820 = getelementptr inbounds float* %tmp12819, i64 1
+ %tmp12821 = getelementptr inbounds float* %tmp12820, i64 1
+ %tmp12822 = getelementptr inbounds float* %tmp12821, i64 1
+ %tmp12823 = getelementptr inbounds float* %tmp12822, i64 1
+ %tmp12824 = getelementptr inbounds float* %tmp12823, i64 1
+ %tmp12825 = getelementptr inbounds float* %tmp12824, i64 1
+ %tmp12826 = getelementptr inbounds float* %tmp12825, i64 1
+ %tmp12827 = getelementptr inbounds float* %tmp12826, i64 1
+ %tmp12828 = getelementptr inbounds float* %tmp12827, i64 1
+ %tmp12829 = getelementptr inbounds float* %tmp12828, i64 1
+ %tmp12830 = getelementptr inbounds float* %tmp12829, i64 1
+ %tmp12831 = getelementptr inbounds float* %tmp12830, i64 1
+ %tmp12832 = getelementptr inbounds float* %tmp12831, i64 1
+ %tmp12833 = getelementptr inbounds float* %tmp12832, i64 1
+ %tmp12834 = getelementptr inbounds float* %tmp12833, i64 1
+ %tmp12835 = getelementptr inbounds float* %tmp12834, i64 1
+ %tmp12836 = getelementptr inbounds float* %tmp12835, i64 1
+ %tmp12837 = getelementptr inbounds float* %tmp12836, i64 1
+ %tmp12838 = getelementptr inbounds float* %tmp12837, i64 1
+ %tmp12839 = getelementptr inbounds float* %tmp12838, i64 1
+ %tmp12840 = getelementptr inbounds float* %tmp12839, i64 1
+ %tmp12841 = getelementptr inbounds float* %tmp12840, i64 1
+ %tmp12842 = getelementptr inbounds float* %tmp12841, i64 1
+ %tmp12843 = getelementptr inbounds float* %tmp12842, i64 1
+ %tmp12844 = getelementptr inbounds float* %tmp12843, i64 1
+ %tmp12845 = getelementptr inbounds float* %tmp12844, i64 1
+ %tmp12846 = getelementptr inbounds float* %tmp12845, i64 1
+ %tmp12847 = getelementptr inbounds float* %tmp12846, i64 1
+ %tmp12848 = getelementptr inbounds float* %tmp12847, i64 1
+ %tmp12849 = getelementptr inbounds float* %tmp12848, i64 1
+ %tmp12850 = getelementptr inbounds float* %tmp12849, i64 1
+ %tmp12851 = getelementptr inbounds float* %tmp12850, i64 1
+ %tmp12852 = getelementptr inbounds float* %tmp12851, i64 1
+ %tmp12853 = getelementptr inbounds float* %tmp12852, i64 1
+ %tmp12854 = getelementptr inbounds float* %tmp12853, i64 1
+ %tmp12855 = getelementptr inbounds float* %tmp12854, i64 1
+ %tmp12856 = getelementptr inbounds float* %tmp12855, i64 1
+ %tmp12857 = getelementptr inbounds float* %tmp12856, i64 1
+ %tmp12858 = getelementptr inbounds float* %tmp12857, i64 1
+ %tmp12859 = getelementptr inbounds float* %tmp12858, i64 1
+ %tmp12860 = getelementptr inbounds float* %tmp12859, i64 1
+ %tmp12861 = getelementptr inbounds float* %tmp12860, i64 1
+ %tmp12862 = getelementptr inbounds float* %tmp12861, i64 1
+ %tmp12863 = getelementptr inbounds float* %tmp12862, i64 1
+ %tmp12864 = getelementptr inbounds float* %tmp12863, i64 1
+ %tmp12865 = getelementptr inbounds float* %tmp12864, i64 1
+ %tmp12866 = getelementptr inbounds float* %tmp12865, i64 1
+ %tmp12867 = getelementptr inbounds float* %tmp12866, i64 1
+ %tmp12868 = getelementptr inbounds float* %tmp12867, i64 1
+ %tmp12869 = getelementptr inbounds float* %tmp12868, i64 1
+ %tmp12870 = getelementptr inbounds float* %tmp12869, i64 1
+ %tmp12871 = getelementptr inbounds float* %tmp12870, i64 1
+ %tmp12872 = getelementptr inbounds float* %tmp12871, i64 1
+ %tmp12873 = getelementptr inbounds float* %tmp12872, i64 1
+ %tmp12874 = getelementptr inbounds float* %tmp12873, i64 1
+ %tmp12875 = getelementptr inbounds float* %tmp12874, i64 1
+ %tmp12876 = getelementptr inbounds float* %tmp12875, i64 1
+ %tmp12877 = getelementptr inbounds float* %tmp12876, i64 1
+ %tmp12878 = getelementptr inbounds float* %tmp12877, i64 1
+ %tmp12879 = getelementptr inbounds float* %tmp12878, i64 1
+ %tmp12880 = getelementptr inbounds float* %tmp12879, i64 1
+ %tmp12881 = getelementptr inbounds float* %tmp12880, i64 1
+ %tmp12882 = getelementptr inbounds float* %tmp12881, i64 1
+ %tmp12883 = getelementptr inbounds float* %tmp12882, i64 1
+ %tmp12884 = getelementptr inbounds float* %tmp12883, i64 1
+ %tmp12885 = getelementptr inbounds float* %tmp12884, i64 1
+ %tmp12886 = getelementptr inbounds float* %tmp12885, i64 1
+ %tmp12887 = getelementptr inbounds float* %tmp12886, i64 1
+ %tmp12888 = getelementptr inbounds float* %tmp12887, i64 1
+ %tmp12889 = getelementptr inbounds float* %tmp12888, i64 1
+ %tmp12890 = getelementptr inbounds float* %tmp12889, i64 1
+ %tmp12891 = getelementptr inbounds float* %tmp12890, i64 1
+ %tmp12892 = getelementptr inbounds float* %tmp12891, i64 1
+ %tmp12893 = getelementptr inbounds float* %tmp12892, i64 1
+ %tmp12894 = getelementptr inbounds float* %tmp12893, i64 1
+ %tmp12895 = getelementptr inbounds float* %tmp12894, i64 1
+ %tmp12896 = getelementptr inbounds float* %tmp12895, i64 1
+ %tmp12897 = getelementptr inbounds float* %tmp12896, i64 1
+ %tmp12898 = getelementptr inbounds float* %tmp12897, i64 1
+ %tmp12899 = getelementptr inbounds float* %tmp12898, i64 1
+ %tmp12900 = getelementptr inbounds float* %tmp12899, i64 1
+ %tmp12901 = getelementptr inbounds float* %tmp12900, i64 1
+ %tmp12902 = getelementptr inbounds float* %tmp12901, i64 1
+ %tmp12903 = getelementptr inbounds float* %tmp12902, i64 1
+ %tmp12904 = getelementptr inbounds float* %tmp12903, i64 1
+ %tmp12905 = getelementptr inbounds float* %tmp12904, i64 1
+ %tmp12906 = getelementptr inbounds float* %tmp12905, i64 1
+ %tmp12907 = getelementptr inbounds float* %tmp12906, i64 1
+ %tmp12908 = getelementptr inbounds float* %tmp12907, i64 1
+ %tmp12909 = getelementptr inbounds float* %tmp12908, i64 1
+ %tmp12910 = getelementptr inbounds float* %tmp12909, i64 1
+ %tmp12911 = getelementptr inbounds float* %tmp12910, i64 1
+ %tmp12912 = getelementptr inbounds float* %tmp12911, i64 1
+ %tmp12913 = getelementptr inbounds float* %tmp12912, i64 1
+ %tmp12914 = getelementptr inbounds float* %tmp12913, i64 1
+ %tmp12915 = getelementptr inbounds float* %tmp12914, i64 1
+ %tmp12916 = getelementptr inbounds float* %tmp12915, i64 1
+ %tmp12917 = getelementptr inbounds float* %tmp12916, i64 1
+ %tmp12918 = getelementptr inbounds float* %tmp12917, i64 1
+ %tmp12919 = getelementptr inbounds float* %tmp12918, i64 1
+ %tmp12920 = getelementptr inbounds float* %tmp12919, i64 1
+ %tmp12921 = getelementptr inbounds float* %tmp12920, i64 1
+ %tmp12922 = getelementptr inbounds float* %tmp12921, i64 1
+ %tmp12923 = getelementptr inbounds float* %tmp12922, i64 1
+ %tmp12924 = getelementptr inbounds float* %tmp12923, i64 1
+ %tmp12925 = getelementptr inbounds float* %tmp12924, i64 1
+ %tmp12926 = getelementptr inbounds float* %tmp12925, i64 1
+ %tmp12927 = getelementptr inbounds float* %tmp12926, i64 1
+ %tmp12928 = getelementptr inbounds float* %tmp12927, i64 1
+ %tmp12929 = getelementptr inbounds float* %tmp12928, i64 1
+ %tmp12930 = getelementptr inbounds float* %tmp12929, i64 1
+ %tmp12931 = getelementptr inbounds float* %tmp12930, i64 1
+ %tmp12932 = getelementptr inbounds float* %tmp12931, i64 1
+ %tmp12933 = getelementptr inbounds float* %tmp12932, i64 1
+ %tmp12934 = getelementptr inbounds float* %tmp12933, i64 1
+ %tmp12935 = getelementptr inbounds float* %tmp12934, i64 1
+ %tmp12936 = getelementptr inbounds float* %tmp12935, i64 1
+ %tmp12937 = getelementptr inbounds float* %tmp12936, i64 1
+ %tmp12938 = getelementptr inbounds float* %tmp12937, i64 1
+ %tmp12939 = getelementptr inbounds float* %tmp12938, i64 1
+ %tmp12940 = getelementptr inbounds float* %tmp12939, i64 1
+ %tmp12941 = getelementptr inbounds float* %tmp12940, i64 1
+ %tmp12942 = getelementptr inbounds float* %tmp12941, i64 1
+ %tmp12943 = getelementptr inbounds float* %tmp12942, i64 1
+ %tmp12944 = getelementptr inbounds float* %tmp12943, i64 1
+ %tmp12945 = getelementptr inbounds float* %tmp12944, i64 1
+ %tmp12946 = getelementptr inbounds float* %tmp12945, i64 1
+ %tmp12947 = getelementptr inbounds float* %tmp12946, i64 1
+ %tmp12948 = getelementptr inbounds float* %tmp12947, i64 1
+ %tmp12949 = getelementptr inbounds float* %tmp12948, i64 1
+ %tmp12950 = getelementptr inbounds float* %tmp12949, i64 1
+ %tmp12951 = getelementptr inbounds float* %tmp12950, i64 1
+ %tmp12952 = getelementptr inbounds float* %tmp12951, i64 1
+ %tmp12953 = getelementptr inbounds float* %tmp12952, i64 1
+ %tmp12954 = getelementptr inbounds float* %tmp12953, i64 1
+ %tmp12955 = getelementptr inbounds float* %tmp12954, i64 1
+ %tmp12956 = getelementptr inbounds float* %tmp12955, i64 1
+ %tmp12957 = getelementptr inbounds float* %tmp12956, i64 1
+ %tmp12958 = getelementptr inbounds float* %tmp12957, i64 1
+ %tmp12959 = getelementptr inbounds float* %tmp12958, i64 1
+ %tmp12960 = getelementptr inbounds float* %tmp12959, i64 1
+ %tmp12961 = getelementptr inbounds float* %tmp12960, i64 1
+ %tmp12962 = getelementptr inbounds float* %tmp12961, i64 1
+ %tmp12963 = getelementptr inbounds float* %tmp12962, i64 1
+ %tmp12964 = getelementptr inbounds float* %tmp12963, i64 1
+ %tmp12965 = getelementptr inbounds float* %tmp12964, i64 1
+ %tmp12966 = getelementptr inbounds float* %tmp12965, i64 1
+ %tmp12967 = getelementptr inbounds float* %tmp12966, i64 1
+ %tmp12968 = getelementptr inbounds float* %tmp12967, i64 1
+ %tmp12969 = getelementptr inbounds float* %tmp12968, i64 1
+ %tmp12970 = getelementptr inbounds float* %tmp12969, i64 1
+ %tmp12971 = getelementptr inbounds float* %tmp12970, i64 1
+ %tmp12972 = getelementptr inbounds float* %tmp12971, i64 1
+ %tmp12973 = getelementptr inbounds float* %tmp12972, i64 1
+ %tmp12974 = getelementptr inbounds float* %tmp12973, i64 1
+ %tmp12975 = getelementptr inbounds float* %tmp12974, i64 1
+ %tmp12976 = getelementptr inbounds float* %tmp12975, i64 1
+ %tmp12977 = getelementptr inbounds float* %tmp12976, i64 1
+ %tmp12978 = getelementptr inbounds float* %tmp12977, i64 1
+ %tmp12979 = getelementptr inbounds float* %tmp12978, i64 1
+ %tmp12980 = getelementptr inbounds float* %tmp12979, i64 1
+ %tmp12981 = getelementptr inbounds float* %tmp12980, i64 1
+ %tmp12982 = getelementptr inbounds float* %tmp12981, i64 1
+ %tmp12983 = getelementptr inbounds float* %tmp12982, i64 1
+ %tmp12984 = getelementptr inbounds float* %tmp12983, i64 1
+ %tmp12985 = getelementptr inbounds float* %tmp12984, i64 1
+ %tmp12986 = getelementptr inbounds float* %tmp12985, i64 1
+ %tmp12987 = getelementptr inbounds float* %tmp12986, i64 1
+ %tmp12988 = getelementptr inbounds float* %tmp12987, i64 1
+ %tmp12989 = getelementptr inbounds float* %tmp12988, i64 1
+ %tmp12990 = getelementptr inbounds float* %tmp12989, i64 1
+ %tmp12991 = getelementptr inbounds float* %tmp12990, i64 1
+ %tmp12992 = getelementptr inbounds float* %tmp12991, i64 1
+ %tmp12993 = getelementptr inbounds float* %tmp12992, i64 1
+ %tmp12994 = getelementptr inbounds float* %tmp12993, i64 1
+ %tmp12995 = getelementptr inbounds float* %tmp12994, i64 1
+ %tmp12996 = getelementptr inbounds float* %tmp12995, i64 1
+ %tmp12997 = getelementptr inbounds float* %tmp12996, i64 1
+ %tmp12998 = getelementptr inbounds float* %tmp12997, i64 1
+ %tmp12999 = getelementptr inbounds float* %tmp12998, i64 1
+ %tmp13000 = getelementptr inbounds float* %tmp12999, i64 1
+ %tmp13001 = getelementptr inbounds float* %tmp13000, i64 1
+ %tmp13002 = getelementptr inbounds float* %tmp13001, i64 1
+ %tmp13003 = getelementptr inbounds float* %tmp13002, i64 1
+ %tmp13004 = getelementptr inbounds float* %tmp13003, i64 1
+ %tmp13005 = getelementptr inbounds float* %tmp13004, i64 1
+ %tmp13006 = getelementptr inbounds float* %tmp13005, i64 1
+ %tmp13007 = getelementptr inbounds float* %tmp13006, i64 1
+ %tmp13008 = getelementptr inbounds float* %tmp13007, i64 1
+ %tmp13009 = getelementptr inbounds float* %tmp13008, i64 1
+ %tmp13010 = getelementptr inbounds float* %tmp13009, i64 1
+ %tmp13011 = getelementptr inbounds float* %tmp13010, i64 1
+ %tmp13012 = getelementptr inbounds float* %tmp13011, i64 1
+ %tmp13013 = getelementptr inbounds float* %tmp13012, i64 1
+ %tmp13014 = getelementptr inbounds float* %tmp13013, i64 1
+ %tmp13015 = getelementptr inbounds float* %tmp13014, i64 1
+ %tmp13016 = getelementptr inbounds float* %tmp13015, i64 1
+ %tmp13017 = getelementptr inbounds float* %tmp13016, i64 1
+ %tmp13018 = getelementptr inbounds float* %tmp13017, i64 1
+ %tmp13019 = getelementptr inbounds float* %tmp13018, i64 1
+ %tmp13020 = getelementptr inbounds float* %tmp13019, i64 1
+ %tmp13021 = getelementptr inbounds float* %tmp13020, i64 1
+ %tmp13022 = getelementptr inbounds float* %tmp13021, i64 1
+ %tmp13023 = getelementptr inbounds float* %tmp13022, i64 1
+ %tmp13024 = getelementptr inbounds float* %tmp13023, i64 1
+ %tmp13025 = getelementptr inbounds float* %tmp13024, i64 1
+ %tmp13026 = getelementptr inbounds float* %tmp13025, i64 1
+ %tmp13027 = getelementptr inbounds float* %tmp13026, i64 1
+ %tmp13028 = getelementptr inbounds float* %tmp13027, i64 1
+ %tmp13029 = getelementptr inbounds float* %tmp13028, i64 1
+ %tmp13030 = getelementptr inbounds float* %tmp13029, i64 1
+ %tmp13031 = getelementptr inbounds float* %tmp13030, i64 1
+ %tmp13032 = getelementptr inbounds float* %tmp13031, i64 1
+ %tmp13033 = getelementptr inbounds float* %tmp13032, i64 1
+ %tmp13034 = getelementptr inbounds float* %tmp13033, i64 1
+ %tmp13035 = getelementptr inbounds float* %tmp13034, i64 1
+ %tmp13036 = getelementptr inbounds float* %tmp13035, i64 1
+ %tmp13037 = getelementptr inbounds float* %tmp13036, i64 1
+ %tmp13038 = getelementptr inbounds float* %tmp13037, i64 1
+ %tmp13039 = getelementptr inbounds float* %tmp13038, i64 1
+ %tmp13040 = getelementptr inbounds float* %tmp13039, i64 1
+ %tmp13041 = getelementptr inbounds float* %tmp13040, i64 1
+ %tmp13042 = getelementptr inbounds float* %tmp13041, i64 1
+ %tmp13043 = getelementptr inbounds float* %tmp13042, i64 1
+ %tmp13044 = getelementptr inbounds float* %tmp13043, i64 1
+ %tmp13045 = getelementptr inbounds float* %tmp13044, i64 1
+ %tmp13046 = getelementptr inbounds float* %tmp13045, i64 1
+ %tmp13047 = getelementptr inbounds float* %tmp13046, i64 1
+ %tmp13048 = getelementptr inbounds float* %tmp13047, i64 1
+ %tmp13049 = getelementptr inbounds float* %tmp13048, i64 1
+ %tmp13050 = getelementptr inbounds float* %tmp13049, i64 1
+ %tmp13051 = getelementptr inbounds float* %tmp13050, i64 1
+ %tmp13052 = getelementptr inbounds float* %tmp13051, i64 1
+ %tmp13053 = getelementptr inbounds float* %tmp13052, i64 1
+ %tmp13054 = getelementptr inbounds float* %tmp13053, i64 1
+ %tmp13055 = getelementptr inbounds float* %tmp13054, i64 1
+ %tmp13056 = getelementptr inbounds float* %tmp13055, i64 1
+ %tmp13057 = getelementptr inbounds float* %tmp13056, i64 1
+ %tmp13058 = getelementptr inbounds float* %tmp13057, i64 1
+ %tmp13059 = getelementptr inbounds float* %tmp13058, i64 1
+ %tmp13060 = getelementptr inbounds float* %tmp13059, i64 1
+ %tmp13061 = getelementptr inbounds float* %tmp13060, i64 1
+ %tmp13062 = getelementptr inbounds float* %tmp13061, i64 1
+ %tmp13063 = getelementptr inbounds float* %tmp13062, i64 1
+ %tmp13064 = getelementptr inbounds float* %tmp13063, i64 1
+ %tmp13065 = getelementptr inbounds float* %tmp13064, i64 1
+ %tmp13066 = getelementptr inbounds float* %tmp13065, i64 1
+ %tmp13067 = getelementptr inbounds float* %tmp13066, i64 1
+ %tmp13068 = getelementptr inbounds float* %tmp13067, i64 1
+ %tmp13069 = getelementptr inbounds float* %tmp13068, i64 1
+ %tmp13070 = getelementptr inbounds float* %tmp13069, i64 1
+ %tmp13071 = getelementptr inbounds float* %tmp13070, i64 1
+ %tmp13072 = getelementptr inbounds float* %tmp13071, i64 1
+ %tmp13073 = getelementptr inbounds float* %tmp13072, i64 1
+ %tmp13074 = getelementptr inbounds float* %tmp13073, i64 1
+ %tmp13075 = getelementptr inbounds float* %tmp13074, i64 1
+ %tmp13076 = getelementptr inbounds float* %tmp13075, i64 1
+ %tmp13077 = getelementptr inbounds float* %tmp13076, i64 1
+ %tmp13078 = getelementptr inbounds float* %tmp13077, i64 1
+ %tmp13079 = getelementptr inbounds float* %tmp13078, i64 1
+ %tmp13080 = getelementptr inbounds float* %tmp13079, i64 1
+ %tmp13081 = getelementptr inbounds float* %tmp13080, i64 1
+ %tmp13082 = getelementptr inbounds float* %tmp13081, i64 1
+ %tmp13083 = getelementptr inbounds float* %tmp13082, i64 1
+ %tmp13084 = getelementptr inbounds float* %tmp13083, i64 1
+ %tmp13085 = getelementptr inbounds float* %tmp13084, i64 1
+ %tmp13086 = getelementptr inbounds float* %tmp13085, i64 1
+ %tmp13087 = getelementptr inbounds float* %tmp13086, i64 1
+ %tmp13088 = getelementptr inbounds float* %tmp13087, i64 1
+ %tmp13089 = getelementptr inbounds float* %tmp13088, i64 1
+ %tmp13090 = getelementptr inbounds float* %tmp13089, i64 1
+ %tmp13091 = getelementptr inbounds float* %tmp13090, i64 1
+ %tmp13092 = getelementptr inbounds float* %tmp13091, i64 1
+ %tmp13093 = getelementptr inbounds float* %tmp13092, i64 1
+ %tmp13094 = getelementptr inbounds float* %tmp13093, i64 1
+ %tmp13095 = getelementptr inbounds float* %tmp13094, i64 1
+ %tmp13096 = getelementptr inbounds float* %tmp13095, i64 1
+ %tmp13097 = getelementptr inbounds float* %tmp13096, i64 1
+ %tmp13098 = getelementptr inbounds float* %tmp13097, i64 1
+ %tmp13099 = getelementptr inbounds float* %tmp13098, i64 1
+ %tmp13100 = getelementptr inbounds float* %tmp13099, i64 1
+ %tmp13101 = getelementptr inbounds float* %tmp13100, i64 1
+ %tmp13102 = getelementptr inbounds float* %tmp13101, i64 1
+ %tmp13103 = getelementptr inbounds float* %tmp13102, i64 1
+ %tmp13104 = getelementptr inbounds float* %tmp13103, i64 1
+ %tmp13105 = getelementptr inbounds float* %tmp13104, i64 1
+ %tmp13106 = getelementptr inbounds float* %tmp13105, i64 1
+ %tmp13107 = getelementptr inbounds float* %tmp13106, i64 1
+ %tmp13108 = getelementptr inbounds float* %tmp13107, i64 1
+ %tmp13109 = getelementptr inbounds float* %tmp13108, i64 1
+ %tmp13110 = getelementptr inbounds float* %tmp13109, i64 1
+ %tmp13111 = getelementptr inbounds float* %tmp13110, i64 1
+ %tmp13112 = getelementptr inbounds float* %tmp13111, i64 1
+ %tmp13113 = getelementptr inbounds float* %tmp13112, i64 1
+ %tmp13114 = getelementptr inbounds float* %tmp13113, i64 1
+ %tmp13115 = getelementptr inbounds float* %tmp13114, i64 1
+ %tmp13116 = getelementptr inbounds float* %tmp13115, i64 1
+ %tmp13117 = getelementptr inbounds float* %tmp13116, i64 1
+ %tmp13118 = getelementptr inbounds float* %tmp13117, i64 1
+ %tmp13119 = getelementptr inbounds float* %tmp13118, i64 1
+ %tmp13120 = getelementptr inbounds float* %tmp13119, i64 1
+ %tmp13121 = getelementptr inbounds float* %tmp13120, i64 1
+ %tmp13122 = getelementptr inbounds float* %tmp13121, i64 1
+ %tmp13123 = getelementptr inbounds float* %tmp13122, i64 1
+ %tmp13124 = getelementptr inbounds float* %tmp13123, i64 1
+ %tmp13125 = getelementptr inbounds float* %tmp13124, i64 1
+ %tmp13126 = getelementptr inbounds float* %tmp13125, i64 1
+ %tmp13127 = getelementptr inbounds float* %tmp13126, i64 1
+ %tmp13128 = getelementptr inbounds float* %tmp13127, i64 1
+ %tmp13129 = getelementptr inbounds float* %tmp13128, i64 1
+ %tmp13130 = getelementptr inbounds float* %tmp13129, i64 1
+ %tmp13131 = getelementptr inbounds float* %tmp13130, i64 1
+ %tmp13132 = getelementptr inbounds float* %tmp13131, i64 1
+ %tmp13133 = getelementptr inbounds float* %tmp13132, i64 1
+ %tmp13134 = getelementptr inbounds float* %tmp13133, i64 1
+ %tmp13135 = getelementptr inbounds float* %tmp13134, i64 1
+ %tmp13136 = getelementptr inbounds float* %tmp13135, i64 1
+ %tmp13137 = getelementptr inbounds float* %tmp13136, i64 1
+ %tmp13138 = getelementptr inbounds float* %tmp13137, i64 1
+ %tmp13139 = getelementptr inbounds float* %tmp13138, i64 1
+ %tmp13140 = getelementptr inbounds float* %tmp13139, i64 1
+ %tmp13141 = getelementptr inbounds float* %tmp13140, i64 1
+ %tmp13142 = getelementptr inbounds float* %tmp13141, i64 1
+ %tmp13143 = getelementptr inbounds float* %tmp13142, i64 1
+ %tmp13144 = getelementptr inbounds float* %tmp13143, i64 1
+ %tmp13145 = getelementptr inbounds float* %tmp13144, i64 1
+ %tmp13146 = getelementptr inbounds float* %tmp13145, i64 1
+ %tmp13147 = getelementptr inbounds float* %tmp13146, i64 1
+ %tmp13148 = getelementptr inbounds float* %tmp13147, i64 1
+ %tmp13149 = getelementptr inbounds float* %tmp13148, i64 1
+ %tmp13150 = getelementptr inbounds float* %tmp13149, i64 1
+ %tmp13151 = getelementptr inbounds float* %tmp13150, i64 1
+ %tmp13152 = getelementptr inbounds float* %tmp13151, i64 1
+ %tmp13153 = getelementptr inbounds float* %tmp13152, i64 1
+ %tmp13154 = getelementptr inbounds float* %tmp13153, i64 1
+ %tmp13155 = getelementptr inbounds float* %tmp13154, i64 1
+ %tmp13156 = getelementptr inbounds float* %tmp13155, i64 1
+ %tmp13157 = getelementptr inbounds float* %tmp13156, i64 1
+ %tmp13158 = getelementptr inbounds float* %tmp13157, i64 1
+ %tmp13159 = getelementptr inbounds float* %tmp13158, i64 1
+ %tmp13160 = getelementptr inbounds float* %tmp13159, i64 1
+ %tmp13161 = getelementptr inbounds float* %tmp13160, i64 1
+ %tmp13162 = getelementptr inbounds float* %tmp13161, i64 1
+ %tmp13163 = getelementptr inbounds float* %tmp13162, i64 1
+ %tmp13164 = getelementptr inbounds float* %tmp13163, i64 1
+ %tmp13165 = getelementptr inbounds float* %tmp13164, i64 1
+ %tmp13166 = getelementptr inbounds float* %tmp13165, i64 1
+ %tmp13167 = getelementptr inbounds float* %tmp13166, i64 1
+ %tmp13168 = getelementptr inbounds float* %tmp13167, i64 1
+ %tmp13169 = getelementptr inbounds float* %tmp13168, i64 1
+ %tmp13170 = getelementptr inbounds float* %tmp13169, i64 1
+ %tmp13171 = getelementptr inbounds float* %tmp13170, i64 1
+ %tmp13172 = getelementptr inbounds float* %tmp13171, i64 1
+ %tmp13173 = getelementptr inbounds float* %tmp13172, i64 1
+ %tmp13174 = getelementptr inbounds float* %tmp13173, i64 1
+ %tmp13175 = getelementptr inbounds float* %tmp13174, i64 1
+ %tmp13176 = getelementptr inbounds float* %tmp13175, i64 1
+ %tmp13177 = getelementptr inbounds float* %tmp13176, i64 1
+ %tmp13178 = getelementptr inbounds float* %tmp13177, i64 1
+ %tmp13179 = getelementptr inbounds float* %tmp13178, i64 1
+ %tmp13180 = getelementptr inbounds float* %tmp13179, i64 1
+ %tmp13181 = getelementptr inbounds float* %tmp13180, i64 1
+ %tmp13182 = getelementptr inbounds float* %tmp13181, i64 1
+ %tmp13183 = getelementptr inbounds float* %tmp13182, i64 1
+ %tmp13184 = getelementptr inbounds float* %tmp13183, i64 1
+ %tmp13185 = getelementptr inbounds float* %tmp13184, i64 1
+ %tmp13186 = getelementptr inbounds float* %tmp13185, i64 1
+ %tmp13187 = getelementptr inbounds float* %tmp13186, i64 1
+ %tmp13188 = getelementptr inbounds float* %tmp13187, i64 1
+ %tmp13189 = getelementptr inbounds float* %tmp13188, i64 1
+ %tmp13190 = getelementptr inbounds float* %tmp13189, i64 1
+ %tmp13191 = getelementptr inbounds float* %tmp13190, i64 1
+ %tmp13192 = getelementptr inbounds float* %tmp13191, i64 1
+ %tmp13193 = getelementptr inbounds float* %tmp13192, i64 1
+ %tmp13194 = getelementptr inbounds float* %tmp13193, i64 1
+ %tmp13195 = getelementptr inbounds float* %tmp13194, i64 1
+ %tmp13196 = getelementptr inbounds float* %tmp13195, i64 1
+ %tmp13197 = getelementptr inbounds float* %tmp13196, i64 1
+ %tmp13198 = getelementptr inbounds float* %tmp13197, i64 1
+ %tmp13199 = getelementptr inbounds float* %tmp13198, i64 1
+ %tmp13200 = getelementptr inbounds float* %tmp13199, i64 1
+ %tmp13201 = getelementptr inbounds float* %tmp13200, i64 1
+ %tmp13202 = getelementptr inbounds float* %tmp13201, i64 1
+ %tmp13203 = getelementptr inbounds float* %tmp13202, i64 1
+ %tmp13204 = getelementptr inbounds float* %tmp13203, i64 1
+ %tmp13205 = getelementptr inbounds float* %tmp13204, i64 1
+ %tmp13206 = getelementptr inbounds float* %tmp13205, i64 1
+ %tmp13207 = getelementptr inbounds float* %tmp13206, i64 1
+ %tmp13208 = getelementptr inbounds float* %tmp13207, i64 1
+ %tmp13209 = getelementptr inbounds float* %tmp13208, i64 1
+ %tmp13210 = getelementptr inbounds float* %tmp13209, i64 1
+ %tmp13211 = getelementptr inbounds float* %tmp13210, i64 1
+ %tmp13212 = getelementptr inbounds float* %tmp13211, i64 1
+ %tmp13213 = getelementptr inbounds float* %tmp13212, i64 1
+ %tmp13214 = getelementptr inbounds float* %tmp13213, i64 1
+ %tmp13215 = getelementptr inbounds float* %tmp13214, i64 1
+ %tmp13216 = getelementptr inbounds float* %tmp13215, i64 1
+ %tmp13217 = getelementptr inbounds float* %tmp13216, i64 1
+ %tmp13218 = getelementptr inbounds float* %tmp13217, i64 1
+ %tmp13219 = getelementptr inbounds float* %tmp13218, i64 1
+ %tmp13220 = getelementptr inbounds float* %tmp13219, i64 1
+ %tmp13221 = getelementptr inbounds float* %tmp13220, i64 1
+ %tmp13222 = getelementptr inbounds float* %tmp13221, i64 1
+ %tmp13223 = getelementptr inbounds float* %tmp13222, i64 1
+ %tmp13224 = getelementptr inbounds float* %tmp13223, i64 1
+ %tmp13225 = getelementptr inbounds float* %tmp13224, i64 1
+ %tmp13226 = getelementptr inbounds float* %tmp13225, i64 1
+ %tmp13227 = getelementptr inbounds float* %tmp13226, i64 1
+ %tmp13228 = getelementptr inbounds float* %tmp13227, i64 1
+ %tmp13229 = getelementptr inbounds float* %tmp13228, i64 1
+ %tmp13230 = getelementptr inbounds float* %tmp13229, i64 1
+ %tmp13231 = getelementptr inbounds float* %tmp13230, i64 1
+ %tmp13232 = getelementptr inbounds float* %tmp13231, i64 1
+ %tmp13233 = getelementptr inbounds float* %tmp13232, i64 1
+ %tmp13234 = getelementptr inbounds float* %tmp13233, i64 1
+ %tmp13235 = getelementptr inbounds float* %tmp13234, i64 1
+ %tmp13236 = getelementptr inbounds float* %tmp13235, i64 1
+ %tmp13237 = getelementptr inbounds float* %tmp13236, i64 1
+ %tmp13238 = getelementptr inbounds float* %tmp13237, i64 1
+ %tmp13239 = getelementptr inbounds float* %tmp13238, i64 1
+ %tmp13240 = getelementptr inbounds float* %tmp13239, i64 1
+ %tmp13241 = getelementptr inbounds float* %tmp13240, i64 1
+ %tmp13242 = getelementptr inbounds float* %tmp13241, i64 1
+ %tmp13243 = getelementptr inbounds float* %tmp13242, i64 1
+ %tmp13244 = getelementptr inbounds float* %tmp13243, i64 1
+ %tmp13245 = getelementptr inbounds float* %tmp13244, i64 1
+ %tmp13246 = getelementptr inbounds float* %tmp13245, i64 1
+ %tmp13247 = getelementptr inbounds float* %tmp13246, i64 1
+ %tmp13248 = getelementptr inbounds float* %tmp13247, i64 1
+ %tmp13249 = getelementptr inbounds float* %tmp13248, i64 1
+ %tmp13250 = getelementptr inbounds float* %tmp13249, i64 1
+ %tmp13251 = getelementptr inbounds float* %tmp13250, i64 1
+ %tmp13252 = getelementptr inbounds float* %tmp13251, i64 1
+ %tmp13253 = getelementptr inbounds float* %tmp13252, i64 1
+ %tmp13254 = getelementptr inbounds float* %tmp13253, i64 1
+ %tmp13255 = getelementptr inbounds float* %tmp13254, i64 1
+ %tmp13256 = getelementptr inbounds float* %tmp13255, i64 1
+ %tmp13257 = getelementptr inbounds float* %tmp13256, i64 1
+ %tmp13258 = getelementptr inbounds float* %tmp13257, i64 1
+ %tmp13259 = getelementptr inbounds float* %tmp13258, i64 1
+ %tmp13260 = getelementptr inbounds float* %tmp13259, i64 1
+ %tmp13261 = getelementptr inbounds float* %tmp13260, i64 1
+ %tmp13262 = getelementptr inbounds float* %tmp13261, i64 1
+ %tmp13263 = getelementptr inbounds float* %tmp13262, i64 1
+ %tmp13264 = getelementptr inbounds float* %tmp13263, i64 1
+ %tmp13265 = getelementptr inbounds float* %tmp13264, i64 1
+ %tmp13266 = getelementptr inbounds float* %tmp13265, i64 1
+ %tmp13267 = getelementptr inbounds float* %tmp13266, i64 1
+ %tmp13268 = getelementptr inbounds float* %tmp13267, i64 1
+ %tmp13269 = getelementptr inbounds float* %tmp13268, i64 1
+ %tmp13270 = getelementptr inbounds float* %tmp13269, i64 1
+ %tmp13271 = getelementptr inbounds float* %tmp13270, i64 1
+ %tmp13272 = getelementptr inbounds float* %tmp13271, i64 1
+ %tmp13273 = getelementptr inbounds float* %tmp13272, i64 1
+ %tmp13274 = getelementptr inbounds float* %tmp13273, i64 1
+ %tmp13275 = getelementptr inbounds float* %tmp13274, i64 1
+ %tmp13276 = getelementptr inbounds float* %tmp13275, i64 1
+ %tmp13277 = getelementptr inbounds float* %tmp13276, i64 1
+ %tmp13278 = getelementptr inbounds float* %tmp13277, i64 1
+ %tmp13279 = getelementptr inbounds float* %tmp13278, i64 1
+ %tmp13280 = getelementptr inbounds float* %tmp13279, i64 1
+ %tmp13281 = getelementptr inbounds float* %tmp13280, i64 1
+ %tmp13282 = getelementptr inbounds float* %tmp13281, i64 1
+ %tmp13283 = getelementptr inbounds float* %tmp13282, i64 1
+ %tmp13284 = getelementptr inbounds float* %tmp13283, i64 1
+ %tmp13285 = getelementptr inbounds float* %tmp13284, i64 1
+ %tmp13286 = getelementptr inbounds float* %tmp13285, i64 1
+ %tmp13287 = getelementptr inbounds float* %tmp13286, i64 1
+ %tmp13288 = getelementptr inbounds float* %tmp13287, i64 1
+ %tmp13289 = getelementptr inbounds float* %tmp13288, i64 1
+ %tmp13290 = getelementptr inbounds float* %tmp13289, i64 1
+ %tmp13291 = getelementptr inbounds float* %tmp13290, i64 1
+ %tmp13292 = getelementptr inbounds float* %tmp13291, i64 1
+ %tmp13293 = getelementptr inbounds float* %tmp13292, i64 1
+ %tmp13294 = getelementptr inbounds float* %tmp13293, i64 1
+ %tmp13295 = getelementptr inbounds float* %tmp13294, i64 1
+ %tmp13296 = getelementptr inbounds float* %tmp13295, i64 1
+ %tmp13297 = getelementptr inbounds float* %tmp13296, i64 1
+ %tmp13298 = getelementptr inbounds float* %tmp13297, i64 1
+ %tmp13299 = getelementptr inbounds float* %tmp13298, i64 1
+ %tmp13300 = getelementptr inbounds float* %tmp13299, i64 1
+ %tmp13301 = getelementptr inbounds float* %tmp13300, i64 1
+ %tmp13302 = getelementptr inbounds float* %tmp13301, i64 1
+ %tmp13303 = getelementptr inbounds float* %tmp13302, i64 1
+ %tmp13304 = getelementptr inbounds float* %tmp13303, i64 1
+ %tmp13305 = getelementptr inbounds float* %tmp13304, i64 1
+ %tmp13306 = getelementptr inbounds float* %tmp13305, i64 1
+ %tmp13307 = getelementptr inbounds float* %tmp13306, i64 1
+ %tmp13308 = getelementptr inbounds float* %tmp13307, i64 1
+ %tmp13309 = getelementptr inbounds float* %tmp13308, i64 1
+ %tmp13310 = getelementptr inbounds float* %tmp13309, i64 1
+ %tmp13311 = getelementptr inbounds float* %tmp13310, i64 1
+ %tmp13312 = getelementptr inbounds float* %tmp13311, i64 1
+ %tmp13313 = getelementptr inbounds float* %tmp13312, i64 1
+ %tmp13314 = getelementptr inbounds float* %tmp13313, i64 1
+ %tmp13315 = getelementptr inbounds float* %tmp13314, i64 1
+ %tmp13316 = getelementptr inbounds float* %tmp13315, i64 1
+ %tmp13317 = getelementptr inbounds float* %tmp13316, i64 1
+ %tmp13318 = getelementptr inbounds float* %tmp13317, i64 1
+ %tmp13319 = getelementptr inbounds float* %tmp13318, i64 1
+ %tmp13320 = getelementptr inbounds float* %tmp13319, i64 1
+ %tmp13321 = getelementptr inbounds float* %tmp13320, i64 1
+ %tmp13322 = getelementptr inbounds float* %tmp13321, i64 1
+ %tmp13323 = getelementptr inbounds float* %tmp13322, i64 1
+ %tmp13324 = getelementptr inbounds float* %tmp13323, i64 1
+ %tmp13325 = getelementptr inbounds float* %tmp13324, i64 1
+ %tmp13326 = getelementptr inbounds float* %tmp13325, i64 1
+ %tmp13327 = getelementptr inbounds float* %tmp13326, i64 1
+ %tmp13328 = getelementptr inbounds float* %tmp13327, i64 1
+ %tmp13329 = getelementptr inbounds float* %tmp13328, i64 1
+ %tmp13330 = getelementptr inbounds float* %tmp13329, i64 1
+ %tmp13331 = getelementptr inbounds float* %tmp13330, i64 1
+ %tmp13332 = getelementptr inbounds float* %tmp13331, i64 1
+ %tmp13333 = getelementptr inbounds float* %tmp13332, i64 1
+ %tmp13334 = getelementptr inbounds float* %tmp13333, i64 1
+ %tmp13335 = getelementptr inbounds float* %tmp13334, i64 1
+ %tmp13336 = getelementptr inbounds float* %tmp13335, i64 1
+ %tmp13337 = getelementptr inbounds float* %tmp13336, i64 1
+ %tmp13338 = getelementptr inbounds float* %tmp13337, i64 1
+ %tmp13339 = getelementptr inbounds float* %tmp13338, i64 1
+ %tmp13340 = getelementptr inbounds float* %tmp13339, i64 1
+ %tmp13341 = getelementptr inbounds float* %tmp13340, i64 1
+ %tmp13342 = getelementptr inbounds float* %tmp13341, i64 1
+ %tmp13343 = getelementptr inbounds float* %tmp13342, i64 1
+ %tmp13344 = getelementptr inbounds float* %tmp13343, i64 1
+ %tmp13345 = getelementptr inbounds float* %tmp13344, i64 1
+ %tmp13346 = getelementptr inbounds float* %tmp13345, i64 1
+ %tmp13347 = getelementptr inbounds float* %tmp13346, i64 1
+ %tmp13348 = getelementptr inbounds float* %tmp13347, i64 1
+ %tmp13349 = getelementptr inbounds float* %tmp13348, i64 1
+ %tmp13350 = getelementptr inbounds float* %tmp13349, i64 1
+ %tmp13351 = getelementptr inbounds float* %tmp13350, i64 1
+ %tmp13352 = getelementptr inbounds float* %tmp13351, i64 1
+ %tmp13353 = getelementptr inbounds float* %tmp13352, i64 1
+ %tmp13354 = getelementptr inbounds float* %tmp13353, i64 1
+ %tmp13355 = getelementptr inbounds float* %tmp13354, i64 1
+ %tmp13356 = getelementptr inbounds float* %tmp13355, i64 1
+ %tmp13357 = getelementptr inbounds float* %tmp13356, i64 1
+ %tmp13358 = getelementptr inbounds float* %tmp13357, i64 1
+ %tmp13359 = getelementptr inbounds float* %tmp13358, i64 1
+ %tmp13360 = getelementptr inbounds float* %tmp13359, i64 1
+ %tmp13361 = getelementptr inbounds float* %tmp13360, i64 1
+ %tmp13362 = getelementptr inbounds float* %tmp13361, i64 1
+ %tmp13363 = getelementptr inbounds float* %tmp13362, i64 1
+ %tmp13364 = getelementptr inbounds float* %tmp13363, i64 1
+ %tmp13365 = getelementptr inbounds float* %tmp13364, i64 1
+ %tmp13366 = getelementptr inbounds float* %tmp13365, i64 1
+ %tmp13367 = getelementptr inbounds float* %tmp13366, i64 1
+ %tmp13368 = getelementptr inbounds float* %tmp13367, i64 1
+ %tmp13369 = getelementptr inbounds float* %tmp13368, i64 1
+ %tmp13370 = getelementptr inbounds float* %tmp13369, i64 1
+ %tmp13371 = getelementptr inbounds float* %tmp13370, i64 1
+ %tmp13372 = getelementptr inbounds float* %tmp13371, i64 1
+ %tmp13373 = getelementptr inbounds float* %tmp13372, i64 1
+ %tmp13374 = getelementptr inbounds float* %tmp13373, i64 1
+ %tmp13375 = getelementptr inbounds float* %tmp13374, i64 1
+ %tmp13376 = getelementptr inbounds float* %tmp13375, i64 1
+ %tmp13377 = getelementptr inbounds float* %tmp13376, i64 1
+ %tmp13378 = getelementptr inbounds float* %tmp13377, i64 1
+ %tmp13379 = getelementptr inbounds float* %tmp13378, i64 1
+ %tmp13380 = getelementptr inbounds float* %tmp13379, i64 1
+ %tmp13381 = getelementptr inbounds float* %tmp13380, i64 1
+ %tmp13382 = getelementptr inbounds float* %tmp13381, i64 1
+ %tmp13383 = getelementptr inbounds float* %tmp13382, i64 1
+ %tmp13384 = getelementptr inbounds float* %tmp13383, i64 1
+ %tmp13385 = getelementptr inbounds float* %tmp13384, i64 1
+ %tmp13386 = getelementptr inbounds float* %tmp13385, i64 1
+ %tmp13387 = getelementptr inbounds float* %tmp13386, i64 1
+ %tmp13388 = getelementptr inbounds float* %tmp13387, i64 1
+ %tmp13389 = getelementptr inbounds float* %tmp13388, i64 1
+ %tmp13390 = getelementptr inbounds float* %tmp13389, i64 1
+ %tmp13391 = getelementptr inbounds float* %tmp13390, i64 1
+ %tmp13392 = getelementptr inbounds float* %tmp13391, i64 1
+ %tmp13393 = getelementptr inbounds float* %tmp13392, i64 1
+ %tmp13394 = getelementptr inbounds float* %tmp13393, i64 1
+ %tmp13395 = getelementptr inbounds float* %tmp13394, i64 1
+ %tmp13396 = getelementptr inbounds float* %tmp13395, i64 1
+ %tmp13397 = getelementptr inbounds float* %tmp13396, i64 1
+ %tmp13398 = getelementptr inbounds float* %tmp13397, i64 1
+ %tmp13399 = getelementptr inbounds float* %tmp13398, i64 1
+ %tmp13400 = getelementptr inbounds float* %tmp13399, i64 1
+ %tmp13401 = getelementptr inbounds float* %tmp13400, i64 1
+ %tmp13402 = getelementptr inbounds float* %tmp13401, i64 1
+ %tmp13403 = getelementptr inbounds float* %tmp13402, i64 1
+ %tmp13404 = getelementptr inbounds float* %tmp13403, i64 1
+ %tmp13405 = getelementptr inbounds float* %tmp13404, i64 1
+ %tmp13406 = getelementptr inbounds float* %tmp13405, i64 1
+ %tmp13407 = getelementptr inbounds float* %tmp13406, i64 1
+ %tmp13408 = getelementptr inbounds float* %tmp13407, i64 1
+ %tmp13409 = getelementptr inbounds float* %tmp13408, i64 1
+ %tmp13410 = getelementptr inbounds float* %tmp13409, i64 1
+ %tmp13411 = getelementptr inbounds float* %tmp13410, i64 1
+ %tmp13412 = getelementptr inbounds float* %tmp13411, i64 1
+ %tmp13413 = getelementptr inbounds float* %tmp13412, i64 1
+ %tmp13414 = getelementptr inbounds float* %tmp13413, i64 1
+ %tmp13415 = getelementptr inbounds float* %tmp13414, i64 1
+ %tmp13416 = getelementptr inbounds float* %tmp13415, i64 1
+ %tmp13417 = getelementptr inbounds float* %tmp13416, i64 1
+ %tmp13418 = getelementptr inbounds float* %tmp13417, i64 1
+ %tmp13419 = getelementptr inbounds float* %tmp13418, i64 1
+ %tmp13420 = getelementptr inbounds float* %tmp13419, i64 1
+ %tmp13421 = getelementptr inbounds float* %tmp13420, i64 1
+ %tmp13422 = getelementptr inbounds float* %tmp13421, i64 1
+ %tmp13423 = getelementptr inbounds float* %tmp13422, i64 1
+ %tmp13424 = getelementptr inbounds float* %tmp13423, i64 1
+ %tmp13425 = getelementptr inbounds float* %tmp13424, i64 1
+ %tmp13426 = getelementptr inbounds float* %tmp13425, i64 1
+ %tmp13427 = getelementptr inbounds float* %tmp13426, i64 1
+ %tmp13428 = getelementptr inbounds float* %tmp13427, i64 1
+ %tmp13429 = getelementptr inbounds float* %tmp13428, i64 1
+ %tmp13430 = getelementptr inbounds float* %tmp13429, i64 1
+ %tmp13431 = getelementptr inbounds float* %tmp13430, i64 1
+ %tmp13432 = getelementptr inbounds float* %tmp13431, i64 1
+ %tmp13433 = getelementptr inbounds float* %tmp13432, i64 1
+ %tmp13434 = getelementptr inbounds float* %tmp13433, i64 1
+ %tmp13435 = getelementptr inbounds float* %tmp13434, i64 1
+ %tmp13436 = getelementptr inbounds float* %tmp13435, i64 1
+ %tmp13437 = getelementptr inbounds float* %tmp13436, i64 1
+ %tmp13438 = getelementptr inbounds float* %tmp13437, i64 1
+ %tmp13439 = getelementptr inbounds float* %tmp13438, i64 1
+ %tmp13440 = getelementptr inbounds float* %tmp13439, i64 1
+ %tmp13441 = getelementptr inbounds float* %tmp13440, i64 1
+ %tmp13442 = getelementptr inbounds float* %tmp13441, i64 1
+ %tmp13443 = getelementptr inbounds float* %tmp13442, i64 1
+ %tmp13444 = getelementptr inbounds float* %tmp13443, i64 1
+ %tmp13445 = getelementptr inbounds float* %tmp13444, i64 1
+ %tmp13446 = getelementptr inbounds float* %tmp13445, i64 1
+ %tmp13447 = getelementptr inbounds float* %tmp13446, i64 1
+ %tmp13448 = getelementptr inbounds float* %tmp13447, i64 1
+ %tmp13449 = getelementptr inbounds float* %tmp13448, i64 1
+ %tmp13450 = getelementptr inbounds float* %tmp13449, i64 1
+ %tmp13451 = getelementptr inbounds float* %tmp13450, i64 1
+ %tmp13452 = getelementptr inbounds float* %tmp13451, i64 1
+ %tmp13453 = getelementptr inbounds float* %tmp13452, i64 1
+ %tmp13454 = getelementptr inbounds float* %tmp13453, i64 1
+ %tmp13455 = getelementptr inbounds float* %tmp13454, i64 1
+ %tmp13456 = getelementptr inbounds float* %tmp13455, i64 1
+ %tmp13457 = getelementptr inbounds float* %tmp13456, i64 1
+ %tmp13458 = getelementptr inbounds float* %tmp13457, i64 1
+ %tmp13459 = getelementptr inbounds float* %tmp13458, i64 1
+ %tmp13460 = getelementptr inbounds float* %tmp13459, i64 1
+ %tmp13461 = getelementptr inbounds float* %tmp13460, i64 1
+ %tmp13462 = getelementptr inbounds float* %tmp13461, i64 1
+ %tmp13463 = getelementptr inbounds float* %tmp13462, i64 1
+ %tmp13464 = getelementptr inbounds float* %tmp13463, i64 1
+ %tmp13465 = getelementptr inbounds float* %tmp13464, i64 1
+ %tmp13466 = getelementptr inbounds float* %tmp13465, i64 1
+ %tmp13467 = getelementptr inbounds float* %tmp13466, i64 1
+ %tmp13468 = getelementptr inbounds float* %tmp13467, i64 1
+ %tmp13469 = getelementptr inbounds float* %tmp13468, i64 1
+ %tmp13470 = getelementptr inbounds float* %tmp13469, i64 1
+ %tmp13471 = getelementptr inbounds float* %tmp13470, i64 1
+ %tmp13472 = getelementptr inbounds float* %tmp13471, i64 1
+ %tmp13473 = getelementptr inbounds float* %tmp13472, i64 1
+ %tmp13474 = getelementptr inbounds float* %tmp13473, i64 1
+ %tmp13475 = getelementptr inbounds float* %tmp13474, i64 1
+ %tmp13476 = getelementptr inbounds float* %tmp13475, i64 1
+ %tmp13477 = getelementptr inbounds float* %tmp13476, i64 1
+ %tmp13478 = getelementptr inbounds float* %tmp13477, i64 1
+ %tmp13479 = getelementptr inbounds float* %tmp13478, i64 1
+ %tmp13480 = getelementptr inbounds float* %tmp13479, i64 1
+ %tmp13481 = getelementptr inbounds float* %tmp13480, i64 1
+ %tmp13482 = getelementptr inbounds float* %tmp13481, i64 1
+ %tmp13483 = getelementptr inbounds float* %tmp13482, i64 1
+ %tmp13484 = getelementptr inbounds float* %tmp13483, i64 1
+ %tmp13485 = getelementptr inbounds float* %tmp13484, i64 1
+ %tmp13486 = getelementptr inbounds float* %tmp13485, i64 1
+ %tmp13487 = getelementptr inbounds float* %tmp13486, i64 1
+ %tmp13488 = getelementptr inbounds float* %tmp13487, i64 1
+ %tmp13489 = getelementptr inbounds float* %tmp13488, i64 1
+ %tmp13490 = getelementptr inbounds float* %tmp13489, i64 1
+ %tmp13491 = getelementptr inbounds float* %tmp13490, i64 1
+ %tmp13492 = getelementptr inbounds float* %tmp13491, i64 1
+ %tmp13493 = getelementptr inbounds float* %tmp13492, i64 1
+ %tmp13494 = getelementptr inbounds float* %tmp13493, i64 1
+ %tmp13495 = getelementptr inbounds float* %tmp13494, i64 1
+ %tmp13496 = getelementptr inbounds float* %tmp13495, i64 1
+ %tmp13497 = getelementptr inbounds float* %tmp13496, i64 1
+ %tmp13498 = getelementptr inbounds float* %tmp13497, i64 1
+ %tmp13499 = getelementptr inbounds float* %tmp13498, i64 1
+ %tmp13500 = getelementptr inbounds float* %tmp13499, i64 1
+ %tmp13501 = getelementptr inbounds float* %tmp13500, i64 1
+ %tmp13502 = getelementptr inbounds float* %tmp13501, i64 1
+ %tmp13503 = getelementptr inbounds float* %tmp13502, i64 1
+ %tmp13504 = getelementptr inbounds float* %tmp13503, i64 1
+ %tmp13505 = getelementptr inbounds float* %tmp13504, i64 1
+ %tmp13506 = getelementptr inbounds float* %tmp13505, i64 1
+ %tmp13507 = getelementptr inbounds float* %tmp13506, i64 1
+ %tmp13508 = getelementptr inbounds float* %tmp13507, i64 1
+ %tmp13509 = getelementptr inbounds float* %tmp13508, i64 1
+ %tmp13510 = getelementptr inbounds float* %tmp13509, i64 1
+ %tmp13511 = getelementptr inbounds float* %tmp13510, i64 1
+ %tmp13512 = getelementptr inbounds float* %tmp13511, i64 1
+ %tmp13513 = getelementptr inbounds float* %tmp13512, i64 1
+ %tmp13514 = getelementptr inbounds float* %tmp13513, i64 1
+ %tmp13515 = getelementptr inbounds float* %tmp13514, i64 1
+ %tmp13516 = getelementptr inbounds float* %tmp13515, i64 1
+ %tmp13517 = getelementptr inbounds float* %tmp13516, i64 1
+ %tmp13518 = getelementptr inbounds float* %tmp13517, i64 1
+ %tmp13519 = getelementptr inbounds float* %tmp13518, i64 1
+ %tmp13520 = getelementptr inbounds float* %tmp13519, i64 1
+ %tmp13521 = getelementptr inbounds float* %tmp13520, i64 1
+ %tmp13522 = getelementptr inbounds float* %tmp13521, i64 1
+ %tmp13523 = getelementptr inbounds float* %tmp13522, i64 1
+ %tmp13524 = getelementptr inbounds float* %tmp13523, i64 1
+ %tmp13525 = getelementptr inbounds float* %tmp13524, i64 1
+ %tmp13526 = getelementptr inbounds float* %tmp13525, i64 1
+ %tmp13527 = getelementptr inbounds float* %tmp13526, i64 1
+ %tmp13528 = getelementptr inbounds float* %tmp13527, i64 1
+ %tmp13529 = getelementptr inbounds float* %tmp13528, i64 1
+ %tmp13530 = getelementptr inbounds float* %tmp13529, i64 1
+ %tmp13531 = getelementptr inbounds float* %tmp13530, i64 1
+ %tmp13532 = getelementptr inbounds float* %tmp13531, i64 1
+ %tmp13533 = getelementptr inbounds float* %tmp13532, i64 1
+ %tmp13534 = getelementptr inbounds float* %tmp13533, i64 1
+ %tmp13535 = getelementptr inbounds float* %tmp13534, i64 1
+ %tmp13536 = getelementptr inbounds float* %tmp13535, i64 1
+ %tmp13537 = getelementptr inbounds float* %tmp13536, i64 1
+ %tmp13538 = getelementptr inbounds float* %tmp13537, i64 1
+ %tmp13539 = getelementptr inbounds float* %tmp13538, i64 1
+ %tmp13540 = getelementptr inbounds float* %tmp13539, i64 1
+ %tmp13541 = getelementptr inbounds float* %tmp13540, i64 1
+ %tmp13542 = getelementptr inbounds float* %tmp13541, i64 1
+ %tmp13543 = getelementptr inbounds float* %tmp13542, i64 1
+ %tmp13544 = getelementptr inbounds float* %tmp13543, i64 1
+ %tmp13545 = getelementptr inbounds float* %tmp13544, i64 1
+ %tmp13546 = getelementptr inbounds float* %tmp13545, i64 1
+ %tmp13547 = getelementptr inbounds float* %tmp13546, i64 1
+ %tmp13548 = getelementptr inbounds float* %tmp13547, i64 1
+ %tmp13549 = getelementptr inbounds float* %tmp13548, i64 1
+ %tmp13550 = getelementptr inbounds float* %tmp13549, i64 1
+ %tmp13551 = getelementptr inbounds float* %tmp13550, i64 1
+ %tmp13552 = getelementptr inbounds float* %tmp13551, i64 1
+ %tmp13553 = getelementptr inbounds float* %tmp13552, i64 1
+ %tmp13554 = getelementptr inbounds float* %tmp13553, i64 1
+ %tmp13555 = getelementptr inbounds float* %tmp13554, i64 1
+ %tmp13556 = getelementptr inbounds float* %tmp13555, i64 1
+ %tmp13557 = getelementptr inbounds float* %tmp13556, i64 1
+ %tmp13558 = getelementptr inbounds float* %tmp13557, i64 1
+ %tmp13559 = getelementptr inbounds float* %tmp13558, i64 1
+ %tmp13560 = getelementptr inbounds float* %tmp13559, i64 1
+ %tmp13561 = getelementptr inbounds float* %tmp13560, i64 1
+ %tmp13562 = getelementptr inbounds float* %tmp13561, i64 1
+ %tmp13563 = getelementptr inbounds float* %tmp13562, i64 1
+ %tmp13564 = getelementptr inbounds float* %tmp13563, i64 1
+ %tmp13565 = getelementptr inbounds float* %tmp13564, i64 1
+ %tmp13566 = getelementptr inbounds float* %tmp13565, i64 1
+ %tmp13567 = getelementptr inbounds float* %tmp13566, i64 1
+ %tmp13568 = getelementptr inbounds float* %tmp13567, i64 1
+ %tmp13569 = getelementptr inbounds float* %tmp13568, i64 1
+ %tmp13570 = getelementptr inbounds float* %tmp13569, i64 1
+ %tmp13571 = getelementptr inbounds float* %tmp13570, i64 1
+ %tmp13572 = getelementptr inbounds float* %tmp13571, i64 1
+ %tmp13573 = getelementptr inbounds float* %tmp13572, i64 1
+ %tmp13574 = getelementptr inbounds float* %tmp13573, i64 1
+ %tmp13575 = getelementptr inbounds float* %tmp13574, i64 1
+ %tmp13576 = getelementptr inbounds float* %tmp13575, i64 1
+ %tmp13577 = getelementptr inbounds float* %tmp13576, i64 1
+ %tmp13578 = getelementptr inbounds float* %tmp13577, i64 1
+ %tmp13579 = getelementptr inbounds float* %tmp13578, i64 1
+ %tmp13580 = getelementptr inbounds float* %tmp13579, i64 1
+ %tmp13581 = getelementptr inbounds float* %tmp13580, i64 1
+ %tmp13582 = getelementptr inbounds float* %tmp13581, i64 1
+ %tmp13583 = getelementptr inbounds float* %tmp13582, i64 1
+ %tmp13584 = getelementptr inbounds float* %tmp13583, i64 1
+ %tmp13585 = getelementptr inbounds float* %tmp13584, i64 1
+ %tmp13586 = getelementptr inbounds float* %tmp13585, i64 1
+ %tmp13587 = getelementptr inbounds float* %tmp13586, i64 1
+ %tmp13588 = getelementptr inbounds float* %tmp13587, i64 1
+ %tmp13589 = getelementptr inbounds float* %tmp13588, i64 1
+ %tmp13590 = getelementptr inbounds float* %tmp13589, i64 1
+ %tmp13591 = getelementptr inbounds float* %tmp13590, i64 1
+ %tmp13592 = getelementptr inbounds float* %tmp13591, i64 1
+ %tmp13593 = getelementptr inbounds float* %tmp13592, i64 1
+ %tmp13594 = getelementptr inbounds float* %tmp13593, i64 1
+ %tmp13595 = getelementptr inbounds float* %tmp13594, i64 1
+ %tmp13596 = getelementptr inbounds float* %tmp13595, i64 1
+ %tmp13597 = getelementptr inbounds float* %tmp13596, i64 1
+ %tmp13598 = getelementptr inbounds float* %tmp13597, i64 1
+ %tmp13599 = getelementptr inbounds float* %tmp13598, i64 1
+ %tmp13600 = getelementptr inbounds float* %tmp13599, i64 1
+ %tmp13601 = getelementptr inbounds float* %tmp13600, i64 1
+ %tmp13602 = getelementptr inbounds float* %tmp13601, i64 1
+ %tmp13603 = getelementptr inbounds float* %tmp13602, i64 1
+ %tmp13604 = getelementptr inbounds float* %tmp13603, i64 1
+ %tmp13605 = getelementptr inbounds float* %tmp13604, i64 1
+ %tmp13606 = getelementptr inbounds float* %tmp13605, i64 1
+ %tmp13607 = getelementptr inbounds float* %tmp13606, i64 1
+ %tmp13608 = getelementptr inbounds float* %tmp13607, i64 1
+ %tmp13609 = getelementptr inbounds float* %tmp13608, i64 1
+ %tmp13610 = getelementptr inbounds float* %tmp13609, i64 1
+ %tmp13611 = getelementptr inbounds float* %tmp13610, i64 1
+ %tmp13612 = getelementptr inbounds float* %tmp13611, i64 1
+ %tmp13613 = getelementptr inbounds float* %tmp13612, i64 1
+ %tmp13614 = getelementptr inbounds float* %tmp13613, i64 1
+ %tmp13615 = getelementptr inbounds float* %tmp13614, i64 1
+ %tmp13616 = getelementptr inbounds float* %tmp13615, i64 1
+ %tmp13617 = getelementptr inbounds float* %tmp13616, i64 1
+ %tmp13618 = getelementptr inbounds float* %tmp13617, i64 1
+ %tmp13619 = getelementptr inbounds float* %tmp13618, i64 1
+ %tmp13620 = getelementptr inbounds float* %tmp13619, i64 1
+ %tmp13621 = getelementptr inbounds float* %tmp13620, i64 1
+ %tmp13622 = getelementptr inbounds float* %tmp13621, i64 1
+ %tmp13623 = getelementptr inbounds float* %tmp13622, i64 1
+ %tmp13624 = getelementptr inbounds float* %tmp13623, i64 1
+ %tmp13625 = getelementptr inbounds float* %tmp13624, i64 1
+ %tmp13626 = getelementptr inbounds float* %tmp13625, i64 1
+ %tmp13627 = getelementptr inbounds float* %tmp13626, i64 1
+ %tmp13628 = getelementptr inbounds float* %tmp13627, i64 1
+ %tmp13629 = getelementptr inbounds float* %tmp13628, i64 1
+ %tmp13630 = getelementptr inbounds float* %tmp13629, i64 1
+ %tmp13631 = getelementptr inbounds float* %tmp13630, i64 1
+ %tmp13632 = getelementptr inbounds float* %tmp13631, i64 1
+ %tmp13633 = getelementptr inbounds float* %tmp13632, i64 1
+ %tmp13634 = getelementptr inbounds float* %tmp13633, i64 1
+ %tmp13635 = getelementptr inbounds float* %tmp13634, i64 1
+ %tmp13636 = getelementptr inbounds float* %tmp13635, i64 1
+ %tmp13637 = getelementptr inbounds float* %tmp13636, i64 1
+ %tmp13638 = getelementptr inbounds float* %tmp13637, i64 1
+ %tmp13639 = getelementptr inbounds float* %tmp13638, i64 1
+ %tmp13640 = getelementptr inbounds float* %tmp13639, i64 1
+ %tmp13641 = getelementptr inbounds float* %tmp13640, i64 1
+ %tmp13642 = getelementptr inbounds float* %tmp13641, i64 1
+ %tmp13643 = getelementptr inbounds float* %tmp13642, i64 1
+ %tmp13644 = getelementptr inbounds float* %tmp13643, i64 1
+ %tmp13645 = getelementptr inbounds float* %tmp13644, i64 1
+ %tmp13646 = getelementptr inbounds float* %tmp13645, i64 1
+ %tmp13647 = getelementptr inbounds float* %tmp13646, i64 1
+ %tmp13648 = getelementptr inbounds float* %tmp13647, i64 1
+ %tmp13649 = getelementptr inbounds float* %tmp13648, i64 1
+ %tmp13650 = getelementptr inbounds float* %tmp13649, i64 1
+ %tmp13651 = getelementptr inbounds float* %tmp13650, i64 1
+ %tmp13652 = getelementptr inbounds float* %tmp13651, i64 1
+ %tmp13653 = getelementptr inbounds float* %tmp13652, i64 1
+ %tmp13654 = getelementptr inbounds float* %tmp13653, i64 1
+ %tmp13655 = getelementptr inbounds float* %tmp13654, i64 1
+ %tmp13656 = getelementptr inbounds float* %tmp13655, i64 1
+ %tmp13657 = getelementptr inbounds float* %tmp13656, i64 1
+ %tmp13658 = getelementptr inbounds float* %tmp13657, i64 1
+ %tmp13659 = getelementptr inbounds float* %tmp13658, i64 1
+ %tmp13660 = getelementptr inbounds float* %tmp13659, i64 1
+ %tmp13661 = getelementptr inbounds float* %tmp13660, i64 1
+ %tmp13662 = getelementptr inbounds float* %tmp13661, i64 1
+ %tmp13663 = getelementptr inbounds float* %tmp13662, i64 1
+ %tmp13664 = getelementptr inbounds float* %tmp13663, i64 1
+ %tmp13665 = getelementptr inbounds float* %tmp13664, i64 1
+ %tmp13666 = getelementptr inbounds float* %tmp13665, i64 1
+ %tmp13667 = getelementptr inbounds float* %tmp13666, i64 1
+ %tmp13668 = getelementptr inbounds float* %tmp13667, i64 1
+ %tmp13669 = getelementptr inbounds float* %tmp13668, i64 1
+ %tmp13670 = getelementptr inbounds float* %tmp13669, i64 1
+ %tmp13671 = getelementptr inbounds float* %tmp13670, i64 1
+ %tmp13672 = getelementptr inbounds float* %tmp13671, i64 1
+ %tmp13673 = getelementptr inbounds float* %tmp13672, i64 1
+ %tmp13674 = getelementptr inbounds float* %tmp13673, i64 1
+ %tmp13675 = getelementptr inbounds float* %tmp13674, i64 1
+ %tmp13676 = getelementptr inbounds float* %tmp13675, i64 1
+ %tmp13677 = getelementptr inbounds float* %tmp13676, i64 1
+ %tmp13678 = getelementptr inbounds float* %tmp13677, i64 1
+ %tmp13679 = getelementptr inbounds float* %tmp13678, i64 1
+ %tmp13680 = getelementptr inbounds float* %tmp13679, i64 1
+ %tmp13681 = getelementptr inbounds float* %tmp13680, i64 1
+ %tmp13682 = getelementptr inbounds float* %tmp13681, i64 1
+ %tmp13683 = getelementptr inbounds float* %tmp13682, i64 1
+ %tmp13684 = getelementptr inbounds float* %tmp13683, i64 1
+ %tmp13685 = getelementptr inbounds float* %tmp13684, i64 1
+ %tmp13686 = getelementptr inbounds float* %tmp13685, i64 1
+ %tmp13687 = getelementptr inbounds float* %tmp13686, i64 1
+ %tmp13688 = getelementptr inbounds float* %tmp13687, i64 1
+ %tmp13689 = getelementptr inbounds float* %tmp13688, i64 1
+ %tmp13690 = getelementptr inbounds float* %tmp13689, i64 1
+ %tmp13691 = getelementptr inbounds float* %tmp13690, i64 1
+ %tmp13692 = getelementptr inbounds float* %tmp13691, i64 1
+ %tmp13693 = getelementptr inbounds float* %tmp13692, i64 1
+ %tmp13694 = getelementptr inbounds float* %tmp13693, i64 1
+ %tmp13695 = getelementptr inbounds float* %tmp13694, i64 1
+ %tmp13696 = getelementptr inbounds float* %tmp13695, i64 1
+ %tmp13697 = getelementptr inbounds float* %tmp13696, i64 1
+ %tmp13698 = getelementptr inbounds float* %tmp13697, i64 1
+ %tmp13699 = getelementptr inbounds float* %tmp13698, i64 1
+ %tmp13700 = getelementptr inbounds float* %tmp13699, i64 1
+ %tmp13701 = getelementptr inbounds float* %tmp13700, i64 1
+ %tmp13702 = getelementptr inbounds float* %tmp13701, i64 1
+ %tmp13703 = getelementptr inbounds float* %tmp13702, i64 1
+ %tmp13704 = getelementptr inbounds float* %tmp13703, i64 1
+ %tmp13705 = getelementptr inbounds float* %tmp13704, i64 1
+ %tmp13706 = getelementptr inbounds float* %tmp13705, i64 1
+ %tmp13707 = getelementptr inbounds float* %tmp13706, i64 1
+ %tmp13708 = getelementptr inbounds float* %tmp13707, i64 1
+ %tmp13709 = getelementptr inbounds float* %tmp13708, i64 1
+ %tmp13710 = getelementptr inbounds float* %tmp13709, i64 1
+ %tmp13711 = getelementptr inbounds float* %tmp13710, i64 1
+ %tmp13712 = getelementptr inbounds float* %tmp13711, i64 1
+ %tmp13713 = getelementptr inbounds float* %tmp13712, i64 1
+ %tmp13714 = getelementptr inbounds float* %tmp13713, i64 1
+ %tmp13715 = getelementptr inbounds float* %tmp13714, i64 1
+ %tmp13716 = getelementptr inbounds float* %tmp13715, i64 1
+ %tmp13717 = getelementptr inbounds float* %tmp13716, i64 1
+ %tmp13718 = getelementptr inbounds float* %tmp13717, i64 1
+ %tmp13719 = getelementptr inbounds float* %tmp13718, i64 1
+ %tmp13720 = getelementptr inbounds float* %tmp13719, i64 1
+ %tmp13721 = getelementptr inbounds float* %tmp13720, i64 1
+ %tmp13722 = getelementptr inbounds float* %tmp13721, i64 1
+ %tmp13723 = getelementptr inbounds float* %tmp13722, i64 1
+ %tmp13724 = getelementptr inbounds float* %tmp13723, i64 1
+ %tmp13725 = getelementptr inbounds float* %tmp13724, i64 1
+ %tmp13726 = getelementptr inbounds float* %tmp13725, i64 1
+ %tmp13727 = getelementptr inbounds float* %tmp13726, i64 1
+ %tmp13728 = getelementptr inbounds float* %tmp13727, i64 1
+ %tmp13729 = getelementptr inbounds float* %tmp13728, i64 1
+ %tmp13730 = getelementptr inbounds float* %tmp13729, i64 1
+ %tmp13731 = getelementptr inbounds float* %tmp13730, i64 1
+ %tmp13732 = getelementptr inbounds float* %tmp13731, i64 1
+ %tmp13733 = getelementptr inbounds float* %tmp13732, i64 1
+ %tmp13734 = getelementptr inbounds float* %tmp13733, i64 1
+ %tmp13735 = getelementptr inbounds float* %tmp13734, i64 1
+ %tmp13736 = getelementptr inbounds float* %tmp13735, i64 1
+ %tmp13737 = getelementptr inbounds float* %tmp13736, i64 1
+ %tmp13738 = getelementptr inbounds float* %tmp13737, i64 1
+ %tmp13739 = getelementptr inbounds float* %tmp13738, i64 1
+ %tmp13740 = getelementptr inbounds float* %tmp13739, i64 1
+ %tmp13741 = getelementptr inbounds float* %tmp13740, i64 1
+ %tmp13742 = getelementptr inbounds float* %tmp13741, i64 1
+ %tmp13743 = getelementptr inbounds float* %tmp13742, i64 1
+ %tmp13744 = getelementptr inbounds float* %tmp13743, i64 1
+ %tmp13745 = getelementptr inbounds float* %tmp13744, i64 1
+ %tmp13746 = getelementptr inbounds float* %tmp13745, i64 1
+ %tmp13747 = getelementptr inbounds float* %tmp13746, i64 1
+ %tmp13748 = getelementptr inbounds float* %tmp13747, i64 1
+ %tmp13749 = getelementptr inbounds float* %tmp13748, i64 1
+ %tmp13750 = getelementptr inbounds float* %tmp13749, i64 1
+ %tmp13751 = getelementptr inbounds float* %tmp13750, i64 1
+ %tmp13752 = getelementptr inbounds float* %tmp13751, i64 1
+ %tmp13753 = getelementptr inbounds float* %tmp13752, i64 1
+ %tmp13754 = getelementptr inbounds float* %tmp13753, i64 1
+ %tmp13755 = getelementptr inbounds float* %tmp13754, i64 1
+ %tmp13756 = getelementptr inbounds float* %tmp13755, i64 1
+ %tmp13757 = getelementptr inbounds float* %tmp13756, i64 1
+ %tmp13758 = getelementptr inbounds float* %tmp13757, i64 1
+ %tmp13759 = getelementptr inbounds float* %tmp13758, i64 1
+ %tmp13760 = getelementptr inbounds float* %tmp13759, i64 1
+ %tmp13761 = getelementptr inbounds float* %tmp13760, i64 1
+ %tmp13762 = getelementptr inbounds float* %tmp13761, i64 1
+ %tmp13763 = getelementptr inbounds float* %tmp13762, i64 1
+ %tmp13764 = getelementptr inbounds float* %tmp13763, i64 1
+ %tmp13765 = getelementptr inbounds float* %tmp13764, i64 1
+ %tmp13766 = getelementptr inbounds float* %tmp13765, i64 1
+ %tmp13767 = getelementptr inbounds float* %tmp13766, i64 1
+ %tmp13768 = getelementptr inbounds float* %tmp13767, i64 1
+ %tmp13769 = getelementptr inbounds float* %tmp13768, i64 1
+ %tmp13770 = getelementptr inbounds float* %tmp13769, i64 1
+ %tmp13771 = getelementptr inbounds float* %tmp13770, i64 1
+ %tmp13772 = getelementptr inbounds float* %tmp13771, i64 1
+ %tmp13773 = getelementptr inbounds float* %tmp13772, i64 1
+ %tmp13774 = getelementptr inbounds float* %tmp13773, i64 1
+ %tmp13775 = getelementptr inbounds float* %tmp13774, i64 1
+ %tmp13776 = getelementptr inbounds float* %tmp13775, i64 1
+ %tmp13777 = getelementptr inbounds float* %tmp13776, i64 1
+ %tmp13778 = getelementptr inbounds float* %tmp13777, i64 1
+ %tmp13779 = getelementptr inbounds float* %tmp13778, i64 1
+ %tmp13780 = getelementptr inbounds float* %tmp13779, i64 1
+ %tmp13781 = getelementptr inbounds float* %tmp13780, i64 1
+ %tmp13782 = getelementptr inbounds float* %tmp13781, i64 1
+ %tmp13783 = getelementptr inbounds float* %tmp13782, i64 1
+ %tmp13784 = getelementptr inbounds float* %tmp13783, i64 1
+ %tmp13785 = getelementptr inbounds float* %tmp13784, i64 1
+ %tmp13786 = getelementptr inbounds float* %tmp13785, i64 1
+ %tmp13787 = getelementptr inbounds float* %tmp13786, i64 1
+ %tmp13788 = getelementptr inbounds float* %tmp13787, i64 1
+ %tmp13789 = getelementptr inbounds float* %tmp13788, i64 1
+ %tmp13790 = getelementptr inbounds float* %tmp13789, i64 1
+ %tmp13791 = getelementptr inbounds float* %tmp13790, i64 1
+ %tmp13792 = getelementptr inbounds float* %tmp13791, i64 1
+ %tmp13793 = getelementptr inbounds float* %tmp13792, i64 1
+ %tmp13794 = getelementptr inbounds float* %tmp13793, i64 1
+ %tmp13795 = getelementptr inbounds float* %tmp13794, i64 1
+ %tmp13796 = getelementptr inbounds float* %tmp13795, i64 1
+ %tmp13797 = getelementptr inbounds float* %tmp13796, i64 1
+ %tmp13798 = getelementptr inbounds float* %tmp13797, i64 1
+ %tmp13799 = getelementptr inbounds float* %tmp13798, i64 1
+ %tmp13800 = getelementptr inbounds float* %tmp13799, i64 1
+ %tmp13801 = getelementptr inbounds float* %tmp13800, i64 1
+ %tmp13802 = getelementptr inbounds float* %tmp13801, i64 1
+ %tmp13803 = getelementptr inbounds float* %tmp13802, i64 1
+ %tmp13804 = getelementptr inbounds float* %tmp13803, i64 1
+ %tmp13805 = getelementptr inbounds float* %tmp13804, i64 1
+ %tmp13806 = getelementptr inbounds float* %tmp13805, i64 1
+ %tmp13807 = getelementptr inbounds float* %tmp13806, i64 1
+ %tmp13808 = getelementptr inbounds float* %tmp13807, i64 1
+ %tmp13809 = getelementptr inbounds float* %tmp13808, i64 1
+ %tmp13810 = getelementptr inbounds float* %tmp13809, i64 1
+ %tmp13811 = getelementptr inbounds float* %tmp13810, i64 1
+ %tmp13812 = getelementptr inbounds float* %tmp13811, i64 1
+ %tmp13813 = getelementptr inbounds float* %tmp13812, i64 1
+ %tmp13814 = getelementptr inbounds float* %tmp13813, i64 1
+ %tmp13815 = getelementptr inbounds float* %tmp13814, i64 1
+ %tmp13816 = getelementptr inbounds float* %tmp13815, i64 1
+ %tmp13817 = getelementptr inbounds float* %tmp13816, i64 1
+ %tmp13818 = getelementptr inbounds float* %tmp13817, i64 1
+ %tmp13819 = getelementptr inbounds float* %tmp13818, i64 1
+ %tmp13820 = getelementptr inbounds float* %tmp13819, i64 1
+ %tmp13821 = getelementptr inbounds float* %tmp13820, i64 1
+ %tmp13822 = getelementptr inbounds float* %tmp13821, i64 1
+ %tmp13823 = getelementptr inbounds float* %tmp13822, i64 1
+ %tmp13824 = getelementptr inbounds float* %tmp13823, i64 1
+ %tmp13825 = getelementptr inbounds float* %tmp13824, i64 1
+ %tmp13826 = getelementptr inbounds float* %tmp13825, i64 1
+ %tmp13827 = getelementptr inbounds float* %tmp13826, i64 1
+ %tmp13828 = getelementptr inbounds float* %tmp13827, i64 1
+ %tmp13829 = getelementptr inbounds float* %tmp13828, i64 1
+ %tmp13830 = getelementptr inbounds float* %tmp13829, i64 1
+ %tmp13831 = getelementptr inbounds float* %tmp13830, i64 1
+ %tmp13832 = getelementptr inbounds float* %tmp13831, i64 1
+ %tmp13833 = getelementptr inbounds float* %tmp13832, i64 1
+ %tmp13834 = getelementptr inbounds float* %tmp13833, i64 1
+ %tmp13835 = getelementptr inbounds float* %tmp13834, i64 1
+ %tmp13836 = getelementptr inbounds float* %tmp13835, i64 1
+ %tmp13837 = getelementptr inbounds float* %tmp13836, i64 1
+ %tmp13838 = getelementptr inbounds float* %tmp13837, i64 1
+ %tmp13839 = getelementptr inbounds float* %tmp13838, i64 1
+ %tmp13840 = getelementptr inbounds float* %tmp13839, i64 1
+ %tmp13841 = getelementptr inbounds float* %tmp13840, i64 1
+ %tmp13842 = getelementptr inbounds float* %tmp13841, i64 1
+ %tmp13843 = getelementptr inbounds float* %tmp13842, i64 1
+ %tmp13844 = getelementptr inbounds float* %tmp13843, i64 1
+ %tmp13845 = getelementptr inbounds float* %tmp13844, i64 1
+ %tmp13846 = getelementptr inbounds float* %tmp13845, i64 1
+ %tmp13847 = getelementptr inbounds float* %tmp13846, i64 1
+ %tmp13848 = getelementptr inbounds float* %tmp13847, i64 1
+ %tmp13849 = getelementptr inbounds float* %tmp13848, i64 1
+ %tmp13850 = getelementptr inbounds float* %tmp13849, i64 1
+ %tmp13851 = getelementptr inbounds float* %tmp13850, i64 1
+ %tmp13852 = getelementptr inbounds float* %tmp13851, i64 1
+ %tmp13853 = getelementptr inbounds float* %tmp13852, i64 1
+ %tmp13854 = getelementptr inbounds float* %tmp13853, i64 1
+ %tmp13855 = getelementptr inbounds float* %tmp13854, i64 1
+ %tmp13856 = getelementptr inbounds float* %tmp13855, i64 1
+ %tmp13857 = getelementptr inbounds float* %tmp13856, i64 1
+ %tmp13858 = getelementptr inbounds float* %tmp13857, i64 1
+ %tmp13859 = getelementptr inbounds float* %tmp13858, i64 1
+ %tmp13860 = getelementptr inbounds float* %tmp13859, i64 1
+ %tmp13861 = getelementptr inbounds float* %tmp13860, i64 1
+ %tmp13862 = getelementptr inbounds float* %tmp13861, i64 1
+ %tmp13863 = getelementptr inbounds float* %tmp13862, i64 1
+ %tmp13864 = getelementptr inbounds float* %tmp13863, i64 1
+ %tmp13865 = getelementptr inbounds float* %tmp13864, i64 1
+ %tmp13866 = getelementptr inbounds float* %tmp13865, i64 1
+ %tmp13867 = getelementptr inbounds float* %tmp13866, i64 1
+ %tmp13868 = getelementptr inbounds float* %tmp13867, i64 1
+ %tmp13869 = getelementptr inbounds float* %tmp13868, i64 1
+ %tmp13870 = getelementptr inbounds float* %tmp13869, i64 1
+ %tmp13871 = getelementptr inbounds float* %tmp13870, i64 1
+ %tmp13872 = getelementptr inbounds float* %tmp13871, i64 1
+ %tmp13873 = getelementptr inbounds float* %tmp13872, i64 1
+ %tmp13874 = getelementptr inbounds float* %tmp13873, i64 1
+ %tmp13875 = getelementptr inbounds float* %tmp13874, i64 1
+ %tmp13876 = getelementptr inbounds float* %tmp13875, i64 1
+ %tmp13877 = getelementptr inbounds float* %tmp13876, i64 1
+ %tmp13878 = getelementptr inbounds float* %tmp13877, i64 1
+ %tmp13879 = getelementptr inbounds float* %tmp13878, i64 1
+ %tmp13880 = getelementptr inbounds float* %tmp13879, i64 1
+ %tmp13881 = getelementptr inbounds float* %tmp13880, i64 1
+ %tmp13882 = getelementptr inbounds float* %tmp13881, i64 1
+ %tmp13883 = getelementptr inbounds float* %tmp13882, i64 1
+ %tmp13884 = getelementptr inbounds float* %tmp13883, i64 1
+ %tmp13885 = getelementptr inbounds float* %tmp13884, i64 1
+ %tmp13886 = getelementptr inbounds float* %tmp13885, i64 1
+ %tmp13887 = getelementptr inbounds float* %tmp13886, i64 1
+ %tmp13888 = getelementptr inbounds float* %tmp13887, i64 1
+ %tmp13889 = getelementptr inbounds float* %tmp13888, i64 1
+ %tmp13890 = getelementptr inbounds float* %tmp13889, i64 1
+ %tmp13891 = getelementptr inbounds float* %tmp13890, i64 1
+ %tmp13892 = getelementptr inbounds float* %tmp13891, i64 1
+ %tmp13893 = getelementptr inbounds float* %tmp13892, i64 1
+ %tmp13894 = getelementptr inbounds float* %tmp13893, i64 1
+ %tmp13895 = getelementptr inbounds float* %tmp13894, i64 1
+ %tmp13896 = getelementptr inbounds float* %tmp13895, i64 1
+ %tmp13897 = getelementptr inbounds float* %tmp13896, i64 1
+ %tmp13898 = getelementptr inbounds float* %tmp13897, i64 1
+ %tmp13899 = getelementptr inbounds float* %tmp13898, i64 1
+ %tmp13900 = getelementptr inbounds float* %tmp13899, i64 1
+ %tmp13901 = getelementptr inbounds float* %tmp13900, i64 1
+ %tmp13902 = getelementptr inbounds float* %tmp13901, i64 1
+ %tmp13903 = getelementptr inbounds float* %tmp13902, i64 1
+ %tmp13904 = getelementptr inbounds float* %tmp13903, i64 1
+ %tmp13905 = getelementptr inbounds float* %tmp13904, i64 1
+ %tmp13906 = getelementptr inbounds float* %tmp13905, i64 1
+ %tmp13907 = getelementptr inbounds float* %tmp13906, i64 1
+ %tmp13908 = getelementptr inbounds float* %tmp13907, i64 1
+ %tmp13909 = getelementptr inbounds float* %tmp13908, i64 1
+ %tmp13910 = getelementptr inbounds float* %tmp13909, i64 1
+ %tmp13911 = getelementptr inbounds float* %tmp13910, i64 1
+ %tmp13912 = getelementptr inbounds float* %tmp13911, i64 1
+ %tmp13913 = getelementptr inbounds float* %tmp13912, i64 1
+ %tmp13914 = getelementptr inbounds float* %tmp13913, i64 1
+ %tmp13915 = getelementptr inbounds float* %tmp13914, i64 1
+ %tmp13916 = getelementptr inbounds float* %tmp13915, i64 1
+ %tmp13917 = getelementptr inbounds float* %tmp13916, i64 1
+ %tmp13918 = getelementptr inbounds float* %tmp13917, i64 1
+ %tmp13919 = getelementptr inbounds float* %tmp13918, i64 1
+ %tmp13920 = getelementptr inbounds float* %tmp13919, i64 1
+ %tmp13921 = getelementptr inbounds float* %tmp13920, i64 1
+ %tmp13922 = getelementptr inbounds float* %tmp13921, i64 1
+ %tmp13923 = getelementptr inbounds float* %tmp13922, i64 1
+ %tmp13924 = getelementptr inbounds float* %tmp13923, i64 1
+ %tmp13925 = getelementptr inbounds float* %tmp13924, i64 1
+ %tmp13926 = getelementptr inbounds float* %tmp13925, i64 1
+ %tmp13927 = getelementptr inbounds float* %tmp13926, i64 1
+ %tmp13928 = getelementptr inbounds float* %tmp13927, i64 1
+ %tmp13929 = getelementptr inbounds float* %tmp13928, i64 1
+ %tmp13930 = getelementptr inbounds float* %tmp13929, i64 1
+ %tmp13931 = getelementptr inbounds float* %tmp13930, i64 1
+ %tmp13932 = getelementptr inbounds float* %tmp13931, i64 1
+ %tmp13933 = getelementptr inbounds float* %tmp13932, i64 1
+ %tmp13934 = getelementptr inbounds float* %tmp13933, i64 1
+ %tmp13935 = getelementptr inbounds float* %tmp13934, i64 1
+ %tmp13936 = getelementptr inbounds float* %tmp13935, i64 1
+ %tmp13937 = getelementptr inbounds float* %tmp13936, i64 1
+ %tmp13938 = getelementptr inbounds float* %tmp13937, i64 1
+ %tmp13939 = getelementptr inbounds float* %tmp13938, i64 1
+ %tmp13940 = getelementptr inbounds float* %tmp13939, i64 1
+ %tmp13941 = getelementptr inbounds float* %tmp13940, i64 1
+ %tmp13942 = getelementptr inbounds float* %tmp13941, i64 1
+ %tmp13943 = getelementptr inbounds float* %tmp13942, i64 1
+ %tmp13944 = getelementptr inbounds float* %tmp13943, i64 1
+ %tmp13945 = getelementptr inbounds float* %tmp13944, i64 1
+ %tmp13946 = getelementptr inbounds float* %tmp13945, i64 1
+ %tmp13947 = getelementptr inbounds float* %tmp13946, i64 1
+ %tmp13948 = getelementptr inbounds float* %tmp13947, i64 1
+ %tmp13949 = getelementptr inbounds float* %tmp13948, i64 1
+ %tmp13950 = getelementptr inbounds float* %tmp13949, i64 1
+ %tmp13951 = getelementptr inbounds float* %tmp13950, i64 1
+ %tmp13952 = getelementptr inbounds float* %tmp13951, i64 1
+ %tmp13953 = getelementptr inbounds float* %tmp13952, i64 1
+ %tmp13954 = getelementptr inbounds float* %tmp13953, i64 1
+ %tmp13955 = getelementptr inbounds float* %tmp13954, i64 1
+ %tmp13956 = getelementptr inbounds float* %tmp13955, i64 1
+ %tmp13957 = getelementptr inbounds float* %tmp13956, i64 1
+ %tmp13958 = getelementptr inbounds float* %tmp13957, i64 1
+ %tmp13959 = getelementptr inbounds float* %tmp13958, i64 1
+ %tmp13960 = getelementptr inbounds float* %tmp13959, i64 1
+ %tmp13961 = getelementptr inbounds float* %tmp13960, i64 1
+ %tmp13962 = getelementptr inbounds float* %tmp13961, i64 1
+ %tmp13963 = getelementptr inbounds float* %tmp13962, i64 1
+ %tmp13964 = getelementptr inbounds float* %tmp13963, i64 1
+ %tmp13965 = getelementptr inbounds float* %tmp13964, i64 1
+ %tmp13966 = getelementptr inbounds float* %tmp13965, i64 1
+ %tmp13967 = getelementptr inbounds float* %tmp13966, i64 1
+ %tmp13968 = getelementptr inbounds float* %tmp13967, i64 1
+ %tmp13969 = getelementptr inbounds float* %tmp13968, i64 1
+ %tmp13970 = getelementptr inbounds float* %tmp13969, i64 1
+ %tmp13971 = getelementptr inbounds float* %tmp13970, i64 1
+ %tmp13972 = getelementptr inbounds float* %tmp13971, i64 1
+ %tmp13973 = getelementptr inbounds float* %tmp13972, i64 1
+ %tmp13974 = getelementptr inbounds float* %tmp13973, i64 1
+ %tmp13975 = getelementptr inbounds float* %tmp13974, i64 1
+ %tmp13976 = getelementptr inbounds float* %tmp13975, i64 1
+ %tmp13977 = getelementptr inbounds float* %tmp13976, i64 1
+ %tmp13978 = getelementptr inbounds float* %tmp13977, i64 1
+ %tmp13979 = getelementptr inbounds float* %tmp13978, i64 1
+ %tmp13980 = getelementptr inbounds float* %tmp13979, i64 1
+ %tmp13981 = getelementptr inbounds float* %tmp13980, i64 1
+ %tmp13982 = getelementptr inbounds float* %tmp13981, i64 1
+ %tmp13983 = getelementptr inbounds float* %tmp13982, i64 1
+ %tmp13984 = getelementptr inbounds float* %tmp13983, i64 1
+ %tmp13985 = getelementptr inbounds float* %tmp13984, i64 1
+ %tmp13986 = getelementptr inbounds float* %tmp13985, i64 1
+ %tmp13987 = getelementptr inbounds float* %tmp13986, i64 1
+ %tmp13988 = getelementptr inbounds float* %tmp13987, i64 1
+ %tmp13989 = getelementptr inbounds float* %tmp13988, i64 1
+ %tmp13990 = getelementptr inbounds float* %tmp13989, i64 1
+ %tmp13991 = getelementptr inbounds float* %tmp13990, i64 1
+ %tmp13992 = getelementptr inbounds float* %tmp13991, i64 1
+ %tmp13993 = getelementptr inbounds float* %tmp13992, i64 1
+ %tmp13994 = getelementptr inbounds float* %tmp13993, i64 1
+ %tmp13995 = getelementptr inbounds float* %tmp13994, i64 1
+ %tmp13996 = getelementptr inbounds float* %tmp13995, i64 1
+ %tmp13997 = getelementptr inbounds float* %tmp13996, i64 1
+ %tmp13998 = getelementptr inbounds float* %tmp13997, i64 1
+ %tmp13999 = getelementptr inbounds float* %tmp13998, i64 1
+ %tmp14000 = getelementptr inbounds float* %tmp13999, i64 1
+ %tmp14001 = getelementptr inbounds float* %tmp14000, i64 1
+ %tmp14002 = getelementptr inbounds float* %tmp14001, i64 1
+ %tmp14003 = getelementptr inbounds float* %tmp14002, i64 1
+ %tmp14004 = getelementptr inbounds float* %tmp14003, i64 1
+ %tmp14005 = getelementptr inbounds float* %tmp14004, i64 1
+ %tmp14006 = getelementptr inbounds float* %tmp14005, i64 1
+ %tmp14007 = getelementptr inbounds float* %tmp14006, i64 1
+ %tmp14008 = getelementptr inbounds float* %tmp14007, i64 1
+ %tmp14009 = getelementptr inbounds float* %tmp14008, i64 1
+ %tmp14010 = getelementptr inbounds float* %tmp14009, i64 1
+ %tmp14011 = getelementptr inbounds float* %tmp14010, i64 1
+ %tmp14012 = getelementptr inbounds float* %tmp14011, i64 1
+ %tmp14013 = getelementptr inbounds float* %tmp14012, i64 1
+ %tmp14014 = getelementptr inbounds float* %tmp14013, i64 1
+ %tmp14015 = getelementptr inbounds float* %tmp14014, i64 1
+ %tmp14016 = getelementptr inbounds float* %tmp14015, i64 1
+ %tmp14017 = getelementptr inbounds float* %tmp14016, i64 1
+ %tmp14018 = getelementptr inbounds float* %tmp14017, i64 1
+ %tmp14019 = getelementptr inbounds float* %tmp14018, i64 1
+ %tmp14020 = getelementptr inbounds float* %tmp14019, i64 1
+ %tmp14021 = getelementptr inbounds float* %tmp14020, i64 1
+ %tmp14022 = getelementptr inbounds float* %tmp14021, i64 1
+ %tmp14023 = getelementptr inbounds float* %tmp14022, i64 1
+ %tmp14024 = getelementptr inbounds float* %tmp14023, i64 1
+ %tmp14025 = getelementptr inbounds float* %tmp14024, i64 1
+ %tmp14026 = getelementptr inbounds float* %tmp14025, i64 1
+ %tmp14027 = getelementptr inbounds float* %tmp14026, i64 1
+ %tmp14028 = getelementptr inbounds float* %tmp14027, i64 1
+ %tmp14029 = getelementptr inbounds float* %tmp14028, i64 1
+ %tmp14030 = getelementptr inbounds float* %tmp14029, i64 1
+ %tmp14031 = getelementptr inbounds float* %tmp14030, i64 1
+ %tmp14032 = getelementptr inbounds float* %tmp14031, i64 1
+ %tmp14033 = getelementptr inbounds float* %tmp14032, i64 1
+ %tmp14034 = getelementptr inbounds float* %tmp14033, i64 1
+ %tmp14035 = getelementptr inbounds float* %tmp14034, i64 1
+ %tmp14036 = getelementptr inbounds float* %tmp14035, i64 1
+ %tmp14037 = getelementptr inbounds float* %tmp14036, i64 1
+ %tmp14038 = getelementptr inbounds float* %tmp14037, i64 1
+ %tmp14039 = getelementptr inbounds float* %tmp14038, i64 1
+ %tmp14040 = getelementptr inbounds float* %tmp14039, i64 1
+ %tmp14041 = getelementptr inbounds float* %tmp14040, i64 1
+ %tmp14042 = getelementptr inbounds float* %tmp14041, i64 1
+ %tmp14043 = getelementptr inbounds float* %tmp14042, i64 1
+ %tmp14044 = getelementptr inbounds float* %tmp14043, i64 1
+ %tmp14045 = getelementptr inbounds float* %tmp14044, i64 1
+ %tmp14046 = getelementptr inbounds float* %tmp14045, i64 1
+ %tmp14047 = getelementptr inbounds float* %tmp14046, i64 1
+ %tmp14048 = getelementptr inbounds float* %tmp14047, i64 1
+ %tmp14049 = getelementptr inbounds float* %tmp14048, i64 1
+ %tmp14050 = getelementptr inbounds float* %tmp14049, i64 1
+ %tmp14051 = getelementptr inbounds float* %tmp14050, i64 1
+ %tmp14052 = getelementptr inbounds float* %tmp14051, i64 1
+ %tmp14053 = getelementptr inbounds float* %tmp14052, i64 1
+ %tmp14054 = getelementptr inbounds float* %tmp14053, i64 1
+ %tmp14055 = getelementptr inbounds float* %tmp14054, i64 1
+ %tmp14056 = getelementptr inbounds float* %tmp14055, i64 1
+ %tmp14057 = getelementptr inbounds float* %tmp14056, i64 1
+ %tmp14058 = getelementptr inbounds float* %tmp14057, i64 1
+ %tmp14059 = getelementptr inbounds float* %tmp14058, i64 1
+ %tmp14060 = getelementptr inbounds float* %tmp14059, i64 1
+ %tmp14061 = getelementptr inbounds float* %tmp14060, i64 1
+ %tmp14062 = getelementptr inbounds float* %tmp14061, i64 1
+ %tmp14063 = getelementptr inbounds float* %tmp14062, i64 1
+ %tmp14064 = getelementptr inbounds float* %tmp14063, i64 1
+ %tmp14065 = getelementptr inbounds float* %tmp14064, i64 1
+ %tmp14066 = getelementptr inbounds float* %tmp14065, i64 1
+ %tmp14067 = getelementptr inbounds float* %tmp14066, i64 1
+ %tmp14068 = getelementptr inbounds float* %tmp14067, i64 1
+ %tmp14069 = getelementptr inbounds float* %tmp14068, i64 1
+ %tmp14070 = getelementptr inbounds float* %tmp14069, i64 1
+ %tmp14071 = getelementptr inbounds float* %tmp14070, i64 1
+ %tmp14072 = getelementptr inbounds float* %tmp14071, i64 1
+ %tmp14073 = getelementptr inbounds float* %tmp14072, i64 1
+ %tmp14074 = getelementptr inbounds float* %tmp14073, i64 1
+ %tmp14075 = getelementptr inbounds float* %tmp14074, i64 1
+ %tmp14076 = getelementptr inbounds float* %tmp14075, i64 1
+ %tmp14077 = getelementptr inbounds float* %tmp14076, i64 1
+ %tmp14078 = getelementptr inbounds float* %tmp14077, i64 1
+ %tmp14079 = getelementptr inbounds float* %tmp14078, i64 1
+ %tmp14080 = getelementptr inbounds float* %tmp14079, i64 1
+ %tmp14081 = getelementptr inbounds float* %tmp14080, i64 1
+ %tmp14082 = getelementptr inbounds float* %tmp14081, i64 1
+ %tmp14083 = getelementptr inbounds float* %tmp14082, i64 1
+ %tmp14084 = getelementptr inbounds float* %tmp14083, i64 1
+ %tmp14085 = getelementptr inbounds float* %tmp14084, i64 1
+ %tmp14086 = getelementptr inbounds float* %tmp14085, i64 1
+ %tmp14087 = getelementptr inbounds float* %tmp14086, i64 1
+ %tmp14088 = getelementptr inbounds float* %tmp14087, i64 1
+ %tmp14089 = getelementptr inbounds float* %tmp14088, i64 1
+ %tmp14090 = getelementptr inbounds float* %tmp14089, i64 1
+ %tmp14091 = getelementptr inbounds float* %tmp14090, i64 1
+ %tmp14092 = getelementptr inbounds float* %tmp14091, i64 1
+ %tmp14093 = getelementptr inbounds float* %tmp14092, i64 1
+ %tmp14094 = getelementptr inbounds float* %tmp14093, i64 1
+ %tmp14095 = getelementptr inbounds float* %tmp14094, i64 1
+ %tmp14096 = getelementptr inbounds float* %tmp14095, i64 1
+ %tmp14097 = getelementptr inbounds float* %tmp14096, i64 1
+ %tmp14098 = getelementptr inbounds float* %tmp14097, i64 1
+ %tmp14099 = getelementptr inbounds float* %tmp14098, i64 1
+ %tmp14100 = getelementptr inbounds float* %tmp14099, i64 1
+ %tmp14101 = getelementptr inbounds float* %tmp14100, i64 1
+ %tmp14102 = getelementptr inbounds float* %tmp14101, i64 1
+ %tmp14103 = getelementptr inbounds float* %tmp14102, i64 1
+ %tmp14104 = getelementptr inbounds float* %tmp14103, i64 1
+ %tmp14105 = getelementptr inbounds float* %tmp14104, i64 1
+ %tmp14106 = getelementptr inbounds float* %tmp14105, i64 1
+ %tmp14107 = getelementptr inbounds float* %tmp14106, i64 1
+ %tmp14108 = getelementptr inbounds float* %tmp14107, i64 1
+ %tmp14109 = getelementptr inbounds float* %tmp14108, i64 1
+ %tmp14110 = getelementptr inbounds float* %tmp14109, i64 1
+ %tmp14111 = getelementptr inbounds float* %tmp14110, i64 1
+ %tmp14112 = getelementptr inbounds float* %tmp14111, i64 1
+ %tmp14113 = getelementptr inbounds float* %tmp14112, i64 1
+ %tmp14114 = getelementptr inbounds float* %tmp14113, i64 1
+ %tmp14115 = getelementptr inbounds float* %tmp14114, i64 1
+ %tmp14116 = getelementptr inbounds float* %tmp14115, i64 1
+ %tmp14117 = getelementptr inbounds float* %tmp14116, i64 1
+ %tmp14118 = getelementptr inbounds float* %tmp14117, i64 1
+ %tmp14119 = getelementptr inbounds float* %tmp14118, i64 1
+ %tmp14120 = getelementptr inbounds float* %tmp14119, i64 1
+ %tmp14121 = getelementptr inbounds float* %tmp14120, i64 1
+ %tmp14122 = getelementptr inbounds float* %tmp14121, i64 1
+ %tmp14123 = getelementptr inbounds float* %tmp14122, i64 1
+ %tmp14124 = getelementptr inbounds float* %tmp14123, i64 1
+ %tmp14125 = getelementptr inbounds float* %tmp14124, i64 1
+ %tmp14126 = getelementptr inbounds float* %tmp14125, i64 1
+ %tmp14127 = getelementptr inbounds float* %tmp14126, i64 1
+ %tmp14128 = getelementptr inbounds float* %tmp14127, i64 1
+ %tmp14129 = getelementptr inbounds float* %tmp14128, i64 1
+ %tmp14130 = getelementptr inbounds float* %tmp14129, i64 1
+ %tmp14131 = getelementptr inbounds float* %tmp14130, i64 1
+ %tmp14132 = getelementptr inbounds float* %tmp14131, i64 1
+ %tmp14133 = getelementptr inbounds float* %tmp14132, i64 1
+ %tmp14134 = getelementptr inbounds float* %tmp14133, i64 1
+ %tmp14135 = getelementptr inbounds float* %tmp14134, i64 1
+ %tmp14136 = getelementptr inbounds float* %tmp14135, i64 1
+ %tmp14137 = getelementptr inbounds float* %tmp14136, i64 1
+ %tmp14138 = getelementptr inbounds float* %tmp14137, i64 1
+ %tmp14139 = getelementptr inbounds float* %tmp14138, i64 1
+ %tmp14140 = getelementptr inbounds float* %tmp14139, i64 1
+ %tmp14141 = getelementptr inbounds float* %tmp14140, i64 1
+ %tmp14142 = getelementptr inbounds float* %tmp14141, i64 1
+ %tmp14143 = getelementptr inbounds float* %tmp14142, i64 1
+ %tmp14144 = getelementptr inbounds float* %tmp14143, i64 1
+ %tmp14145 = getelementptr inbounds float* %tmp14144, i64 1
+ %tmp14146 = getelementptr inbounds float* %tmp14145, i64 1
+ %tmp14147 = getelementptr inbounds float* %tmp14146, i64 1
+ %tmp14148 = getelementptr inbounds float* %tmp14147, i64 1
+ %tmp14149 = getelementptr inbounds float* %tmp14148, i64 1
+ %tmp14150 = getelementptr inbounds float* %tmp14149, i64 1
+ %tmp14151 = getelementptr inbounds float* %tmp14150, i64 1
+ %tmp14152 = getelementptr inbounds float* %tmp14151, i64 1
+ %tmp14153 = getelementptr inbounds float* %tmp14152, i64 1
+ %tmp14154 = getelementptr inbounds float* %tmp14153, i64 1
+ %tmp14155 = getelementptr inbounds float* %tmp14154, i64 1
+ %tmp14156 = getelementptr inbounds float* %tmp14155, i64 1
+ %tmp14157 = getelementptr inbounds float* %tmp14156, i64 1
+ %tmp14158 = getelementptr inbounds float* %tmp14157, i64 1
+ %tmp14159 = getelementptr inbounds float* %tmp14158, i64 1
+ %tmp14160 = getelementptr inbounds float* %tmp14159, i64 1
+ %tmp14161 = getelementptr inbounds float* %tmp14160, i64 1
+ %tmp14162 = getelementptr inbounds float* %tmp14161, i64 1
+ %tmp14163 = getelementptr inbounds float* %tmp14162, i64 1
+ %tmp14164 = getelementptr inbounds float* %tmp14163, i64 1
+ %tmp14165 = getelementptr inbounds float* %tmp14164, i64 1
+ %tmp14166 = getelementptr inbounds float* %tmp14165, i64 1
+ %tmp14167 = getelementptr inbounds float* %tmp14166, i64 1
+ %tmp14168 = getelementptr inbounds float* %tmp14167, i64 1
+ %tmp14169 = getelementptr inbounds float* %tmp14168, i64 1
+ %tmp14170 = getelementptr inbounds float* %tmp14169, i64 1
+ %tmp14171 = getelementptr inbounds float* %tmp14170, i64 1
+ %tmp14172 = getelementptr inbounds float* %tmp14171, i64 1
+ %tmp14173 = getelementptr inbounds float* %tmp14172, i64 1
+ %tmp14174 = getelementptr inbounds float* %tmp14173, i64 1
+ %tmp14175 = getelementptr inbounds float* %tmp14174, i64 1
+ %tmp14176 = getelementptr inbounds float* %tmp14175, i64 1
+ %tmp14177 = getelementptr inbounds float* %tmp14176, i64 1
+ %tmp14178 = getelementptr inbounds float* %tmp14177, i64 1
+ %tmp14179 = getelementptr inbounds float* %tmp14178, i64 1
+ %tmp14180 = getelementptr inbounds float* %tmp14179, i64 1
+ %tmp14181 = getelementptr inbounds float* %tmp14180, i64 1
+ %tmp14182 = getelementptr inbounds float* %tmp14181, i64 1
+ %tmp14183 = getelementptr inbounds float* %tmp14182, i64 1
+ %tmp14184 = getelementptr inbounds float* %tmp14183, i64 1
+ %tmp14185 = getelementptr inbounds float* %tmp14184, i64 1
+ %tmp14186 = getelementptr inbounds float* %tmp14185, i64 1
+ %tmp14187 = getelementptr inbounds float* %tmp14186, i64 1
+ %tmp14188 = getelementptr inbounds float* %tmp14187, i64 1
+ %tmp14189 = getelementptr inbounds float* %tmp14188, i64 1
+ %tmp14190 = getelementptr inbounds float* %tmp14189, i64 1
+ %tmp14191 = getelementptr inbounds float* %tmp14190, i64 1
+ %tmp14192 = getelementptr inbounds float* %tmp14191, i64 1
+ %tmp14193 = getelementptr inbounds float* %tmp14192, i64 1
+ %tmp14194 = getelementptr inbounds float* %tmp14193, i64 1
+ %tmp14195 = getelementptr inbounds float* %tmp14194, i64 1
+ %tmp14196 = getelementptr inbounds float* %tmp14195, i64 1
+ %tmp14197 = getelementptr inbounds float* %tmp14196, i64 1
+ %tmp14198 = getelementptr inbounds float* %tmp14197, i64 1
+ %tmp14199 = getelementptr inbounds float* %tmp14198, i64 1
+ %tmp14200 = getelementptr inbounds float* %tmp14199, i64 1
+ %tmp14201 = getelementptr inbounds float* %tmp14200, i64 1
+ %tmp14202 = getelementptr inbounds float* %tmp14201, i64 1
+ %tmp14203 = getelementptr inbounds float* %tmp14202, i64 1
+ %tmp14204 = getelementptr inbounds float* %tmp14203, i64 1
+ %tmp14205 = getelementptr inbounds float* %tmp14204, i64 1
+ %tmp14206 = getelementptr inbounds float* %tmp14205, i64 1
+ %tmp14207 = getelementptr inbounds float* %tmp14206, i64 1
+ %tmp14208 = getelementptr inbounds float* %tmp14207, i64 1
+ %tmp14209 = getelementptr inbounds float* %tmp14208, i64 1
+ %tmp14210 = getelementptr inbounds float* %tmp14209, i64 1
+ %tmp14211 = getelementptr inbounds float* %tmp14210, i64 1
+ %tmp14212 = getelementptr inbounds float* %tmp14211, i64 1
+ %tmp14213 = getelementptr inbounds float* %tmp14212, i64 1
+ %tmp14214 = getelementptr inbounds float* %tmp14213, i64 1
+ %tmp14215 = getelementptr inbounds float* %tmp14214, i64 1
+ %tmp14216 = getelementptr inbounds float* %tmp14215, i64 1
+ %tmp14217 = getelementptr inbounds float* %tmp14216, i64 1
+ %tmp14218 = getelementptr inbounds float* %tmp14217, i64 1
+ %tmp14219 = getelementptr inbounds float* %tmp14218, i64 1
+ %tmp14220 = getelementptr inbounds float* %tmp14219, i64 1
+ %tmp14221 = getelementptr inbounds float* %tmp14220, i64 1
+ %tmp14222 = getelementptr inbounds float* %tmp14221, i64 1
+ %tmp14223 = getelementptr inbounds float* %tmp14222, i64 1
+ %tmp14224 = getelementptr inbounds float* %tmp14223, i64 1
+ %tmp14225 = getelementptr inbounds float* %tmp14224, i64 1
+ %tmp14226 = getelementptr inbounds float* %tmp14225, i64 1
+ %tmp14227 = getelementptr inbounds float* %tmp14226, i64 1
+ %tmp14228 = getelementptr inbounds float* %tmp14227, i64 1
+ %tmp14229 = getelementptr inbounds float* %tmp14228, i64 1
+ %tmp14230 = getelementptr inbounds float* %tmp14229, i64 1
+ %tmp14231 = getelementptr inbounds float* %tmp14230, i64 1
+ %tmp14232 = getelementptr inbounds float* %tmp14231, i64 1
+ %tmp14233 = getelementptr inbounds float* %tmp14232, i64 1
+ %tmp14234 = getelementptr inbounds float* %tmp14233, i64 1
+ %tmp14235 = getelementptr inbounds float* %tmp14234, i64 1
+ %tmp14236 = getelementptr inbounds float* %tmp14235, i64 1
+ %tmp14237 = getelementptr inbounds float* %tmp14236, i64 1
+ %tmp14238 = getelementptr inbounds float* %tmp14237, i64 1
+ %tmp14239 = getelementptr inbounds float* %tmp14238, i64 1
+ %tmp14240 = getelementptr inbounds float* %tmp14239, i64 1
+ %tmp14241 = getelementptr inbounds float* %tmp14240, i64 1
+ %tmp14242 = getelementptr inbounds float* %tmp14241, i64 1
+ %tmp14243 = getelementptr inbounds float* %tmp14242, i64 1
+ %tmp14244 = getelementptr inbounds float* %tmp14243, i64 1
+ %tmp14245 = getelementptr inbounds float* %tmp14244, i64 1
+ %tmp14246 = getelementptr inbounds float* %tmp14245, i64 1
+ %tmp14247 = getelementptr inbounds float* %tmp14246, i64 1
+ %tmp14248 = getelementptr inbounds float* %tmp14247, i64 1
+ %tmp14249 = getelementptr inbounds float* %tmp14248, i64 1
+ %tmp14250 = getelementptr inbounds float* %tmp14249, i64 1
+ %tmp14251 = getelementptr inbounds float* %tmp14250, i64 1
+ %tmp14252 = getelementptr inbounds float* %tmp14251, i64 1
+ %tmp14253 = getelementptr inbounds float* %tmp14252, i64 1
+ %tmp14254 = getelementptr inbounds float* %tmp14253, i64 1
+ %tmp14255 = getelementptr inbounds float* %tmp14254, i64 1
+ %tmp14256 = getelementptr inbounds float* %tmp14255, i64 1
+ %tmp14257 = getelementptr inbounds float* %tmp14256, i64 1
+ %tmp14258 = getelementptr inbounds float* %tmp14257, i64 1
+ %tmp14259 = getelementptr inbounds float* %tmp14258, i64 1
+ %tmp14260 = getelementptr inbounds float* %tmp14259, i64 1
+ %tmp14261 = getelementptr inbounds float* %tmp14260, i64 1
+ %tmp14262 = getelementptr inbounds float* %tmp14261, i64 1
+ %tmp14263 = getelementptr inbounds float* %tmp14262, i64 1
+ %tmp14264 = getelementptr inbounds float* %tmp14263, i64 1
+ %tmp14265 = getelementptr inbounds float* %tmp14264, i64 1
+ %tmp14266 = getelementptr inbounds float* %tmp14265, i64 1
+ %tmp14267 = getelementptr inbounds float* %tmp14266, i64 1
+ %tmp14268 = getelementptr inbounds float* %tmp14267, i64 1
+ %tmp14269 = getelementptr inbounds float* %tmp14268, i64 1
+ %tmp14270 = getelementptr inbounds float* %tmp14269, i64 1
+ %tmp14271 = getelementptr inbounds float* %tmp14270, i64 1
+ %tmp14272 = getelementptr inbounds float* %tmp14271, i64 1
+ %tmp14273 = getelementptr inbounds float* %tmp14272, i64 1
+ %tmp14274 = getelementptr inbounds float* %tmp14273, i64 1
+ %tmp14275 = getelementptr inbounds float* %tmp14274, i64 1
+ %tmp14276 = getelementptr inbounds float* %tmp14275, i64 1
+ %tmp14277 = getelementptr inbounds float* %tmp14276, i64 1
+ %tmp14278 = getelementptr inbounds float* %tmp14277, i64 1
+ %tmp14279 = getelementptr inbounds float* %tmp14278, i64 1
+ %tmp14280 = getelementptr inbounds float* %tmp14279, i64 1
+ %tmp14281 = getelementptr inbounds float* %tmp14280, i64 1
+ %tmp14282 = getelementptr inbounds float* %tmp14281, i64 1
+ %tmp14283 = getelementptr inbounds float* %tmp14282, i64 1
+ %tmp14284 = getelementptr inbounds float* %tmp14283, i64 1
+ %tmp14285 = getelementptr inbounds float* %tmp14284, i64 1
+ %tmp14286 = getelementptr inbounds float* %tmp14285, i64 1
+ %tmp14287 = getelementptr inbounds float* %tmp14286, i64 1
+ %tmp14288 = getelementptr inbounds float* %tmp14287, i64 1
+ %tmp14289 = getelementptr inbounds float* %tmp14288, i64 1
+ %tmp14290 = getelementptr inbounds float* %tmp14289, i64 1
+ %tmp14291 = getelementptr inbounds float* %tmp14290, i64 1
+ %tmp14292 = getelementptr inbounds float* %tmp14291, i64 1
+ %tmp14293 = getelementptr inbounds float* %tmp14292, i64 1
+ %tmp14294 = getelementptr inbounds float* %tmp14293, i64 1
+ %tmp14295 = getelementptr inbounds float* %tmp14294, i64 1
+ %tmp14296 = getelementptr inbounds float* %tmp14295, i64 1
+ %tmp14297 = getelementptr inbounds float* %tmp14296, i64 1
+ %tmp14298 = getelementptr inbounds float* %tmp14297, i64 1
+ %tmp14299 = getelementptr inbounds float* %tmp14298, i64 1
+ %tmp14300 = getelementptr inbounds float* %tmp14299, i64 1
+ %tmp14301 = getelementptr inbounds float* %tmp14300, i64 1
+ %tmp14302 = getelementptr inbounds float* %tmp14301, i64 1
+ %tmp14303 = getelementptr inbounds float* %tmp14302, i64 1
+ %tmp14304 = getelementptr inbounds float* %tmp14303, i64 1
+ %tmp14305 = getelementptr inbounds float* %tmp14304, i64 1
+ %tmp14306 = getelementptr inbounds float* %tmp14305, i64 1
+ %tmp14307 = getelementptr inbounds float* %tmp14306, i64 1
+ %tmp14308 = getelementptr inbounds float* %tmp14307, i64 1
+ %tmp14309 = getelementptr inbounds float* %tmp14308, i64 1
+ %tmp14310 = getelementptr inbounds float* %tmp14309, i64 1
+ %tmp14311 = getelementptr inbounds float* %tmp14310, i64 1
+ %tmp14312 = getelementptr inbounds float* %tmp14311, i64 1
+ %tmp14313 = getelementptr inbounds float* %tmp14312, i64 1
+ %tmp14314 = getelementptr inbounds float* %tmp14313, i64 1
+ %tmp14315 = getelementptr inbounds float* %tmp14314, i64 1
+ %tmp14316 = getelementptr inbounds float* %tmp14315, i64 1
+ %tmp14317 = getelementptr inbounds float* %tmp14316, i64 1
+ %tmp14318 = getelementptr inbounds float* %tmp14317, i64 1
+ %tmp14319 = getelementptr inbounds float* %tmp14318, i64 1
+ %tmp14320 = getelementptr inbounds float* %tmp14319, i64 1
+ %tmp14321 = getelementptr inbounds float* %tmp14320, i64 1
+ %tmp14322 = getelementptr inbounds float* %tmp14321, i64 1
+ %tmp14323 = getelementptr inbounds float* %tmp14322, i64 1
+ %tmp14324 = getelementptr inbounds float* %tmp14323, i64 1
+ %tmp14325 = getelementptr inbounds float* %tmp14324, i64 1
+ %tmp14326 = getelementptr inbounds float* %tmp14325, i64 1
+ %tmp14327 = getelementptr inbounds float* %tmp14326, i64 1
+ %tmp14328 = getelementptr inbounds float* %tmp14327, i64 1
+ %tmp14329 = getelementptr inbounds float* %tmp14328, i64 1
+ %tmp14330 = getelementptr inbounds float* %tmp14329, i64 1
+ %tmp14331 = getelementptr inbounds float* %tmp14330, i64 1
+ %tmp14332 = getelementptr inbounds float* %tmp14331, i64 1
+ %tmp14333 = getelementptr inbounds float* %tmp14332, i64 1
+ %tmp14334 = getelementptr inbounds float* %tmp14333, i64 1
+ %tmp14335 = getelementptr inbounds float* %tmp14334, i64 1
+ %tmp14336 = getelementptr inbounds float* %tmp14335, i64 1
+ %tmp14337 = getelementptr inbounds float* %tmp14336, i64 1
+ %tmp14338 = getelementptr inbounds float* %tmp14337, i64 1
+ %tmp14339 = getelementptr inbounds float* %tmp14338, i64 1
+ %tmp14340 = getelementptr inbounds float* %tmp14339, i64 1
+ %tmp14341 = getelementptr inbounds float* %tmp14340, i64 1
+ %tmp14342 = getelementptr inbounds float* %tmp14341, i64 1
+ %tmp14343 = getelementptr inbounds float* %tmp14342, i64 1
+ %tmp14344 = getelementptr inbounds float* %tmp14343, i64 1
+ %tmp14345 = getelementptr inbounds float* %tmp14344, i64 1
+ %tmp14346 = getelementptr inbounds float* %tmp14345, i64 1
+ %tmp14347 = getelementptr inbounds float* %tmp14346, i64 1
+ %tmp14348 = getelementptr inbounds float* %tmp14347, i64 1
+ %tmp14349 = getelementptr inbounds float* %tmp14348, i64 1
+ %tmp14350 = getelementptr inbounds float* %tmp14349, i64 1
+ %tmp14351 = getelementptr inbounds float* %tmp14350, i64 1
+ %tmp14352 = getelementptr inbounds float* %tmp14351, i64 1
+ %tmp14353 = getelementptr inbounds float* %tmp14352, i64 1
+ %tmp14354 = getelementptr inbounds float* %tmp14353, i64 1
+ %tmp14355 = getelementptr inbounds float* %tmp14354, i64 1
+ %tmp14356 = getelementptr inbounds float* %tmp14355, i64 1
+ %tmp14357 = getelementptr inbounds float* %tmp14356, i64 1
+ %tmp14358 = getelementptr inbounds float* %tmp14357, i64 1
+ %tmp14359 = getelementptr inbounds float* %tmp14358, i64 1
+ %tmp14360 = getelementptr inbounds float* %tmp14359, i64 1
+ %tmp14361 = getelementptr inbounds float* %tmp14360, i64 1
+ %tmp14362 = getelementptr inbounds float* %tmp14361, i64 1
+ %tmp14363 = getelementptr inbounds float* %tmp14362, i64 1
+ %tmp14364 = getelementptr inbounds float* %tmp14363, i64 1
+ %tmp14365 = getelementptr inbounds float* %tmp14364, i64 1
+ %tmp14366 = getelementptr inbounds float* %tmp14365, i64 1
+ %tmp14367 = getelementptr inbounds float* %tmp14366, i64 1
+ %tmp14368 = getelementptr inbounds float* %tmp14367, i64 1
+ %tmp14369 = getelementptr inbounds float* %tmp14368, i64 1
+ %tmp14370 = getelementptr inbounds float* %tmp14369, i64 1
+ %tmp14371 = getelementptr inbounds float* %tmp14370, i64 1
+ %tmp14372 = getelementptr inbounds float* %tmp14371, i64 1
+ %tmp14373 = getelementptr inbounds float* %tmp14372, i64 1
+ %tmp14374 = getelementptr inbounds float* %tmp14373, i64 1
+ %tmp14375 = getelementptr inbounds float* %tmp14374, i64 1
+ %tmp14376 = getelementptr inbounds float* %tmp14375, i64 1
+ %tmp14377 = getelementptr inbounds float* %tmp14376, i64 1
+ %tmp14378 = getelementptr inbounds float* %tmp14377, i64 1
+ %tmp14379 = getelementptr inbounds float* %tmp14378, i64 1
+ %tmp14380 = getelementptr inbounds float* %tmp14379, i64 1
+ %tmp14381 = getelementptr inbounds float* %tmp14380, i64 1
+ %tmp14382 = getelementptr inbounds float* %tmp14381, i64 1
+ %tmp14383 = getelementptr inbounds float* %tmp14382, i64 1
+ %tmp14384 = getelementptr inbounds float* %tmp14383, i64 1
+ %tmp14385 = getelementptr inbounds float* %tmp14384, i64 1
+ %tmp14386 = getelementptr inbounds float* %tmp14385, i64 1
+ %tmp14387 = getelementptr inbounds float* %tmp14386, i64 1
+ %tmp14388 = getelementptr inbounds float* %tmp14387, i64 1
+ %tmp14389 = getelementptr inbounds float* %tmp14388, i64 1
+ %tmp14390 = getelementptr inbounds float* %tmp14389, i64 1
+ %tmp14391 = getelementptr inbounds float* %tmp14390, i64 1
+ %tmp14392 = getelementptr inbounds float* %tmp14391, i64 1
+ %tmp14393 = getelementptr inbounds float* %tmp14392, i64 1
+ %tmp14394 = getelementptr inbounds float* %tmp14393, i64 1
+ %tmp14395 = getelementptr inbounds float* %tmp14394, i64 1
+ %tmp14396 = getelementptr inbounds float* %tmp14395, i64 1
+ %tmp14397 = getelementptr inbounds float* %tmp14396, i64 1
+ %tmp14398 = getelementptr inbounds float* %tmp14397, i64 1
+ %tmp14399 = getelementptr inbounds float* %tmp14398, i64 1
+ %tmp14400 = getelementptr inbounds float* %tmp14399, i64 1
+ %tmp14401 = getelementptr inbounds float* %tmp14400, i64 1
+ %tmp14402 = getelementptr inbounds float* %tmp14401, i64 1
+ %tmp14403 = getelementptr inbounds float* %tmp14402, i64 1
+ %tmp14404 = getelementptr inbounds float* %tmp14403, i64 1
+ %tmp14405 = getelementptr inbounds float* %tmp14404, i64 1
+ %tmp14406 = getelementptr inbounds float* %tmp14405, i64 1
+ %tmp14407 = getelementptr inbounds float* %tmp14406, i64 1
+ %tmp14408 = getelementptr inbounds float* %tmp14407, i64 1
+ %tmp14409 = getelementptr inbounds float* %tmp14408, i64 1
+ %tmp14410 = getelementptr inbounds float* %tmp14409, i64 1
+ %tmp14411 = getelementptr inbounds float* %tmp14410, i64 1
+ %tmp14412 = getelementptr inbounds float* %tmp14411, i64 1
+ %tmp14413 = getelementptr inbounds float* %tmp14412, i64 1
+ %tmp14414 = getelementptr inbounds float* %tmp14413, i64 1
+ %tmp14415 = getelementptr inbounds float* %tmp14414, i64 1
+ %tmp14416 = getelementptr inbounds float* %tmp14415, i64 1
+ %tmp14417 = getelementptr inbounds float* %tmp14416, i64 1
+ %tmp14418 = getelementptr inbounds float* %tmp14417, i64 1
+ %tmp14419 = getelementptr inbounds float* %tmp14418, i64 1
+ %tmp14420 = getelementptr inbounds float* %tmp14419, i64 1
+ %tmp14421 = getelementptr inbounds float* %tmp14420, i64 1
+ %tmp14422 = getelementptr inbounds float* %tmp14421, i64 1
+ %tmp14423 = getelementptr inbounds float* %tmp14422, i64 1
+ %tmp14424 = getelementptr inbounds float* %tmp14423, i64 1
+ %tmp14425 = getelementptr inbounds float* %tmp14424, i64 1
+ %tmp14426 = getelementptr inbounds float* %tmp14425, i64 1
+ %tmp14427 = getelementptr inbounds float* %tmp14426, i64 1
+ %tmp14428 = getelementptr inbounds float* %tmp14427, i64 1
+ %tmp14429 = getelementptr inbounds float* %tmp14428, i64 1
+ %tmp14430 = getelementptr inbounds float* %tmp14429, i64 1
+ %tmp14431 = getelementptr inbounds float* %tmp14430, i64 1
+ %tmp14432 = getelementptr inbounds float* %tmp14431, i64 1
+ %tmp14433 = getelementptr inbounds float* %tmp14432, i64 1
+ %tmp14434 = getelementptr inbounds float* %tmp14433, i64 1
+ %tmp14435 = getelementptr inbounds float* %tmp14434, i64 1
+ %tmp14436 = getelementptr inbounds float* %tmp14435, i64 1
+ %tmp14437 = getelementptr inbounds float* %tmp14436, i64 1
+ %tmp14438 = getelementptr inbounds float* %tmp14437, i64 1
+ %tmp14439 = getelementptr inbounds float* %tmp14438, i64 1
+ %tmp14440 = getelementptr inbounds float* %tmp14439, i64 1
+ %tmp14441 = getelementptr inbounds float* %tmp14440, i64 1
+ %tmp14442 = getelementptr inbounds float* %tmp14441, i64 1
+ %tmp14443 = getelementptr inbounds float* %tmp14442, i64 1
+ %tmp14444 = getelementptr inbounds float* %tmp14443, i64 1
+ %tmp14445 = getelementptr inbounds float* %tmp14444, i64 1
+ %tmp14446 = getelementptr inbounds float* %tmp14445, i64 1
+ %tmp14447 = getelementptr inbounds float* %tmp14446, i64 1
+ %tmp14448 = getelementptr inbounds float* %tmp14447, i64 1
+ %tmp14449 = getelementptr inbounds float* %tmp14448, i64 1
+ %tmp14450 = getelementptr inbounds float* %tmp14449, i64 1
+ %tmp14451 = getelementptr inbounds float* %tmp14450, i64 1
+ %tmp14452 = getelementptr inbounds float* %tmp14451, i64 1
+ %tmp14453 = getelementptr inbounds float* %tmp14452, i64 1
+ %tmp14454 = getelementptr inbounds float* %tmp14453, i64 1
+ %tmp14455 = getelementptr inbounds float* %tmp14454, i64 1
+ %tmp14456 = getelementptr inbounds float* %tmp14455, i64 1
+ %tmp14457 = getelementptr inbounds float* %tmp14456, i64 1
+ %tmp14458 = getelementptr inbounds float* %tmp14457, i64 1
+ %tmp14459 = getelementptr inbounds float* %tmp14458, i64 1
+ %tmp14460 = getelementptr inbounds float* %tmp14459, i64 1
+ %tmp14461 = getelementptr inbounds float* %tmp14460, i64 1
+ %tmp14462 = getelementptr inbounds float* %tmp14461, i64 1
+ %tmp14463 = getelementptr inbounds float* %tmp14462, i64 1
+ %tmp14464 = getelementptr inbounds float* %tmp14463, i64 1
+ %tmp14465 = getelementptr inbounds float* %tmp14464, i64 1
+ %tmp14466 = getelementptr inbounds float* %tmp14465, i64 1
+ %tmp14467 = getelementptr inbounds float* %tmp14466, i64 1
+ %tmp14468 = getelementptr inbounds float* %tmp14467, i64 1
+ %tmp14469 = getelementptr inbounds float* %tmp14468, i64 1
+ %tmp14470 = getelementptr inbounds float* %tmp14469, i64 1
+ %tmp14471 = getelementptr inbounds float* %tmp14470, i64 1
+ %tmp14472 = getelementptr inbounds float* %tmp14471, i64 1
+ %tmp14473 = getelementptr inbounds float* %tmp14472, i64 1
+ %tmp14474 = getelementptr inbounds float* %tmp14473, i64 1
+ %tmp14475 = getelementptr inbounds float* %tmp14474, i64 1
+ %tmp14476 = getelementptr inbounds float* %tmp14475, i64 1
+ %tmp14477 = getelementptr inbounds float* %tmp14476, i64 1
+ %tmp14478 = getelementptr inbounds float* %tmp14477, i64 1
+ %tmp14479 = getelementptr inbounds float* %tmp14478, i64 1
+ %tmp14480 = getelementptr inbounds float* %tmp14479, i64 1
+ %tmp14481 = getelementptr inbounds float* %tmp14480, i64 1
+ %tmp14482 = getelementptr inbounds float* %tmp14481, i64 1
+ %tmp14483 = getelementptr inbounds float* %tmp14482, i64 1
+ %tmp14484 = getelementptr inbounds float* %tmp14483, i64 1
+ %tmp14485 = getelementptr inbounds float* %tmp14484, i64 1
+ %tmp14486 = getelementptr inbounds float* %tmp14485, i64 1
+ %tmp14487 = getelementptr inbounds float* %tmp14486, i64 1
+ %tmp14488 = getelementptr inbounds float* %tmp14487, i64 1
+ %tmp14489 = getelementptr inbounds float* %tmp14488, i64 1
+ %tmp14490 = getelementptr inbounds float* %tmp14489, i64 1
+ %tmp14491 = getelementptr inbounds float* %tmp14490, i64 1
+ %tmp14492 = getelementptr inbounds float* %tmp14491, i64 1
+ %tmp14493 = getelementptr inbounds float* %tmp14492, i64 1
+ %tmp14494 = getelementptr inbounds float* %tmp14493, i64 1
+ %tmp14495 = getelementptr inbounds float* %tmp14494, i64 1
+ %tmp14496 = getelementptr inbounds float* %tmp14495, i64 1
+ %tmp14497 = getelementptr inbounds float* %tmp14496, i64 1
+ %tmp14498 = getelementptr inbounds float* %tmp14497, i64 1
+ %tmp14499 = getelementptr inbounds float* %tmp14498, i64 1
+ %tmp14500 = getelementptr inbounds float* %tmp14499, i64 1
+ %tmp14501 = getelementptr inbounds float* %tmp14500, i64 1
+ %tmp14502 = getelementptr inbounds float* %tmp14501, i64 1
+ %tmp14503 = getelementptr inbounds float* %tmp14502, i64 1
+ %tmp14504 = getelementptr inbounds float* %tmp14503, i64 1
+ %tmp14505 = getelementptr inbounds float* %tmp14504, i64 1
+ %tmp14506 = getelementptr inbounds float* %tmp14505, i64 1
+ %tmp14507 = getelementptr inbounds float* %tmp14506, i64 1
+ %tmp14508 = getelementptr inbounds float* %tmp14507, i64 1
+ %tmp14509 = getelementptr inbounds float* %tmp14508, i64 1
+ %tmp14510 = getelementptr inbounds float* %tmp14509, i64 1
+ %tmp14511 = getelementptr inbounds float* %tmp14510, i64 1
+ %tmp14512 = getelementptr inbounds float* %tmp14511, i64 1
+ %tmp14513 = getelementptr inbounds float* %tmp14512, i64 1
+ %tmp14514 = getelementptr inbounds float* %tmp14513, i64 1
+ %tmp14515 = getelementptr inbounds float* %tmp14514, i64 1
+ %tmp14516 = getelementptr inbounds float* %tmp14515, i64 1
+ %tmp14517 = getelementptr inbounds float* %tmp14516, i64 1
+ %tmp14518 = getelementptr inbounds float* %tmp14517, i64 1
+ %tmp14519 = getelementptr inbounds float* %tmp14518, i64 1
+ %tmp14520 = getelementptr inbounds float* %tmp14519, i64 1
+ %tmp14521 = getelementptr inbounds float* %tmp14520, i64 1
+ %tmp14522 = getelementptr inbounds float* %tmp14521, i64 1
+ %tmp14523 = getelementptr inbounds float* %tmp14522, i64 1
+ %tmp14524 = getelementptr inbounds float* %tmp14523, i64 1
+ %tmp14525 = getelementptr inbounds float* %tmp14524, i64 1
+ %tmp14526 = getelementptr inbounds float* %tmp14525, i64 1
+ %tmp14527 = getelementptr inbounds float* %tmp14526, i64 1
+ %tmp14528 = getelementptr inbounds float* %tmp14527, i64 1
+ %tmp14529 = getelementptr inbounds float* %tmp14528, i64 1
+ %tmp14530 = getelementptr inbounds float* %tmp14529, i64 1
+ %tmp14531 = getelementptr inbounds float* %tmp14530, i64 1
+ %tmp14532 = getelementptr inbounds float* %tmp14531, i64 1
+ %tmp14533 = getelementptr inbounds float* %tmp14532, i64 1
+ %tmp14534 = getelementptr inbounds float* %tmp14533, i64 1
+ %tmp14535 = getelementptr inbounds float* %tmp14534, i64 1
+ %tmp14536 = getelementptr inbounds float* %tmp14535, i64 1
+ %tmp14537 = getelementptr inbounds float* %tmp14536, i64 1
+ %tmp14538 = getelementptr inbounds float* %tmp14537, i64 1
+ %tmp14539 = getelementptr inbounds float* %tmp14538, i64 1
+ %tmp14540 = getelementptr inbounds float* %tmp14539, i64 1
+ %tmp14541 = getelementptr inbounds float* %tmp14540, i64 1
+ %tmp14542 = getelementptr inbounds float* %tmp14541, i64 1
+ %tmp14543 = getelementptr inbounds float* %tmp14542, i64 1
+ %tmp14544 = getelementptr inbounds float* %tmp14543, i64 1
+ %tmp14545 = getelementptr inbounds float* %tmp14544, i64 1
+ %tmp14546 = getelementptr inbounds float* %tmp14545, i64 1
+ %tmp14547 = getelementptr inbounds float* %tmp14546, i64 1
+ %tmp14548 = getelementptr inbounds float* %tmp14547, i64 1
+ %tmp14549 = getelementptr inbounds float* %tmp14548, i64 1
+ %tmp14550 = getelementptr inbounds float* %tmp14549, i64 1
+ %tmp14551 = getelementptr inbounds float* %tmp14550, i64 1
+ %tmp14552 = getelementptr inbounds float* %tmp14551, i64 1
+ %tmp14553 = getelementptr inbounds float* %tmp14552, i64 1
+ %tmp14554 = getelementptr inbounds float* %tmp14553, i64 1
+ %tmp14555 = getelementptr inbounds float* %tmp14554, i64 1
+ %tmp14556 = getelementptr inbounds float* %tmp14555, i64 1
+ %tmp14557 = getelementptr inbounds float* %tmp14556, i64 1
+ %tmp14558 = getelementptr inbounds float* %tmp14557, i64 1
+ %tmp14559 = getelementptr inbounds float* %tmp14558, i64 1
+ %tmp14560 = getelementptr inbounds float* %tmp14559, i64 1
+ %tmp14561 = getelementptr inbounds float* %tmp14560, i64 1
+ %tmp14562 = getelementptr inbounds float* %tmp14561, i64 1
+ %tmp14563 = getelementptr inbounds float* %tmp14562, i64 1
+ %tmp14564 = getelementptr inbounds float* %tmp14563, i64 1
+ %tmp14565 = getelementptr inbounds float* %tmp14564, i64 1
+ %tmp14566 = getelementptr inbounds float* %tmp14565, i64 1
+ %tmp14567 = getelementptr inbounds float* %tmp14566, i64 1
+ %tmp14568 = getelementptr inbounds float* %tmp14567, i64 1
+ %tmp14569 = getelementptr inbounds float* %tmp14568, i64 1
+ %tmp14570 = getelementptr inbounds float* %tmp14569, i64 1
+ %tmp14571 = getelementptr inbounds float* %tmp14570, i64 1
+ %tmp14572 = getelementptr inbounds float* %tmp14571, i64 1
+ %tmp14573 = getelementptr inbounds float* %tmp14572, i64 1
+ %tmp14574 = getelementptr inbounds float* %tmp14573, i64 1
+ %tmp14575 = getelementptr inbounds float* %tmp14574, i64 1
+ %tmp14576 = getelementptr inbounds float* %tmp14575, i64 1
+ %tmp14577 = getelementptr inbounds float* %tmp14576, i64 1
+ %tmp14578 = getelementptr inbounds float* %tmp14577, i64 1
+ %tmp14579 = getelementptr inbounds float* %tmp14578, i64 1
+ %tmp14580 = getelementptr inbounds float* %tmp14579, i64 1
+ %tmp14581 = getelementptr inbounds float* %tmp14580, i64 1
+ %tmp14582 = getelementptr inbounds float* %tmp14581, i64 1
+ %tmp14583 = getelementptr inbounds float* %tmp14582, i64 1
+ %tmp14584 = getelementptr inbounds float* %tmp14583, i64 1
+ %tmp14585 = getelementptr inbounds float* %tmp14584, i64 1
+ %tmp14586 = getelementptr inbounds float* %tmp14585, i64 1
+ %tmp14587 = getelementptr inbounds float* %tmp14586, i64 1
+ %tmp14588 = getelementptr inbounds float* %tmp14587, i64 1
+ %tmp14589 = getelementptr inbounds float* %tmp14588, i64 1
+ %tmp14590 = getelementptr inbounds float* %tmp14589, i64 1
+ %tmp14591 = getelementptr inbounds float* %tmp14590, i64 1
+ %tmp14592 = getelementptr inbounds float* %tmp14591, i64 1
+ %tmp14593 = getelementptr inbounds float* %tmp14592, i64 1
+ %tmp14594 = getelementptr inbounds float* %tmp14593, i64 1
+ %tmp14595 = getelementptr inbounds float* %tmp14594, i64 1
+ %tmp14596 = getelementptr inbounds float* %tmp14595, i64 1
+ %tmp14597 = getelementptr inbounds float* %tmp14596, i64 1
+ %tmp14598 = getelementptr inbounds float* %tmp14597, i64 1
+ %tmp14599 = getelementptr inbounds float* %tmp14598, i64 1
+ %tmp14600 = getelementptr inbounds float* %tmp14599, i64 1
+ %tmp14601 = getelementptr inbounds float* %tmp14600, i64 1
+ %tmp14602 = getelementptr inbounds float* %tmp14601, i64 1
+ %tmp14603 = getelementptr inbounds float* %tmp14602, i64 1
+ %tmp14604 = getelementptr inbounds float* %tmp14603, i64 1
+ %tmp14605 = getelementptr inbounds float* %tmp14604, i64 1
+ %tmp14606 = getelementptr inbounds float* %tmp14605, i64 1
+ %tmp14607 = getelementptr inbounds float* %tmp14606, i64 1
+ %tmp14608 = getelementptr inbounds float* %tmp14607, i64 1
+ %tmp14609 = getelementptr inbounds float* %tmp14608, i64 1
+ %tmp14610 = getelementptr inbounds float* %tmp14609, i64 1
+ %tmp14611 = getelementptr inbounds float* %tmp14610, i64 1
+ %tmp14612 = getelementptr inbounds float* %tmp14611, i64 1
+ %tmp14613 = getelementptr inbounds float* %tmp14612, i64 1
+ %tmp14614 = getelementptr inbounds float* %tmp14613, i64 1
+ %tmp14615 = getelementptr inbounds float* %tmp14614, i64 1
+ %tmp14616 = getelementptr inbounds float* %tmp14615, i64 1
+ %tmp14617 = getelementptr inbounds float* %tmp14616, i64 1
+ %tmp14618 = getelementptr inbounds float* %tmp14617, i64 1
+ %tmp14619 = getelementptr inbounds float* %tmp14618, i64 1
+ %tmp14620 = getelementptr inbounds float* %tmp14619, i64 1
+ %tmp14621 = getelementptr inbounds float* %tmp14620, i64 1
+ %tmp14622 = getelementptr inbounds float* %tmp14621, i64 1
+ %tmp14623 = getelementptr inbounds float* %tmp14622, i64 1
+ %tmp14624 = getelementptr inbounds float* %tmp14623, i64 1
+ %tmp14625 = getelementptr inbounds float* %tmp14624, i64 1
+ %tmp14626 = getelementptr inbounds float* %tmp14625, i64 1
+ %tmp14627 = getelementptr inbounds float* %tmp14626, i64 1
+ %tmp14628 = getelementptr inbounds float* %tmp14627, i64 1
+ %tmp14629 = getelementptr inbounds float* %tmp14628, i64 1
+ %tmp14630 = getelementptr inbounds float* %tmp14629, i64 1
+ %tmp14631 = getelementptr inbounds float* %tmp14630, i64 1
+ %tmp14632 = getelementptr inbounds float* %tmp14631, i64 1
+ %tmp14633 = getelementptr inbounds float* %tmp14632, i64 1
+ %tmp14634 = getelementptr inbounds float* %tmp14633, i64 1
+ %tmp14635 = getelementptr inbounds float* %tmp14634, i64 1
+ %tmp14636 = getelementptr inbounds float* %tmp14635, i64 1
+ %tmp14637 = getelementptr inbounds float* %tmp14636, i64 1
+ %tmp14638 = getelementptr inbounds float* %tmp14637, i64 1
+ %tmp14639 = getelementptr inbounds float* %tmp14638, i64 1
+ %tmp14640 = getelementptr inbounds float* %tmp14639, i64 1
+ %tmp14641 = getelementptr inbounds float* %tmp14640, i64 1
+ %tmp14642 = getelementptr inbounds float* %tmp14641, i64 1
+ %tmp14643 = getelementptr inbounds float* %tmp14642, i64 1
+ %tmp14644 = getelementptr inbounds float* %tmp14643, i64 1
+ %tmp14645 = getelementptr inbounds float* %tmp14644, i64 1
+ %tmp14646 = getelementptr inbounds float* %tmp14645, i64 1
+ %tmp14647 = getelementptr inbounds float* %tmp14646, i64 1
+ %tmp14648 = getelementptr inbounds float* %tmp14647, i64 1
+ %tmp14649 = getelementptr inbounds float* %tmp14648, i64 1
+ %tmp14650 = getelementptr inbounds float* %tmp14649, i64 1
+ %tmp14651 = getelementptr inbounds float* %tmp14650, i64 1
+ %tmp14652 = getelementptr inbounds float* %tmp14651, i64 1
+ %tmp14653 = getelementptr inbounds float* %tmp14652, i64 1
+ %tmp14654 = getelementptr inbounds float* %tmp14653, i64 1
+ %tmp14655 = getelementptr inbounds float* %tmp14654, i64 1
+ %tmp14656 = getelementptr inbounds float* %tmp14655, i64 1
+ %tmp14657 = getelementptr inbounds float* %tmp14656, i64 1
+ %tmp14658 = getelementptr inbounds float* %tmp14657, i64 1
+ %tmp14659 = getelementptr inbounds float* %tmp14658, i64 1
+ %tmp14660 = getelementptr inbounds float* %tmp14659, i64 1
+ %tmp14661 = getelementptr inbounds float* %tmp14660, i64 1
+ %tmp14662 = getelementptr inbounds float* %tmp14661, i64 1
+ %tmp14663 = getelementptr inbounds float* %tmp14662, i64 1
+ %tmp14664 = getelementptr inbounds float* %tmp14663, i64 1
+ %tmp14665 = getelementptr inbounds float* %tmp14664, i64 1
+ %tmp14666 = getelementptr inbounds float* %tmp14665, i64 1
+ %tmp14667 = getelementptr inbounds float* %tmp14666, i64 1
+ %tmp14668 = getelementptr inbounds float* %tmp14667, i64 1
+ %tmp14669 = getelementptr inbounds float* %tmp14668, i64 1
+ %tmp14670 = getelementptr inbounds float* %tmp14669, i64 1
+ %tmp14671 = getelementptr inbounds float* %tmp14670, i64 1
+ %tmp14672 = getelementptr inbounds float* %tmp14671, i64 1
+ %tmp14673 = getelementptr inbounds float* %tmp14672, i64 1
+ %tmp14674 = getelementptr inbounds float* %tmp14673, i64 1
+ %tmp14675 = getelementptr inbounds float* %tmp14674, i64 1
+ %tmp14676 = getelementptr inbounds float* %tmp14675, i64 1
+ %tmp14677 = getelementptr inbounds float* %tmp14676, i64 1
+ %tmp14678 = getelementptr inbounds float* %tmp14677, i64 1
+ %tmp14679 = getelementptr inbounds float* %tmp14678, i64 1
+ %tmp14680 = getelementptr inbounds float* %tmp14679, i64 1
+ %tmp14681 = getelementptr inbounds float* %tmp14680, i64 1
+ %tmp14682 = getelementptr inbounds float* %tmp14681, i64 1
+ %tmp14683 = getelementptr inbounds float* %tmp14682, i64 1
+ %tmp14684 = getelementptr inbounds float* %tmp14683, i64 1
+ %tmp14685 = getelementptr inbounds float* %tmp14684, i64 1
+ %tmp14686 = getelementptr inbounds float* %tmp14685, i64 1
+ %tmp14687 = getelementptr inbounds float* %tmp14686, i64 1
+ %tmp14688 = getelementptr inbounds float* %tmp14687, i64 1
+ %tmp14689 = getelementptr inbounds float* %tmp14688, i64 1
+ %tmp14690 = getelementptr inbounds float* %tmp14689, i64 1
+ %tmp14691 = getelementptr inbounds float* %tmp14690, i64 1
+ %tmp14692 = getelementptr inbounds float* %tmp14691, i64 1
+ %tmp14693 = getelementptr inbounds float* %tmp14692, i64 1
+ %tmp14694 = getelementptr inbounds float* %tmp14693, i64 1
+ %tmp14695 = getelementptr inbounds float* %tmp14694, i64 1
+ %tmp14696 = getelementptr inbounds float* %tmp14695, i64 1
+ %tmp14697 = getelementptr inbounds float* %tmp14696, i64 1
+ %tmp14698 = getelementptr inbounds float* %tmp14697, i64 1
+ %tmp14699 = getelementptr inbounds float* %tmp14698, i64 1
+ %tmp14700 = getelementptr inbounds float* %tmp14699, i64 1
+ %tmp14701 = getelementptr inbounds float* %tmp14700, i64 1
+ %tmp14702 = getelementptr inbounds float* %tmp14701, i64 1
+ %tmp14703 = getelementptr inbounds float* %tmp14702, i64 1
+ %tmp14704 = getelementptr inbounds float* %tmp14703, i64 1
+ %tmp14705 = getelementptr inbounds float* %tmp14704, i64 1
+ %tmp14706 = getelementptr inbounds float* %tmp14705, i64 1
+ %tmp14707 = getelementptr inbounds float* %tmp14706, i64 1
+ %tmp14708 = getelementptr inbounds float* %tmp14707, i64 1
+ %tmp14709 = getelementptr inbounds float* %tmp14708, i64 1
+ %tmp14710 = getelementptr inbounds float* %tmp14709, i64 1
+ %tmp14711 = getelementptr inbounds float* %tmp14710, i64 1
+ %tmp14712 = getelementptr inbounds float* %tmp14711, i64 1
+ %tmp14713 = getelementptr inbounds float* %tmp14712, i64 1
+ %tmp14714 = getelementptr inbounds float* %tmp14713, i64 1
+ %tmp14715 = getelementptr inbounds float* %tmp14714, i64 1
+ %tmp14716 = getelementptr inbounds float* %tmp14715, i64 1
+ %tmp14717 = getelementptr inbounds float* %tmp14716, i64 1
+ %tmp14718 = getelementptr inbounds float* %tmp14717, i64 1
+ %tmp14719 = getelementptr inbounds float* %tmp14718, i64 1
+ %tmp14720 = getelementptr inbounds float* %tmp14719, i64 1
+ %tmp14721 = getelementptr inbounds float* %tmp14720, i64 1
+ %tmp14722 = getelementptr inbounds float* %tmp14721, i64 1
+ %tmp14723 = getelementptr inbounds float* %tmp14722, i64 1
+ %tmp14724 = getelementptr inbounds float* %tmp14723, i64 1
+ %tmp14725 = getelementptr inbounds float* %tmp14724, i64 1
+ %tmp14726 = getelementptr inbounds float* %tmp14725, i64 1
+ %tmp14727 = getelementptr inbounds float* %tmp14726, i64 1
+ %tmp14728 = getelementptr inbounds float* %tmp14727, i64 1
+ %tmp14729 = getelementptr inbounds float* %tmp14728, i64 1
+ %tmp14730 = getelementptr inbounds float* %tmp14729, i64 1
+ %tmp14731 = getelementptr inbounds float* %tmp14730, i64 1
+ %tmp14732 = getelementptr inbounds float* %tmp14731, i64 1
+ %tmp14733 = getelementptr inbounds float* %tmp14732, i64 1
+ %tmp14734 = getelementptr inbounds float* %tmp14733, i64 1
+ %tmp14735 = getelementptr inbounds float* %tmp14734, i64 1
+ %tmp14736 = getelementptr inbounds float* %tmp14735, i64 1
+ %tmp14737 = getelementptr inbounds float* %tmp14736, i64 1
+ %tmp14738 = getelementptr inbounds float* %tmp14737, i64 1
+ %tmp14739 = getelementptr inbounds float* %tmp14738, i64 1
+ %tmp14740 = getelementptr inbounds float* %tmp14739, i64 1
+ %tmp14741 = getelementptr inbounds float* %tmp14740, i64 1
+ %tmp14742 = getelementptr inbounds float* %tmp14741, i64 1
+ %tmp14743 = getelementptr inbounds float* %tmp14742, i64 1
+ %tmp14744 = getelementptr inbounds float* %tmp14743, i64 1
+ %tmp14745 = getelementptr inbounds float* %tmp14744, i64 1
+ %tmp14746 = getelementptr inbounds float* %tmp14745, i64 1
+ %tmp14747 = getelementptr inbounds float* %tmp14746, i64 1
+ %tmp14748 = getelementptr inbounds float* %tmp14747, i64 1
+ %tmp14749 = getelementptr inbounds float* %tmp14748, i64 1
+ %tmp14750 = getelementptr inbounds float* %tmp14749, i64 1
+ %tmp14751 = getelementptr inbounds float* %tmp14750, i64 1
+ %tmp14752 = getelementptr inbounds float* %tmp14751, i64 1
+ %tmp14753 = getelementptr inbounds float* %tmp14752, i64 1
+ %tmp14754 = getelementptr inbounds float* %tmp14753, i64 1
+ %tmp14755 = getelementptr inbounds float* %tmp14754, i64 1
+ %tmp14756 = getelementptr inbounds float* %tmp14755, i64 1
+ %tmp14757 = getelementptr inbounds float* %tmp14756, i64 1
+ %tmp14758 = getelementptr inbounds float* %tmp14757, i64 1
+ %tmp14759 = getelementptr inbounds float* %tmp14758, i64 1
+ %tmp14760 = getelementptr inbounds float* %tmp14759, i64 1
+ %tmp14761 = getelementptr inbounds float* %tmp14760, i64 1
+ %tmp14762 = getelementptr inbounds float* %tmp14761, i64 1
+ %tmp14763 = getelementptr inbounds float* %tmp14762, i64 1
+ %tmp14764 = getelementptr inbounds float* %tmp14763, i64 1
+ %tmp14765 = getelementptr inbounds float* %tmp14764, i64 1
+ %tmp14766 = getelementptr inbounds float* %tmp14765, i64 1
+ %tmp14767 = getelementptr inbounds float* %tmp14766, i64 1
+ %tmp14768 = getelementptr inbounds float* %tmp14767, i64 1
+ %tmp14769 = getelementptr inbounds float* %tmp14768, i64 1
+ %tmp14770 = getelementptr inbounds float* %tmp14769, i64 1
+ %tmp14771 = getelementptr inbounds float* %tmp14770, i64 1
+ %tmp14772 = getelementptr inbounds float* %tmp14771, i64 1
+ %tmp14773 = getelementptr inbounds float* %tmp14772, i64 1
+ %tmp14774 = getelementptr inbounds float* %tmp14773, i64 1
+ %tmp14775 = getelementptr inbounds float* %tmp14774, i64 1
+ %tmp14776 = getelementptr inbounds float* %tmp14775, i64 1
+ %tmp14777 = getelementptr inbounds float* %tmp14776, i64 1
+ %tmp14778 = getelementptr inbounds float* %tmp14777, i64 1
+ %tmp14779 = getelementptr inbounds float* %tmp14778, i64 1
+ %tmp14780 = getelementptr inbounds float* %tmp14779, i64 1
+ %tmp14781 = getelementptr inbounds float* %tmp14780, i64 1
+ %tmp14782 = getelementptr inbounds float* %tmp14781, i64 1
+ %tmp14783 = getelementptr inbounds float* %tmp14782, i64 1
+ %tmp14784 = getelementptr inbounds float* %tmp14783, i64 1
+ %tmp14785 = getelementptr inbounds float* %tmp14784, i64 1
+ %tmp14786 = getelementptr inbounds float* %tmp14785, i64 1
+ %tmp14787 = getelementptr inbounds float* %tmp14786, i64 1
+ %tmp14788 = getelementptr inbounds float* %tmp14787, i64 1
+ %tmp14789 = getelementptr inbounds float* %tmp14788, i64 1
+ %tmp14790 = getelementptr inbounds float* %tmp14789, i64 1
+ %tmp14791 = getelementptr inbounds float* %tmp14790, i64 1
+ %tmp14792 = getelementptr inbounds float* %tmp14791, i64 1
+ %tmp14793 = getelementptr inbounds float* %tmp14792, i64 1
+ %tmp14794 = getelementptr inbounds float* %tmp14793, i64 1
+ %tmp14795 = getelementptr inbounds float* %tmp14794, i64 1
+ %tmp14796 = getelementptr inbounds float* %tmp14795, i64 1
+ %tmp14797 = getelementptr inbounds float* %tmp14796, i64 1
+ %tmp14798 = getelementptr inbounds float* %tmp14797, i64 1
+ %tmp14799 = getelementptr inbounds float* %tmp14798, i64 1
+ %tmp14800 = getelementptr inbounds float* %tmp14799, i64 1
+ %tmp14801 = getelementptr inbounds float* %tmp14800, i64 1
+ %tmp14802 = getelementptr inbounds float* %tmp14801, i64 1
+ %tmp14803 = getelementptr inbounds float* %tmp14802, i64 1
+ %tmp14804 = getelementptr inbounds float* %tmp14803, i64 1
+ %tmp14805 = getelementptr inbounds float* %tmp14804, i64 1
+ %tmp14806 = getelementptr inbounds float* %tmp14805, i64 1
+ %tmp14807 = getelementptr inbounds float* %tmp14806, i64 1
+ %tmp14808 = getelementptr inbounds float* %tmp14807, i64 1
+ %tmp14809 = getelementptr inbounds float* %tmp14808, i64 1
+ %tmp14810 = getelementptr inbounds float* %tmp14809, i64 1
+ %tmp14811 = getelementptr inbounds float* %tmp14810, i64 1
+ %tmp14812 = getelementptr inbounds float* %tmp14811, i64 1
+ %tmp14813 = getelementptr inbounds float* %tmp14812, i64 1
+ %tmp14814 = getelementptr inbounds float* %tmp14813, i64 1
+ %tmp14815 = getelementptr inbounds float* %tmp14814, i64 1
+ %tmp14816 = getelementptr inbounds float* %tmp14815, i64 1
+ %tmp14817 = getelementptr inbounds float* %tmp14816, i64 1
+ %tmp14818 = getelementptr inbounds float* %tmp14817, i64 1
+ %tmp14819 = getelementptr inbounds float* %tmp14818, i64 1
+ %tmp14820 = getelementptr inbounds float* %tmp14819, i64 1
+ %tmp14821 = getelementptr inbounds float* %tmp14820, i64 1
+ %tmp14822 = getelementptr inbounds float* %tmp14821, i64 1
+ %tmp14823 = getelementptr inbounds float* %tmp14822, i64 1
+ %tmp14824 = getelementptr inbounds float* %tmp14823, i64 1
+ %tmp14825 = getelementptr inbounds float* %tmp14824, i64 1
+ %tmp14826 = getelementptr inbounds float* %tmp14825, i64 1
+ %tmp14827 = getelementptr inbounds float* %tmp14826, i64 1
+ %tmp14828 = getelementptr inbounds float* %tmp14827, i64 1
+ %tmp14829 = getelementptr inbounds float* %tmp14828, i64 1
+ %tmp14830 = getelementptr inbounds float* %tmp14829, i64 1
+ %tmp14831 = getelementptr inbounds float* %tmp14830, i64 1
+ %tmp14832 = getelementptr inbounds float* %tmp14831, i64 1
+ %tmp14833 = getelementptr inbounds float* %tmp14832, i64 1
+ %tmp14834 = getelementptr inbounds float* %tmp14833, i64 1
+ %tmp14835 = getelementptr inbounds float* %tmp14834, i64 1
+ %tmp14836 = getelementptr inbounds float* %tmp14835, i64 1
+ %tmp14837 = getelementptr inbounds float* %tmp14836, i64 1
+ %tmp14838 = getelementptr inbounds float* %tmp14837, i64 1
+ %tmp14839 = getelementptr inbounds float* %tmp14838, i64 1
+ %tmp14840 = getelementptr inbounds float* %tmp14839, i64 1
+ %tmp14841 = getelementptr inbounds float* %tmp14840, i64 1
+ %tmp14842 = getelementptr inbounds float* %tmp14841, i64 1
+ %tmp14843 = getelementptr inbounds float* %tmp14842, i64 1
+ %tmp14844 = getelementptr inbounds float* %tmp14843, i64 1
+ %tmp14845 = getelementptr inbounds float* %tmp14844, i64 1
+ %tmp14846 = getelementptr inbounds float* %tmp14845, i64 1
+ %tmp14847 = getelementptr inbounds float* %tmp14846, i64 1
+ %tmp14848 = getelementptr inbounds float* %tmp14847, i64 1
+ %tmp14849 = getelementptr inbounds float* %tmp14848, i64 1
+ %tmp14850 = getelementptr inbounds float* %tmp14849, i64 1
+ %tmp14851 = getelementptr inbounds float* %tmp14850, i64 1
+ %tmp14852 = getelementptr inbounds float* %tmp14851, i64 1
+ %tmp14853 = getelementptr inbounds float* %tmp14852, i64 1
+ %tmp14854 = getelementptr inbounds float* %tmp14853, i64 1
+ %tmp14855 = getelementptr inbounds float* %tmp14854, i64 1
+ %tmp14856 = getelementptr inbounds float* %tmp14855, i64 1
+ %tmp14857 = getelementptr inbounds float* %tmp14856, i64 1
+ %tmp14858 = getelementptr inbounds float* %tmp14857, i64 1
+ %tmp14859 = getelementptr inbounds float* %tmp14858, i64 1
+ %tmp14860 = getelementptr inbounds float* %tmp14859, i64 1
+ %tmp14861 = getelementptr inbounds float* %tmp14860, i64 1
+ %tmp14862 = getelementptr inbounds float* %tmp14861, i64 1
+ %tmp14863 = getelementptr inbounds float* %tmp14862, i64 1
+ %tmp14864 = getelementptr inbounds float* %tmp14863, i64 1
+ %tmp14865 = getelementptr inbounds float* %tmp14864, i64 1
+ %tmp14866 = getelementptr inbounds float* %tmp14865, i64 1
+ %tmp14867 = getelementptr inbounds float* %tmp14866, i64 1
+ %tmp14868 = getelementptr inbounds float* %tmp14867, i64 1
+ %tmp14869 = getelementptr inbounds float* %tmp14868, i64 1
+ %tmp14870 = getelementptr inbounds float* %tmp14869, i64 1
+ %tmp14871 = getelementptr inbounds float* %tmp14870, i64 1
+ %tmp14872 = getelementptr inbounds float* %tmp14871, i64 1
+ %tmp14873 = getelementptr inbounds float* %tmp14872, i64 1
+ %tmp14874 = getelementptr inbounds float* %tmp14873, i64 1
+ %tmp14875 = getelementptr inbounds float* %tmp14874, i64 1
+ %tmp14876 = getelementptr inbounds float* %tmp14875, i64 1
+ %tmp14877 = getelementptr inbounds float* %tmp14876, i64 1
+ %tmp14878 = getelementptr inbounds float* %tmp14877, i64 1
+ %tmp14879 = getelementptr inbounds float* %tmp14878, i64 1
+ %tmp14880 = getelementptr inbounds float* %tmp14879, i64 1
+ %tmp14881 = getelementptr inbounds float* %tmp14880, i64 1
+ %tmp14882 = getelementptr inbounds float* %tmp14881, i64 1
+ %tmp14883 = getelementptr inbounds float* %tmp14882, i64 1
+ %tmp14884 = getelementptr inbounds float* %tmp14883, i64 1
+ %tmp14885 = getelementptr inbounds float* %tmp14884, i64 1
+ %tmp14886 = getelementptr inbounds float* %tmp14885, i64 1
+ %tmp14887 = getelementptr inbounds float* %tmp14886, i64 1
+ %tmp14888 = getelementptr inbounds float* %tmp14887, i64 1
+ %tmp14889 = getelementptr inbounds float* %tmp14888, i64 1
+ %tmp14890 = getelementptr inbounds float* %tmp14889, i64 1
+ %tmp14891 = getelementptr inbounds float* %tmp14890, i64 1
+ %tmp14892 = getelementptr inbounds float* %tmp14891, i64 1
+ %tmp14893 = getelementptr inbounds float* %tmp14892, i64 1
+ %tmp14894 = getelementptr inbounds float* %tmp14893, i64 1
+ %tmp14895 = getelementptr inbounds float* %tmp14894, i64 1
+ %tmp14896 = getelementptr inbounds float* %tmp14895, i64 1
+ %tmp14897 = getelementptr inbounds float* %tmp14896, i64 1
+ %tmp14898 = getelementptr inbounds float* %tmp14897, i64 1
+ %tmp14899 = getelementptr inbounds float* %tmp14898, i64 1
+ %tmp14900 = getelementptr inbounds float* %tmp14899, i64 1
+ %tmp14901 = getelementptr inbounds float* %tmp14900, i64 1
+ %tmp14902 = getelementptr inbounds float* %tmp14901, i64 1
+ %tmp14903 = getelementptr inbounds float* %tmp14902, i64 1
+ %tmp14904 = getelementptr inbounds float* %tmp14903, i64 1
+ %tmp14905 = getelementptr inbounds float* %tmp14904, i64 1
+ %tmp14906 = getelementptr inbounds float* %tmp14905, i64 1
+ %tmp14907 = getelementptr inbounds float* %tmp14906, i64 1
+ %tmp14908 = getelementptr inbounds float* %tmp14907, i64 1
+ %tmp14909 = getelementptr inbounds float* %tmp14908, i64 1
+ %tmp14910 = getelementptr inbounds float* %tmp14909, i64 1
+ %tmp14911 = getelementptr inbounds float* %tmp14910, i64 1
+ %tmp14912 = getelementptr inbounds float* %tmp14911, i64 1
+ %tmp14913 = getelementptr inbounds float* %tmp14912, i64 1
+ %tmp14914 = getelementptr inbounds float* %tmp14913, i64 1
+ %tmp14915 = getelementptr inbounds float* %tmp14914, i64 1
+ %tmp14916 = getelementptr inbounds float* %tmp14915, i64 1
+ %tmp14917 = getelementptr inbounds float* %tmp14916, i64 1
+ %tmp14918 = getelementptr inbounds float* %tmp14917, i64 1
+ %tmp14919 = getelementptr inbounds float* %tmp14918, i64 1
+ %tmp14920 = getelementptr inbounds float* %tmp14919, i64 1
+ %tmp14921 = getelementptr inbounds float* %tmp14920, i64 1
+ %tmp14922 = getelementptr inbounds float* %tmp14921, i64 1
+ %tmp14923 = getelementptr inbounds float* %tmp14922, i64 1
+ %tmp14924 = getelementptr inbounds float* %tmp14923, i64 1
+ %tmp14925 = getelementptr inbounds float* %tmp14924, i64 1
+ %tmp14926 = getelementptr inbounds float* %tmp14925, i64 1
+ %tmp14927 = getelementptr inbounds float* %tmp14926, i64 1
+ %tmp14928 = getelementptr inbounds float* %tmp14927, i64 1
+ %tmp14929 = getelementptr inbounds float* %tmp14928, i64 1
+ %tmp14930 = getelementptr inbounds float* %tmp14929, i64 1
+ %tmp14931 = getelementptr inbounds float* %tmp14930, i64 1
+ %tmp14932 = getelementptr inbounds float* %tmp14931, i64 1
+ %tmp14933 = getelementptr inbounds float* %tmp14932, i64 1
+ %tmp14934 = getelementptr inbounds float* %tmp14933, i64 1
+ %tmp14935 = getelementptr inbounds float* %tmp14934, i64 1
+ %tmp14936 = getelementptr inbounds float* %tmp14935, i64 1
+ %tmp14937 = getelementptr inbounds float* %tmp14936, i64 1
+ %tmp14938 = getelementptr inbounds float* %tmp14937, i64 1
+ %tmp14939 = getelementptr inbounds float* %tmp14938, i64 1
+ %tmp14940 = getelementptr inbounds float* %tmp14939, i64 1
+ %tmp14941 = getelementptr inbounds float* %tmp14940, i64 1
+ %tmp14942 = getelementptr inbounds float* %tmp14941, i64 1
+ %tmp14943 = getelementptr inbounds float* %tmp14942, i64 1
+ %tmp14944 = getelementptr inbounds float* %tmp14943, i64 1
+ %tmp14945 = getelementptr inbounds float* %tmp14944, i64 1
+ %tmp14946 = getelementptr inbounds float* %tmp14945, i64 1
+ %tmp14947 = getelementptr inbounds float* %tmp14946, i64 1
+ %tmp14948 = getelementptr inbounds float* %tmp14947, i64 1
+ %tmp14949 = getelementptr inbounds float* %tmp14948, i64 1
+ %tmp14950 = getelementptr inbounds float* %tmp14949, i64 1
+ %tmp14951 = getelementptr inbounds float* %tmp14950, i64 1
+ %tmp14952 = getelementptr inbounds float* %tmp14951, i64 1
+ %tmp14953 = getelementptr inbounds float* %tmp14952, i64 1
+ %tmp14954 = getelementptr inbounds float* %tmp14953, i64 1
+ %tmp14955 = getelementptr inbounds float* %tmp14954, i64 1
+ %tmp14956 = getelementptr inbounds float* %tmp14955, i64 1
+ %tmp14957 = getelementptr inbounds float* %tmp14956, i64 1
+ %tmp14958 = getelementptr inbounds float* %tmp14957, i64 1
+ %tmp14959 = getelementptr inbounds float* %tmp14958, i64 1
+ %tmp14960 = getelementptr inbounds float* %tmp14959, i64 1
+ %tmp14961 = getelementptr inbounds float* %tmp14960, i64 1
+ %tmp14962 = getelementptr inbounds float* %tmp14961, i64 1
+ %tmp14963 = getelementptr inbounds float* %tmp14962, i64 1
+ %tmp14964 = getelementptr inbounds float* %tmp14963, i64 1
+ %tmp14965 = getelementptr inbounds float* %tmp14964, i64 1
+ %tmp14966 = getelementptr inbounds float* %tmp14965, i64 1
+ %tmp14967 = getelementptr inbounds float* %tmp14966, i64 1
+ %tmp14968 = getelementptr inbounds float* %tmp14967, i64 1
+ %tmp14969 = getelementptr inbounds float* %tmp14968, i64 1
+ %tmp14970 = getelementptr inbounds float* %tmp14969, i64 1
+ %tmp14971 = getelementptr inbounds float* %tmp14970, i64 1
+ %tmp14972 = getelementptr inbounds float* %tmp14971, i64 1
+ %tmp14973 = getelementptr inbounds float* %tmp14972, i64 1
+ %tmp14974 = getelementptr inbounds float* %tmp14973, i64 1
+ %tmp14975 = getelementptr inbounds float* %tmp14974, i64 1
+ %tmp14976 = getelementptr inbounds float* %tmp14975, i64 1
+ %tmp14977 = getelementptr inbounds float* %tmp14976, i64 1
+ %tmp14978 = getelementptr inbounds float* %tmp14977, i64 1
+ %tmp14979 = getelementptr inbounds float* %tmp14978, i64 1
+ %tmp14980 = getelementptr inbounds float* %tmp14979, i64 1
+ %tmp14981 = getelementptr inbounds float* %tmp14980, i64 1
+ %tmp14982 = getelementptr inbounds float* %tmp14981, i64 1
+ %tmp14983 = getelementptr inbounds float* %tmp14982, i64 1
+ %tmp14984 = getelementptr inbounds float* %tmp14983, i64 1
+ %tmp14985 = getelementptr inbounds float* %tmp14984, i64 1
+ %tmp14986 = getelementptr inbounds float* %tmp14985, i64 1
+ %tmp14987 = getelementptr inbounds float* %tmp14986, i64 1
+ %tmp14988 = getelementptr inbounds float* %tmp14987, i64 1
+ %tmp14989 = getelementptr inbounds float* %tmp14988, i64 1
+ %tmp14990 = getelementptr inbounds float* %tmp14989, i64 1
+ %tmp14991 = getelementptr inbounds float* %tmp14990, i64 1
+ %tmp14992 = getelementptr inbounds float* %tmp14991, i64 1
+ %tmp14993 = getelementptr inbounds float* %tmp14992, i64 1
+ %tmp14994 = getelementptr inbounds float* %tmp14993, i64 1
+ %tmp14995 = getelementptr inbounds float* %tmp14994, i64 1
+ %tmp14996 = getelementptr inbounds float* %tmp14995, i64 1
+ %tmp14997 = getelementptr inbounds float* %tmp14996, i64 1
+ %tmp14998 = getelementptr inbounds float* %tmp14997, i64 1
+ %tmp14999 = getelementptr inbounds float* %tmp14998, i64 1
+ %tmp15000 = getelementptr inbounds float* %tmp14999, i64 1
+ %tmp15001 = getelementptr inbounds float* %tmp15000, i64 1
+ %tmp15002 = getelementptr inbounds float* %tmp15001, i64 1
+ %tmp15003 = getelementptr inbounds float* %tmp15002, i64 1
+ %tmp15004 = getelementptr inbounds float* %tmp15003, i64 1
+ %tmp15005 = getelementptr inbounds float* %tmp15004, i64 1
+ %tmp15006 = getelementptr inbounds float* %tmp15005, i64 1
+ %tmp15007 = getelementptr inbounds float* %tmp15006, i64 1
+ %tmp15008 = getelementptr inbounds float* %tmp15007, i64 1
+ %tmp15009 = getelementptr inbounds float* %tmp15008, i64 1
+ %tmp15010 = getelementptr inbounds float* %tmp15009, i64 1
+ %tmp15011 = getelementptr inbounds float* %tmp15010, i64 1
+ %tmp15012 = getelementptr inbounds float* %tmp15011, i64 1
+ %tmp15013 = getelementptr inbounds float* %tmp15012, i64 1
+ %tmp15014 = getelementptr inbounds float* %tmp15013, i64 1
+ %tmp15015 = getelementptr inbounds float* %tmp15014, i64 1
+ %tmp15016 = getelementptr inbounds float* %tmp15015, i64 1
+ %tmp15017 = getelementptr inbounds float* %tmp15016, i64 1
+ %tmp15018 = getelementptr inbounds float* %tmp15017, i64 1
+ %tmp15019 = getelementptr inbounds float* %tmp15018, i64 1
+ %tmp15020 = getelementptr inbounds float* %tmp15019, i64 1
+ %tmp15021 = getelementptr inbounds float* %tmp15020, i64 1
+ %tmp15022 = getelementptr inbounds float* %tmp15021, i64 1
+ %tmp15023 = getelementptr inbounds float* %tmp15022, i64 1
+ %tmp15024 = getelementptr inbounds float* %tmp15023, i64 1
+ %tmp15025 = getelementptr inbounds float* %tmp15024, i64 1
+ %tmp15026 = getelementptr inbounds float* %tmp15025, i64 1
+ %tmp15027 = getelementptr inbounds float* %tmp15026, i64 1
+ %tmp15028 = getelementptr inbounds float* %tmp15027, i64 1
+ %tmp15029 = getelementptr inbounds float* %tmp15028, i64 1
+ %tmp15030 = getelementptr inbounds float* %tmp15029, i64 1
+ %tmp15031 = getelementptr inbounds float* %tmp15030, i64 1
+ %tmp15032 = getelementptr inbounds float* %tmp15031, i64 1
+ %tmp15033 = getelementptr inbounds float* %tmp15032, i64 1
+ %tmp15034 = getelementptr inbounds float* %tmp15033, i64 1
+ %tmp15035 = getelementptr inbounds float* %tmp15034, i64 1
+ %tmp15036 = getelementptr inbounds float* %tmp15035, i64 1
+ %tmp15037 = getelementptr inbounds float* %tmp15036, i64 1
+ %tmp15038 = getelementptr inbounds float* %tmp15037, i64 1
+ %tmp15039 = getelementptr inbounds float* %tmp15038, i64 1
+ %tmp15040 = getelementptr inbounds float* %tmp15039, i64 1
+ %tmp15041 = getelementptr inbounds float* %tmp15040, i64 1
+ %tmp15042 = getelementptr inbounds float* %tmp15041, i64 1
+ %tmp15043 = getelementptr inbounds float* %tmp15042, i64 1
+ %tmp15044 = getelementptr inbounds float* %tmp15043, i64 1
+ %tmp15045 = getelementptr inbounds float* %tmp15044, i64 1
+ %tmp15046 = getelementptr inbounds float* %tmp15045, i64 1
+ %tmp15047 = getelementptr inbounds float* %tmp15046, i64 1
+ %tmp15048 = getelementptr inbounds float* %tmp15047, i64 1
+ %tmp15049 = getelementptr inbounds float* %tmp15048, i64 1
+ %tmp15050 = getelementptr inbounds float* %tmp15049, i64 1
+ %tmp15051 = getelementptr inbounds float* %tmp15050, i64 1
+ %tmp15052 = getelementptr inbounds float* %tmp15051, i64 1
+ %tmp15053 = getelementptr inbounds float* %tmp15052, i64 1
+ %tmp15054 = getelementptr inbounds float* %tmp15053, i64 1
+ %tmp15055 = getelementptr inbounds float* %tmp15054, i64 1
+ %tmp15056 = getelementptr inbounds float* %tmp15055, i64 1
+ %tmp15057 = getelementptr inbounds float* %tmp15056, i64 1
+ %tmp15058 = getelementptr inbounds float* %tmp15057, i64 1
+ %tmp15059 = getelementptr inbounds float* %tmp15058, i64 1
+ %tmp15060 = getelementptr inbounds float* %tmp15059, i64 1
+ %tmp15061 = getelementptr inbounds float* %tmp15060, i64 1
+ %tmp15062 = getelementptr inbounds float* %tmp15061, i64 1
+ %tmp15063 = getelementptr inbounds float* %tmp15062, i64 1
+ %tmp15064 = getelementptr inbounds float* %tmp15063, i64 1
+ %tmp15065 = getelementptr inbounds float* %tmp15064, i64 1
+ %tmp15066 = getelementptr inbounds float* %tmp15065, i64 1
+ %tmp15067 = getelementptr inbounds float* %tmp15066, i64 1
+ %tmp15068 = getelementptr inbounds float* %tmp15067, i64 1
+ %tmp15069 = getelementptr inbounds float* %tmp15068, i64 1
+ %tmp15070 = getelementptr inbounds float* %tmp15069, i64 1
+ %tmp15071 = getelementptr inbounds float* %tmp15070, i64 1
+ %tmp15072 = getelementptr inbounds float* %tmp15071, i64 1
+ %tmp15073 = getelementptr inbounds float* %tmp15072, i64 1
+ %tmp15074 = getelementptr inbounds float* %tmp15073, i64 1
+ %tmp15075 = getelementptr inbounds float* %tmp15074, i64 1
+ %tmp15076 = getelementptr inbounds float* %tmp15075, i64 1
+ %tmp15077 = getelementptr inbounds float* %tmp15076, i64 1
+ %tmp15078 = getelementptr inbounds float* %tmp15077, i64 1
+ %tmp15079 = getelementptr inbounds float* %tmp15078, i64 1
+ %tmp15080 = getelementptr inbounds float* %tmp15079, i64 1
+ %tmp15081 = getelementptr inbounds float* %tmp15080, i64 1
+ %tmp15082 = getelementptr inbounds float* %tmp15081, i64 1
+ %tmp15083 = getelementptr inbounds float* %tmp15082, i64 1
+ %tmp15084 = getelementptr inbounds float* %tmp15083, i64 1
+ %tmp15085 = getelementptr inbounds float* %tmp15084, i64 1
+ %tmp15086 = getelementptr inbounds float* %tmp15085, i64 1
+ %tmp15087 = getelementptr inbounds float* %tmp15086, i64 1
+ %tmp15088 = getelementptr inbounds float* %tmp15087, i64 1
+ %tmp15089 = getelementptr inbounds float* %tmp15088, i64 1
+ %tmp15090 = getelementptr inbounds float* %tmp15089, i64 1
+ %tmp15091 = getelementptr inbounds float* %tmp15090, i64 1
+ %tmp15092 = getelementptr inbounds float* %tmp15091, i64 1
+ %tmp15093 = getelementptr inbounds float* %tmp15092, i64 1
+ %tmp15094 = getelementptr inbounds float* %tmp15093, i64 1
+ %tmp15095 = getelementptr inbounds float* %tmp15094, i64 1
+ %tmp15096 = getelementptr inbounds float* %tmp15095, i64 1
+ %tmp15097 = getelementptr inbounds float* %tmp15096, i64 1
+ %tmp15098 = getelementptr inbounds float* %tmp15097, i64 1
+ %tmp15099 = getelementptr inbounds float* %tmp15098, i64 1
+ %tmp15100 = getelementptr inbounds float* %tmp15099, i64 1
+ %tmp15101 = getelementptr inbounds float* %tmp15100, i64 1
+ %tmp15102 = getelementptr inbounds float* %tmp15101, i64 1
+ %tmp15103 = getelementptr inbounds float* %tmp15102, i64 1
+ %tmp15104 = getelementptr inbounds float* %tmp15103, i64 1
+ %tmp15105 = getelementptr inbounds float* %tmp15104, i64 1
+ %tmp15106 = getelementptr inbounds float* %tmp15105, i64 1
+ %tmp15107 = getelementptr inbounds float* %tmp15106, i64 1
+ %tmp15108 = getelementptr inbounds float* %tmp15107, i64 1
+ %tmp15109 = getelementptr inbounds float* %tmp15108, i64 1
+ %tmp15110 = getelementptr inbounds float* %tmp15109, i64 1
+ %tmp15111 = getelementptr inbounds float* %tmp15110, i64 1
+ %tmp15112 = getelementptr inbounds float* %tmp15111, i64 1
+ %tmp15113 = getelementptr inbounds float* %tmp15112, i64 1
+ %tmp15114 = getelementptr inbounds float* %tmp15113, i64 1
+ %tmp15115 = getelementptr inbounds float* %tmp15114, i64 1
+ %tmp15116 = getelementptr inbounds float* %tmp15115, i64 1
+ %tmp15117 = getelementptr inbounds float* %tmp15116, i64 1
+ %tmp15118 = getelementptr inbounds float* %tmp15117, i64 1
+ %tmp15119 = getelementptr inbounds float* %tmp15118, i64 1
+ %tmp15120 = getelementptr inbounds float* %tmp15119, i64 1
+ %tmp15121 = getelementptr inbounds float* %tmp15120, i64 1
+ %tmp15122 = getelementptr inbounds float* %tmp15121, i64 1
+ %tmp15123 = getelementptr inbounds float* %tmp15122, i64 1
+ %tmp15124 = getelementptr inbounds float* %tmp15123, i64 1
+ %tmp15125 = getelementptr inbounds float* %tmp15124, i64 1
+ %tmp15126 = getelementptr inbounds float* %tmp15125, i64 1
+ %tmp15127 = getelementptr inbounds float* %tmp15126, i64 1
+ %tmp15128 = getelementptr inbounds float* %tmp15127, i64 1
+ %tmp15129 = getelementptr inbounds float* %tmp15128, i64 1
+ %tmp15130 = getelementptr inbounds float* %tmp15129, i64 1
+ %tmp15131 = getelementptr inbounds float* %tmp15130, i64 1
+ %tmp15132 = getelementptr inbounds float* %tmp15131, i64 1
+ %tmp15133 = getelementptr inbounds float* %tmp15132, i64 1
+ %tmp15134 = getelementptr inbounds float* %tmp15133, i64 1
+ %tmp15135 = getelementptr inbounds float* %tmp15134, i64 1
+ %tmp15136 = getelementptr inbounds float* %tmp15135, i64 1
+ %tmp15137 = getelementptr inbounds float* %tmp15136, i64 1
+ %tmp15138 = getelementptr inbounds float* %tmp15137, i64 1
+ %tmp15139 = getelementptr inbounds float* %tmp15138, i64 1
+ %tmp15140 = getelementptr inbounds float* %tmp15139, i64 1
+ %tmp15141 = getelementptr inbounds float* %tmp15140, i64 1
+ %tmp15142 = getelementptr inbounds float* %tmp15141, i64 1
+ %tmp15143 = getelementptr inbounds float* %tmp15142, i64 1
+ %tmp15144 = getelementptr inbounds float* %tmp15143, i64 1
+ %tmp15145 = getelementptr inbounds float* %tmp15144, i64 1
+ %tmp15146 = getelementptr inbounds float* %tmp15145, i64 1
+ %tmp15147 = getelementptr inbounds float* %tmp15146, i64 1
+ %tmp15148 = getelementptr inbounds float* %tmp15147, i64 1
+ %tmp15149 = getelementptr inbounds float* %tmp15148, i64 1
+ %tmp15150 = getelementptr inbounds float* %tmp15149, i64 1
+ %tmp15151 = getelementptr inbounds float* %tmp15150, i64 1
+ %tmp15152 = getelementptr inbounds float* %tmp15151, i64 1
+ %tmp15153 = getelementptr inbounds float* %tmp15152, i64 1
+ %tmp15154 = getelementptr inbounds float* %tmp15153, i64 1
+ %tmp15155 = getelementptr inbounds float* %tmp15154, i64 1
+ %tmp15156 = getelementptr inbounds float* %tmp15155, i64 1
+ %tmp15157 = getelementptr inbounds float* %tmp15156, i64 1
+ %tmp15158 = getelementptr inbounds float* %tmp15157, i64 1
+ %tmp15159 = getelementptr inbounds float* %tmp15158, i64 1
+ %tmp15160 = getelementptr inbounds float* %tmp15159, i64 1
+ %tmp15161 = getelementptr inbounds float* %tmp15160, i64 1
+ %tmp15162 = getelementptr inbounds float* %tmp15161, i64 1
+ %tmp15163 = getelementptr inbounds float* %tmp15162, i64 1
+ %tmp15164 = getelementptr inbounds float* %tmp15163, i64 1
+ %tmp15165 = getelementptr inbounds float* %tmp15164, i64 1
+ %tmp15166 = getelementptr inbounds float* %tmp15165, i64 1
+ %tmp15167 = getelementptr inbounds float* %tmp15166, i64 1
+ %tmp15168 = getelementptr inbounds float* %tmp15167, i64 1
+ %tmp15169 = getelementptr inbounds float* %tmp15168, i64 1
+ %tmp15170 = getelementptr inbounds float* %tmp15169, i64 1
+ %tmp15171 = getelementptr inbounds float* %tmp15170, i64 1
+ %tmp15172 = getelementptr inbounds float* %tmp15171, i64 1
+ %tmp15173 = getelementptr inbounds float* %tmp15172, i64 1
+ %tmp15174 = getelementptr inbounds float* %tmp15173, i64 1
+ %tmp15175 = getelementptr inbounds float* %tmp15174, i64 1
+ %tmp15176 = getelementptr inbounds float* %tmp15175, i64 1
+ %tmp15177 = getelementptr inbounds float* %tmp15176, i64 1
+ %tmp15178 = getelementptr inbounds float* %tmp15177, i64 1
+ %tmp15179 = getelementptr inbounds float* %tmp15178, i64 1
+ %tmp15180 = getelementptr inbounds float* %tmp15179, i64 1
+ %tmp15181 = getelementptr inbounds float* %tmp15180, i64 1
+ %tmp15182 = getelementptr inbounds float* %tmp15181, i64 1
+ %tmp15183 = getelementptr inbounds float* %tmp15182, i64 1
+ %tmp15184 = getelementptr inbounds float* %tmp15183, i64 1
+ %tmp15185 = getelementptr inbounds float* %tmp15184, i64 1
+ %tmp15186 = getelementptr inbounds float* %tmp15185, i64 1
+ %tmp15187 = getelementptr inbounds float* %tmp15186, i64 1
+ %tmp15188 = getelementptr inbounds float* %tmp15187, i64 1
+ %tmp15189 = getelementptr inbounds float* %tmp15188, i64 1
+ %tmp15190 = getelementptr inbounds float* %tmp15189, i64 1
+ %tmp15191 = getelementptr inbounds float* %tmp15190, i64 1
+ %tmp15192 = getelementptr inbounds float* %tmp15191, i64 1
+ %tmp15193 = getelementptr inbounds float* %tmp15192, i64 1
+ %tmp15194 = getelementptr inbounds float* %tmp15193, i64 1
+ %tmp15195 = getelementptr inbounds float* %tmp15194, i64 1
+ %tmp15196 = getelementptr inbounds float* %tmp15195, i64 1
+ %tmp15197 = getelementptr inbounds float* %tmp15196, i64 1
+ %tmp15198 = getelementptr inbounds float* %tmp15197, i64 1
+ %tmp15199 = getelementptr inbounds float* %tmp15198, i64 1
+ %tmp15200 = getelementptr inbounds float* %tmp15199, i64 1
+ %tmp15201 = getelementptr inbounds float* %tmp15200, i64 1
+ %tmp15202 = getelementptr inbounds float* %tmp15201, i64 1
+ %tmp15203 = getelementptr inbounds float* %tmp15202, i64 1
+ %tmp15204 = getelementptr inbounds float* %tmp15203, i64 1
+ %tmp15205 = getelementptr inbounds float* %tmp15204, i64 1
+ %tmp15206 = getelementptr inbounds float* %tmp15205, i64 1
+ %tmp15207 = getelementptr inbounds float* %tmp15206, i64 1
+ %tmp15208 = getelementptr inbounds float* %tmp15207, i64 1
+ %tmp15209 = getelementptr inbounds float* %tmp15208, i64 1
+ %tmp15210 = getelementptr inbounds float* %tmp15209, i64 1
+ %tmp15211 = getelementptr inbounds float* %tmp15210, i64 1
+ %tmp15212 = getelementptr inbounds float* %tmp15211, i64 1
+ %tmp15213 = getelementptr inbounds float* %tmp15212, i64 1
+ %tmp15214 = getelementptr inbounds float* %tmp15213, i64 1
+ %tmp15215 = getelementptr inbounds float* %tmp15214, i64 1
+ %tmp15216 = getelementptr inbounds float* %tmp15215, i64 1
+ %tmp15217 = getelementptr inbounds float* %tmp15216, i64 1
+ %tmp15218 = getelementptr inbounds float* %tmp15217, i64 1
+ %tmp15219 = getelementptr inbounds float* %tmp15218, i64 1
+ %tmp15220 = getelementptr inbounds float* %tmp15219, i64 1
+ %tmp15221 = getelementptr inbounds float* %tmp15220, i64 1
+ %tmp15222 = getelementptr inbounds float* %tmp15221, i64 1
+ %tmp15223 = getelementptr inbounds float* %tmp15222, i64 1
+ %tmp15224 = getelementptr inbounds float* %tmp15223, i64 1
+ %tmp15225 = getelementptr inbounds float* %tmp15224, i64 1
+ %tmp15226 = getelementptr inbounds float* %tmp15225, i64 1
+ %tmp15227 = getelementptr inbounds float* %tmp15226, i64 1
+ %tmp15228 = getelementptr inbounds float* %tmp15227, i64 1
+ %tmp15229 = getelementptr inbounds float* %tmp15228, i64 1
+ %tmp15230 = getelementptr inbounds float* %tmp15229, i64 1
+ %tmp15231 = getelementptr inbounds float* %tmp15230, i64 1
+ %tmp15232 = getelementptr inbounds float* %tmp15231, i64 1
+ %tmp15233 = getelementptr inbounds float* %tmp15232, i64 1
+ %tmp15234 = getelementptr inbounds float* %tmp15233, i64 1
+ %tmp15235 = getelementptr inbounds float* %tmp15234, i64 1
+ %tmp15236 = getelementptr inbounds float* %tmp15235, i64 1
+ %tmp15237 = getelementptr inbounds float* %tmp15236, i64 1
+ %tmp15238 = getelementptr inbounds float* %tmp15237, i64 1
+ %tmp15239 = getelementptr inbounds float* %tmp15238, i64 1
+ %tmp15240 = getelementptr inbounds float* %tmp15239, i64 1
+ %tmp15241 = getelementptr inbounds float* %tmp15240, i64 1
+ %tmp15242 = getelementptr inbounds float* %tmp15241, i64 1
+ %tmp15243 = getelementptr inbounds float* %tmp15242, i64 1
+ %tmp15244 = getelementptr inbounds float* %tmp15243, i64 1
+ %tmp15245 = getelementptr inbounds float* %tmp15244, i64 1
+ %tmp15246 = getelementptr inbounds float* %tmp15245, i64 1
+ %tmp15247 = getelementptr inbounds float* %tmp15246, i64 1
+ %tmp15248 = getelementptr inbounds float* %tmp15247, i64 1
+ %tmp15249 = getelementptr inbounds float* %tmp15248, i64 1
+ %tmp15250 = getelementptr inbounds float* %tmp15249, i64 1
+ %tmp15251 = getelementptr inbounds float* %tmp15250, i64 1
+ %tmp15252 = getelementptr inbounds float* %tmp15251, i64 1
+ %tmp15253 = getelementptr inbounds float* %tmp15252, i64 1
+ %tmp15254 = getelementptr inbounds float* %tmp15253, i64 1
+ %tmp15255 = getelementptr inbounds float* %tmp15254, i64 1
+ %tmp15256 = getelementptr inbounds float* %tmp15255, i64 1
+ %tmp15257 = getelementptr inbounds float* %tmp15256, i64 1
+ %tmp15258 = getelementptr inbounds float* %tmp15257, i64 1
+ %tmp15259 = getelementptr inbounds float* %tmp15258, i64 1
+ %tmp15260 = getelementptr inbounds float* %tmp15259, i64 1
+ %tmp15261 = getelementptr inbounds float* %tmp15260, i64 1
+ %tmp15262 = getelementptr inbounds float* %tmp15261, i64 1
+ %tmp15263 = getelementptr inbounds float* %tmp15262, i64 1
+ %tmp15264 = getelementptr inbounds float* %tmp15263, i64 1
+ %tmp15265 = getelementptr inbounds float* %tmp15264, i64 1
+ %tmp15266 = getelementptr inbounds float* %tmp15265, i64 1
+ %tmp15267 = getelementptr inbounds float* %tmp15266, i64 1
+ %tmp15268 = getelementptr inbounds float* %tmp15267, i64 1
+ %tmp15269 = getelementptr inbounds float* %tmp15268, i64 1
+ %tmp15270 = getelementptr inbounds float* %tmp15269, i64 1
+ %tmp15271 = getelementptr inbounds float* %tmp15270, i64 1
+ %tmp15272 = getelementptr inbounds float* %tmp15271, i64 1
+ %tmp15273 = getelementptr inbounds float* %tmp15272, i64 1
+ %tmp15274 = getelementptr inbounds float* %tmp15273, i64 1
+ %tmp15275 = getelementptr inbounds float* %tmp15274, i64 1
+ %tmp15276 = getelementptr inbounds float* %tmp15275, i64 1
+ %tmp15277 = getelementptr inbounds float* %tmp15276, i64 1
+ %tmp15278 = getelementptr inbounds float* %tmp15277, i64 1
+ %tmp15279 = getelementptr inbounds float* %tmp15278, i64 1
+ %tmp15280 = getelementptr inbounds float* %tmp15279, i64 1
+ %tmp15281 = getelementptr inbounds float* %tmp15280, i64 1
+ %tmp15282 = getelementptr inbounds float* %tmp15281, i64 1
+ %tmp15283 = getelementptr inbounds float* %tmp15282, i64 1
+ %tmp15284 = getelementptr inbounds float* %tmp15283, i64 1
+ %tmp15285 = getelementptr inbounds float* %tmp15284, i64 1
+ %tmp15286 = getelementptr inbounds float* %tmp15285, i64 1
+ %tmp15287 = getelementptr inbounds float* %tmp15286, i64 1
+ %tmp15288 = getelementptr inbounds float* %tmp15287, i64 1
+ %tmp15289 = getelementptr inbounds float* %tmp15288, i64 1
+ %tmp15290 = getelementptr inbounds float* %tmp15289, i64 1
+ %tmp15291 = getelementptr inbounds float* %tmp15290, i64 1
+ %tmp15292 = getelementptr inbounds float* %tmp15291, i64 1
+ %tmp15293 = getelementptr inbounds float* %tmp15292, i64 1
+ %tmp15294 = getelementptr inbounds float* %tmp15293, i64 1
+ %tmp15295 = getelementptr inbounds float* %tmp15294, i64 1
+ %tmp15296 = getelementptr inbounds float* %tmp15295, i64 1
+ %tmp15297 = getelementptr inbounds float* %tmp15296, i64 1
+ %tmp15298 = getelementptr inbounds float* %tmp15297, i64 1
+ %tmp15299 = getelementptr inbounds float* %tmp15298, i64 1
+ %tmp15300 = getelementptr inbounds float* %tmp15299, i64 1
+ %tmp15301 = getelementptr inbounds float* %tmp15300, i64 1
+ %tmp15302 = getelementptr inbounds float* %tmp15301, i64 1
+ %tmp15303 = getelementptr inbounds float* %tmp15302, i64 1
+ %tmp15304 = getelementptr inbounds float* %tmp15303, i64 1
+ %tmp15305 = getelementptr inbounds float* %tmp15304, i64 1
+ %tmp15306 = getelementptr inbounds float* %tmp15305, i64 1
+ %tmp15307 = getelementptr inbounds float* %tmp15306, i64 1
+ %tmp15308 = getelementptr inbounds float* %tmp15307, i64 1
+ %tmp15309 = getelementptr inbounds float* %tmp15308, i64 1
+ %tmp15310 = getelementptr inbounds float* %tmp15309, i64 1
+ %tmp15311 = getelementptr inbounds float* %tmp15310, i64 1
+ %tmp15312 = getelementptr inbounds float* %tmp15311, i64 1
+ %tmp15313 = getelementptr inbounds float* %tmp15312, i64 1
+ %tmp15314 = getelementptr inbounds float* %tmp15313, i64 1
+ %tmp15315 = getelementptr inbounds float* %tmp15314, i64 1
+ %tmp15316 = getelementptr inbounds float* %tmp15315, i64 1
+ %tmp15317 = getelementptr inbounds float* %tmp15316, i64 1
+ %tmp15318 = getelementptr inbounds float* %tmp15317, i64 1
+ %tmp15319 = getelementptr inbounds float* %tmp15318, i64 1
+ %tmp15320 = getelementptr inbounds float* %tmp15319, i64 1
+ %tmp15321 = getelementptr inbounds float* %tmp15320, i64 1
+ %tmp15322 = getelementptr inbounds float* %tmp15321, i64 1
+ %tmp15323 = getelementptr inbounds float* %tmp15322, i64 1
+ %tmp15324 = getelementptr inbounds float* %tmp15323, i64 1
+ %tmp15325 = getelementptr inbounds float* %tmp15324, i64 1
+ %tmp15326 = getelementptr inbounds float* %tmp15325, i64 1
+ %tmp15327 = getelementptr inbounds float* %tmp15326, i64 1
+ %tmp15328 = getelementptr inbounds float* %tmp15327, i64 1
+ %tmp15329 = getelementptr inbounds float* %tmp15328, i64 1
+ %tmp15330 = getelementptr inbounds float* %tmp15329, i64 1
+ %tmp15331 = getelementptr inbounds float* %tmp15330, i64 1
+ %tmp15332 = getelementptr inbounds float* %tmp15331, i64 1
+ %tmp15333 = getelementptr inbounds float* %tmp15332, i64 1
+ %tmp15334 = getelementptr inbounds float* %tmp15333, i64 1
+ %tmp15335 = getelementptr inbounds float* %tmp15334, i64 1
+ %tmp15336 = getelementptr inbounds float* %tmp15335, i64 1
+ %tmp15337 = getelementptr inbounds float* %tmp15336, i64 1
+ %tmp15338 = getelementptr inbounds float* %tmp15337, i64 1
+ %tmp15339 = getelementptr inbounds float* %tmp15338, i64 1
+ %tmp15340 = getelementptr inbounds float* %tmp15339, i64 1
+ %tmp15341 = getelementptr inbounds float* %tmp15340, i64 1
+ %tmp15342 = getelementptr inbounds float* %tmp15341, i64 1
+ %tmp15343 = getelementptr inbounds float* %tmp15342, i64 1
+ %tmp15344 = getelementptr inbounds float* %tmp15343, i64 1
+ %tmp15345 = getelementptr inbounds float* %tmp15344, i64 1
+ %tmp15346 = getelementptr inbounds float* %tmp15345, i64 1
+ %tmp15347 = getelementptr inbounds float* %tmp15346, i64 1
+ %tmp15348 = getelementptr inbounds float* %tmp15347, i64 1
+ %tmp15349 = getelementptr inbounds float* %tmp15348, i64 1
+ %tmp15350 = getelementptr inbounds float* %tmp15349, i64 1
+ %tmp15351 = getelementptr inbounds float* %tmp15350, i64 1
+ %tmp15352 = getelementptr inbounds float* %tmp15351, i64 1
+ %tmp15353 = getelementptr inbounds float* %tmp15352, i64 1
+ %tmp15354 = getelementptr inbounds float* %tmp15353, i64 1
+ %tmp15355 = getelementptr inbounds float* %tmp15354, i64 1
+ %tmp15356 = getelementptr inbounds float* %tmp15355, i64 1
+ %tmp15357 = getelementptr inbounds float* %tmp15356, i64 1
+ %tmp15358 = getelementptr inbounds float* %tmp15357, i64 1
+ %tmp15359 = getelementptr inbounds float* %tmp15358, i64 1
+ %tmp15360 = getelementptr inbounds float* %tmp15359, i64 1
+ %tmp15361 = getelementptr inbounds float* %tmp15360, i64 1
+ %tmp15362 = getelementptr inbounds float* %tmp15361, i64 1
+ %tmp15363 = getelementptr inbounds float* %tmp15362, i64 1
+ %tmp15364 = getelementptr inbounds float* %tmp15363, i64 1
+ %tmp15365 = getelementptr inbounds float* %tmp15364, i64 1
+ %tmp15366 = getelementptr inbounds float* %tmp15365, i64 1
+ %tmp15367 = getelementptr inbounds float* %tmp15366, i64 1
+ %tmp15368 = getelementptr inbounds float* %tmp15367, i64 1
+ %tmp15369 = getelementptr inbounds float* %tmp15368, i64 1
+ %tmp15370 = getelementptr inbounds float* %tmp15369, i64 1
+ %tmp15371 = getelementptr inbounds float* %tmp15370, i64 1
+ %tmp15372 = getelementptr inbounds float* %tmp15371, i64 1
+ %tmp15373 = getelementptr inbounds float* %tmp15372, i64 1
+ %tmp15374 = getelementptr inbounds float* %tmp15373, i64 1
+ %tmp15375 = getelementptr inbounds float* %tmp15374, i64 1
+ %tmp15376 = getelementptr inbounds float* %tmp15375, i64 1
+ %tmp15377 = getelementptr inbounds float* %tmp15376, i64 1
+ %tmp15378 = getelementptr inbounds float* %tmp15377, i64 1
+ %tmp15379 = getelementptr inbounds float* %tmp15378, i64 1
+ %tmp15380 = getelementptr inbounds float* %tmp15379, i64 1
+ %tmp15381 = getelementptr inbounds float* %tmp15380, i64 1
+ %tmp15382 = getelementptr inbounds float* %tmp15381, i64 1
+ %tmp15383 = getelementptr inbounds float* %tmp15382, i64 1
+ %tmp15384 = getelementptr inbounds float* %tmp15383, i64 1
+ %tmp15385 = getelementptr inbounds float* %tmp15384, i64 1
+ %tmp15386 = getelementptr inbounds float* %tmp15385, i64 1
+ %tmp15387 = getelementptr inbounds float* %tmp15386, i64 1
+ %tmp15388 = getelementptr inbounds float* %tmp15387, i64 1
+ %tmp15389 = getelementptr inbounds float* %tmp15388, i64 1
+ %tmp15390 = getelementptr inbounds float* %tmp15389, i64 1
+ %tmp15391 = getelementptr inbounds float* %tmp15390, i64 1
+ %tmp15392 = getelementptr inbounds float* %tmp15391, i64 1
+ %tmp15393 = getelementptr inbounds float* %tmp15392, i64 1
+ %tmp15394 = getelementptr inbounds float* %tmp15393, i64 1
+ %tmp15395 = getelementptr inbounds float* %tmp15394, i64 1
+ %tmp15396 = getelementptr inbounds float* %tmp15395, i64 1
+ %tmp15397 = getelementptr inbounds float* %tmp15396, i64 1
+ %tmp15398 = getelementptr inbounds float* %tmp15397, i64 1
+ %tmp15399 = getelementptr inbounds float* %tmp15398, i64 1
+ %tmp15400 = getelementptr inbounds float* %tmp15399, i64 1
+ %tmp15401 = getelementptr inbounds float* %tmp15400, i64 1
+ %tmp15402 = getelementptr inbounds float* %tmp15401, i64 1
+ %tmp15403 = getelementptr inbounds float* %tmp15402, i64 1
+ %tmp15404 = getelementptr inbounds float* %tmp15403, i64 1
+ %tmp15405 = getelementptr inbounds float* %tmp15404, i64 1
+ %tmp15406 = getelementptr inbounds float* %tmp15405, i64 1
+ %tmp15407 = getelementptr inbounds float* %tmp15406, i64 1
+ %tmp15408 = getelementptr inbounds float* %tmp15407, i64 1
+ %tmp15409 = getelementptr inbounds float* %tmp15408, i64 1
+ %tmp15410 = getelementptr inbounds float* %tmp15409, i64 1
+ %tmp15411 = getelementptr inbounds float* %tmp15410, i64 1
+ %tmp15412 = getelementptr inbounds float* %tmp15411, i64 1
+ %tmp15413 = getelementptr inbounds float* %tmp15412, i64 1
+ %tmp15414 = getelementptr inbounds float* %tmp15413, i64 1
+ %tmp15415 = getelementptr inbounds float* %tmp15414, i64 1
+ %tmp15416 = getelementptr inbounds float* %tmp15415, i64 1
+ %tmp15417 = getelementptr inbounds float* %tmp15416, i64 1
+ %tmp15418 = getelementptr inbounds float* %tmp15417, i64 1
+ %tmp15419 = getelementptr inbounds float* %tmp15418, i64 1
+ %tmp15420 = getelementptr inbounds float* %tmp15419, i64 1
+ %tmp15421 = getelementptr inbounds float* %tmp15420, i64 1
+ %tmp15422 = getelementptr inbounds float* %tmp15421, i64 1
+ %tmp15423 = getelementptr inbounds float* %tmp15422, i64 1
+ %tmp15424 = getelementptr inbounds float* %tmp15423, i64 1
+ %tmp15425 = getelementptr inbounds float* %tmp15424, i64 1
+ %tmp15426 = getelementptr inbounds float* %tmp15425, i64 1
+ %tmp15427 = getelementptr inbounds float* %tmp15426, i64 1
+ %tmp15428 = getelementptr inbounds float* %tmp15427, i64 1
+ %tmp15429 = getelementptr inbounds float* %tmp15428, i64 1
+ %tmp15430 = getelementptr inbounds float* %tmp15429, i64 1
+ %tmp15431 = getelementptr inbounds float* %tmp15430, i64 1
+ %tmp15432 = getelementptr inbounds float* %tmp15431, i64 1
+ %tmp15433 = getelementptr inbounds float* %tmp15432, i64 1
+ %tmp15434 = getelementptr inbounds float* %tmp15433, i64 1
+ %tmp15435 = getelementptr inbounds float* %tmp15434, i64 1
+ %tmp15436 = getelementptr inbounds float* %tmp15435, i64 1
+ %tmp15437 = getelementptr inbounds float* %tmp15436, i64 1
+ %tmp15438 = getelementptr inbounds float* %tmp15437, i64 1
+ %tmp15439 = getelementptr inbounds float* %tmp15438, i64 1
+ %tmp15440 = getelementptr inbounds float* %tmp15439, i64 1
+ %tmp15441 = getelementptr inbounds float* %tmp15440, i64 1
+ %tmp15442 = getelementptr inbounds float* %tmp15441, i64 1
+ %tmp15443 = getelementptr inbounds float* %tmp15442, i64 1
+ %tmp15444 = getelementptr inbounds float* %tmp15443, i64 1
+ %tmp15445 = getelementptr inbounds float* %tmp15444, i64 1
+ %tmp15446 = getelementptr inbounds float* %tmp15445, i64 1
+ %tmp15447 = getelementptr inbounds float* %tmp15446, i64 1
+ %tmp15448 = getelementptr inbounds float* %tmp15447, i64 1
+ %tmp15449 = getelementptr inbounds float* %tmp15448, i64 1
+ %tmp15450 = getelementptr inbounds float* %tmp15449, i64 1
+ %tmp15451 = getelementptr inbounds float* %tmp15450, i64 1
+ %tmp15452 = getelementptr inbounds float* %tmp15451, i64 1
+ %tmp15453 = getelementptr inbounds float* %tmp15452, i64 1
+ %tmp15454 = getelementptr inbounds float* %tmp15453, i64 1
+ %tmp15455 = getelementptr inbounds float* %tmp15454, i64 1
+ %tmp15456 = getelementptr inbounds float* %tmp15455, i64 1
+ %tmp15457 = getelementptr inbounds float* %tmp15456, i64 1
+ %tmp15458 = getelementptr inbounds float* %tmp15457, i64 1
+ %tmp15459 = getelementptr inbounds float* %tmp15458, i64 1
+ %tmp15460 = getelementptr inbounds float* %tmp15459, i64 1
+ %tmp15461 = getelementptr inbounds float* %tmp15460, i64 1
+ %tmp15462 = getelementptr inbounds float* %tmp15461, i64 1
+ %tmp15463 = getelementptr inbounds float* %tmp15462, i64 1
+ %tmp15464 = getelementptr inbounds float* %tmp15463, i64 1
+ %tmp15465 = getelementptr inbounds float* %tmp15464, i64 1
+ %tmp15466 = getelementptr inbounds float* %tmp15465, i64 1
+ %tmp15467 = getelementptr inbounds float* %tmp15466, i64 1
+ %tmp15468 = getelementptr inbounds float* %tmp15467, i64 1
+ %tmp15469 = getelementptr inbounds float* %tmp15468, i64 1
+ %tmp15470 = getelementptr inbounds float* %tmp15469, i64 1
+ %tmp15471 = getelementptr inbounds float* %tmp15470, i64 1
+ %tmp15472 = getelementptr inbounds float* %tmp15471, i64 1
+ %tmp15473 = getelementptr inbounds float* %tmp15472, i64 1
+ %tmp15474 = getelementptr inbounds float* %tmp15473, i64 1
+ %tmp15475 = getelementptr inbounds float* %tmp15474, i64 1
+ %tmp15476 = getelementptr inbounds float* %tmp15475, i64 1
+ %tmp15477 = getelementptr inbounds float* %tmp15476, i64 1
+ %tmp15478 = getelementptr inbounds float* %tmp15477, i64 1
+ %tmp15479 = getelementptr inbounds float* %tmp15478, i64 1
+ %tmp15480 = getelementptr inbounds float* %tmp15479, i64 1
+ %tmp15481 = getelementptr inbounds float* %tmp15480, i64 1
+ %tmp15482 = getelementptr inbounds float* %tmp15481, i64 1
+ %tmp15483 = getelementptr inbounds float* %tmp15482, i64 1
+ %tmp15484 = getelementptr inbounds float* %tmp15483, i64 1
+ %tmp15485 = getelementptr inbounds float* %tmp15484, i64 1
+ %tmp15486 = getelementptr inbounds float* %tmp15485, i64 1
+ %tmp15487 = getelementptr inbounds float* %tmp15486, i64 1
+ %tmp15488 = getelementptr inbounds float* %tmp15487, i64 1
+ %tmp15489 = getelementptr inbounds float* %tmp15488, i64 1
+ %tmp15490 = getelementptr inbounds float* %tmp15489, i64 1
+ %tmp15491 = getelementptr inbounds float* %tmp15490, i64 1
+ %tmp15492 = getelementptr inbounds float* %tmp15491, i64 1
+ %tmp15493 = getelementptr inbounds float* %tmp15492, i64 1
+ %tmp15494 = getelementptr inbounds float* %tmp15493, i64 1
+ %tmp15495 = getelementptr inbounds float* %tmp15494, i64 1
+ %tmp15496 = getelementptr inbounds float* %tmp15495, i64 1
+ %tmp15497 = getelementptr inbounds float* %tmp15496, i64 1
+ %tmp15498 = getelementptr inbounds float* %tmp15497, i64 1
+ %tmp15499 = getelementptr inbounds float* %tmp15498, i64 1
+ %tmp15500 = getelementptr inbounds float* %tmp15499, i64 1
+ %tmp15501 = getelementptr inbounds float* %tmp15500, i64 1
+ %tmp15502 = getelementptr inbounds float* %tmp15501, i64 1
+ %tmp15503 = getelementptr inbounds float* %tmp15502, i64 1
+ %tmp15504 = getelementptr inbounds float* %tmp15503, i64 1
+ %tmp15505 = getelementptr inbounds float* %tmp15504, i64 1
+ %tmp15506 = getelementptr inbounds float* %tmp15505, i64 1
+ %tmp15507 = getelementptr inbounds float* %tmp15506, i64 1
+ %tmp15508 = getelementptr inbounds float* %tmp15507, i64 1
+ %tmp15509 = getelementptr inbounds float* %tmp15508, i64 1
+ %tmp15510 = getelementptr inbounds float* %tmp15509, i64 1
+ %tmp15511 = getelementptr inbounds float* %tmp15510, i64 1
+ %tmp15512 = getelementptr inbounds float* %tmp15511, i64 1
+ %tmp15513 = getelementptr inbounds float* %tmp15512, i64 1
+ %tmp15514 = getelementptr inbounds float* %tmp15513, i64 1
+ %tmp15515 = getelementptr inbounds float* %tmp15514, i64 1
+ %tmp15516 = getelementptr inbounds float* %tmp15515, i64 1
+ %tmp15517 = getelementptr inbounds float* %tmp15516, i64 1
+ %tmp15518 = getelementptr inbounds float* %tmp15517, i64 1
+ %tmp15519 = getelementptr inbounds float* %tmp15518, i64 1
+ %tmp15520 = getelementptr inbounds float* %tmp15519, i64 1
+ %tmp15521 = getelementptr inbounds float* %tmp15520, i64 1
+ %tmp15522 = getelementptr inbounds float* %tmp15521, i64 1
+ %tmp15523 = getelementptr inbounds float* %tmp15522, i64 1
+ %tmp15524 = getelementptr inbounds float* %tmp15523, i64 1
+ %tmp15525 = getelementptr inbounds float* %tmp15524, i64 1
+ %tmp15526 = getelementptr inbounds float* %tmp15525, i64 1
+ %tmp15527 = getelementptr inbounds float* %tmp15526, i64 1
+ %tmp15528 = getelementptr inbounds float* %tmp15527, i64 1
+ %tmp15529 = getelementptr inbounds float* %tmp15528, i64 1
+ %tmp15530 = getelementptr inbounds float* %tmp15529, i64 1
+ %tmp15531 = getelementptr inbounds float* %tmp15530, i64 1
+ %tmp15532 = getelementptr inbounds float* %tmp15531, i64 1
+ %tmp15533 = getelementptr inbounds float* %tmp15532, i64 1
+ %tmp15534 = getelementptr inbounds float* %tmp15533, i64 1
+ %tmp15535 = getelementptr inbounds float* %tmp15534, i64 1
+ %tmp15536 = getelementptr inbounds float* %tmp15535, i64 1
+ %tmp15537 = getelementptr inbounds float* %tmp15536, i64 1
+ %tmp15538 = getelementptr inbounds float* %tmp15537, i64 1
+ %tmp15539 = getelementptr inbounds float* %tmp15538, i64 1
+ %tmp15540 = getelementptr inbounds float* %tmp15539, i64 1
+ %tmp15541 = getelementptr inbounds float* %tmp15540, i64 1
+ %tmp15542 = getelementptr inbounds float* %tmp15541, i64 1
+ %tmp15543 = getelementptr inbounds float* %tmp15542, i64 1
+ %tmp15544 = getelementptr inbounds float* %tmp15543, i64 1
+ %tmp15545 = getelementptr inbounds float* %tmp15544, i64 1
+ %tmp15546 = getelementptr inbounds float* %tmp15545, i64 1
+ %tmp15547 = getelementptr inbounds float* %tmp15546, i64 1
+ %tmp15548 = getelementptr inbounds float* %tmp15547, i64 1
+ %tmp15549 = getelementptr inbounds float* %tmp15548, i64 1
+ %tmp15550 = getelementptr inbounds float* %tmp15549, i64 1
+ %tmp15551 = getelementptr inbounds float* %tmp15550, i64 1
+ %tmp15552 = getelementptr inbounds float* %tmp15551, i64 1
+ %tmp15553 = getelementptr inbounds float* %tmp15552, i64 1
+ %tmp15554 = getelementptr inbounds float* %tmp15553, i64 1
+ %tmp15555 = getelementptr inbounds float* %tmp15554, i64 1
+ %tmp15556 = getelementptr inbounds float* %tmp15555, i64 1
+ %tmp15557 = getelementptr inbounds float* %tmp15556, i64 1
+ %tmp15558 = getelementptr inbounds float* %tmp15557, i64 1
+ %tmp15559 = getelementptr inbounds float* %tmp15558, i64 1
+ %tmp15560 = getelementptr inbounds float* %tmp15559, i64 1
+ %tmp15561 = getelementptr inbounds float* %tmp15560, i64 1
+ %tmp15562 = getelementptr inbounds float* %tmp15561, i64 1
+ %tmp15563 = getelementptr inbounds float* %tmp15562, i64 1
+ %tmp15564 = getelementptr inbounds float* %tmp15563, i64 1
+ %tmp15565 = getelementptr inbounds float* %tmp15564, i64 1
+ %tmp15566 = getelementptr inbounds float* %tmp15565, i64 1
+ %tmp15567 = getelementptr inbounds float* %tmp15566, i64 1
+ %tmp15568 = getelementptr inbounds float* %tmp15567, i64 1
+ %tmp15569 = getelementptr inbounds float* %tmp15568, i64 1
+ %tmp15570 = getelementptr inbounds float* %tmp15569, i64 1
+ %tmp15571 = getelementptr inbounds float* %tmp15570, i64 1
+ %tmp15572 = getelementptr inbounds float* %tmp15571, i64 1
+ %tmp15573 = getelementptr inbounds float* %tmp15572, i64 1
+ %tmp15574 = getelementptr inbounds float* %tmp15573, i64 1
+ %tmp15575 = getelementptr inbounds float* %tmp15574, i64 1
+ %tmp15576 = getelementptr inbounds float* %tmp15575, i64 1
+ %tmp15577 = getelementptr inbounds float* %tmp15576, i64 1
+ %tmp15578 = getelementptr inbounds float* %tmp15577, i64 1
+ %tmp15579 = getelementptr inbounds float* %tmp15578, i64 1
+ %tmp15580 = getelementptr inbounds float* %tmp15579, i64 1
+ %tmp15581 = getelementptr inbounds float* %tmp15580, i64 1
+ %tmp15582 = getelementptr inbounds float* %tmp15581, i64 1
+ %tmp15583 = getelementptr inbounds float* %tmp15582, i64 1
+ %tmp15584 = getelementptr inbounds float* %tmp15583, i64 1
+ %tmp15585 = getelementptr inbounds float* %tmp15584, i64 1
+ %tmp15586 = getelementptr inbounds float* %tmp15585, i64 1
+ %tmp15587 = getelementptr inbounds float* %tmp15586, i64 1
+ %tmp15588 = getelementptr inbounds float* %tmp15587, i64 1
+ %tmp15589 = getelementptr inbounds float* %tmp15588, i64 1
+ %tmp15590 = getelementptr inbounds float* %tmp15589, i64 1
+ %tmp15591 = getelementptr inbounds float* %tmp15590, i64 1
+ %tmp15592 = getelementptr inbounds float* %tmp15591, i64 1
+ %tmp15593 = getelementptr inbounds float* %tmp15592, i64 1
+ %tmp15594 = getelementptr inbounds float* %tmp15593, i64 1
+ %tmp15595 = getelementptr inbounds float* %tmp15594, i64 1
+ %tmp15596 = getelementptr inbounds float* %tmp15595, i64 1
+ %tmp15597 = getelementptr inbounds float* %tmp15596, i64 1
+ %tmp15598 = getelementptr inbounds float* %tmp15597, i64 1
+ %tmp15599 = getelementptr inbounds float* %tmp15598, i64 1
+ %tmp15600 = getelementptr inbounds float* %tmp15599, i64 1
+ %tmp15601 = getelementptr inbounds float* %tmp15600, i64 1
+ %tmp15602 = getelementptr inbounds float* %tmp15601, i64 1
+ %tmp15603 = getelementptr inbounds float* %tmp15602, i64 1
+ %tmp15604 = getelementptr inbounds float* %tmp15603, i64 1
+ %tmp15605 = getelementptr inbounds float* %tmp15604, i64 1
+ %tmp15606 = getelementptr inbounds float* %tmp15605, i64 1
+ %tmp15607 = getelementptr inbounds float* %tmp15606, i64 1
+ %tmp15608 = getelementptr inbounds float* %tmp15607, i64 1
+ %tmp15609 = getelementptr inbounds float* %tmp15608, i64 1
+ %tmp15610 = getelementptr inbounds float* %tmp15609, i64 1
+ %tmp15611 = getelementptr inbounds float* %tmp15610, i64 1
+ %tmp15612 = getelementptr inbounds float* %tmp15611, i64 1
+ %tmp15613 = getelementptr inbounds float* %tmp15612, i64 1
+ %tmp15614 = getelementptr inbounds float* %tmp15613, i64 1
+ %tmp15615 = getelementptr inbounds float* %tmp15614, i64 1
+ %tmp15616 = getelementptr inbounds float* %tmp15615, i64 1
+ %tmp15617 = getelementptr inbounds float* %tmp15616, i64 1
+ %tmp15618 = getelementptr inbounds float* %tmp15617, i64 1
+ %tmp15619 = getelementptr inbounds float* %tmp15618, i64 1
+ %tmp15620 = getelementptr inbounds float* %tmp15619, i64 1
+ %tmp15621 = getelementptr inbounds float* %tmp15620, i64 1
+ %tmp15622 = getelementptr inbounds float* %tmp15621, i64 1
+ %tmp15623 = getelementptr inbounds float* %tmp15622, i64 1
+ %tmp15624 = getelementptr inbounds float* %tmp15623, i64 1
+ %tmp15625 = getelementptr inbounds float* %tmp15624, i64 1
+ %tmp15626 = getelementptr inbounds float* %tmp15625, i64 1
+ %tmp15627 = getelementptr inbounds float* %tmp15626, i64 1
+ %tmp15628 = getelementptr inbounds float* %tmp15627, i64 1
+ %tmp15629 = getelementptr inbounds float* %tmp15628, i64 1
+ %tmp15630 = getelementptr inbounds float* %tmp15629, i64 1
+ %tmp15631 = getelementptr inbounds float* %tmp15630, i64 1
+ %tmp15632 = getelementptr inbounds float* %tmp15631, i64 1
+ %tmp15633 = getelementptr inbounds float* %tmp15632, i64 1
+ %tmp15634 = getelementptr inbounds float* %tmp15633, i64 1
+ %tmp15635 = getelementptr inbounds float* %tmp15634, i64 1
+ %tmp15636 = getelementptr inbounds float* %tmp15635, i64 1
+ %tmp15637 = getelementptr inbounds float* %tmp15636, i64 1
+ %tmp15638 = getelementptr inbounds float* %tmp15637, i64 1
+ %tmp15639 = getelementptr inbounds float* %tmp15638, i64 1
+ %tmp15640 = getelementptr inbounds float* %tmp15639, i64 1
+ %tmp15641 = getelementptr inbounds float* %tmp15640, i64 1
+ %tmp15642 = getelementptr inbounds float* %tmp15641, i64 1
+ %tmp15643 = getelementptr inbounds float* %tmp15642, i64 1
+ %tmp15644 = getelementptr inbounds float* %tmp15643, i64 1
+ %tmp15645 = getelementptr inbounds float* %tmp15644, i64 1
+ %tmp15646 = getelementptr inbounds float* %tmp15645, i64 1
+ %tmp15647 = getelementptr inbounds float* %tmp15646, i64 1
+ %tmp15648 = getelementptr inbounds float* %tmp15647, i64 1
+ %tmp15649 = getelementptr inbounds float* %tmp15648, i64 1
+ %tmp15650 = getelementptr inbounds float* %tmp15649, i64 1
+ %tmp15651 = getelementptr inbounds float* %tmp15650, i64 1
+ %tmp15652 = getelementptr inbounds float* %tmp15651, i64 1
+ %tmp15653 = getelementptr inbounds float* %tmp15652, i64 1
+ %tmp15654 = getelementptr inbounds float* %tmp15653, i64 1
+ %tmp15655 = getelementptr inbounds float* %tmp15654, i64 1
+ %tmp15656 = getelementptr inbounds float* %tmp15655, i64 1
+ %tmp15657 = getelementptr inbounds float* %tmp15656, i64 1
+ %tmp15658 = getelementptr inbounds float* %tmp15657, i64 1
+ %tmp15659 = getelementptr inbounds float* %tmp15658, i64 1
+ %tmp15660 = getelementptr inbounds float* %tmp15659, i64 1
+ %tmp15661 = getelementptr inbounds float* %tmp15660, i64 1
+ %tmp15662 = getelementptr inbounds float* %tmp15661, i64 1
+ %tmp15663 = getelementptr inbounds float* %tmp15662, i64 1
+ %tmp15664 = getelementptr inbounds float* %tmp15663, i64 1
+ %tmp15665 = getelementptr inbounds float* %tmp15664, i64 1
+ %tmp15666 = getelementptr inbounds float* %tmp15665, i64 1
+ %tmp15667 = getelementptr inbounds float* %tmp15666, i64 1
+ %tmp15668 = getelementptr inbounds float* %tmp15667, i64 1
+ %tmp15669 = getelementptr inbounds float* %tmp15668, i64 1
+ %tmp15670 = getelementptr inbounds float* %tmp15669, i64 1
+ %tmp15671 = getelementptr inbounds float* %tmp15670, i64 1
+ %tmp15672 = getelementptr inbounds float* %tmp15671, i64 1
+ %tmp15673 = getelementptr inbounds float* %tmp15672, i64 1
+ %tmp15674 = getelementptr inbounds float* %tmp15673, i64 1
+ %tmp15675 = getelementptr inbounds float* %tmp15674, i64 1
+ %tmp15676 = getelementptr inbounds float* %tmp15675, i64 1
+ %tmp15677 = getelementptr inbounds float* %tmp15676, i64 1
+ %tmp15678 = getelementptr inbounds float* %tmp15677, i64 1
+ %tmp15679 = getelementptr inbounds float* %tmp15678, i64 1
+ %tmp15680 = getelementptr inbounds float* %tmp15679, i64 1
+ %tmp15681 = getelementptr inbounds float* %tmp15680, i64 1
+ %tmp15682 = getelementptr inbounds float* %tmp15681, i64 1
+ %tmp15683 = getelementptr inbounds float* %tmp15682, i64 1
+ %tmp15684 = getelementptr inbounds float* %tmp15683, i64 1
+ %tmp15685 = getelementptr inbounds float* %tmp15684, i64 1
+ %tmp15686 = getelementptr inbounds float* %tmp15685, i64 1
+ %tmp15687 = getelementptr inbounds float* %tmp15686, i64 1
+ %tmp15688 = getelementptr inbounds float* %tmp15687, i64 1
+ %tmp15689 = getelementptr inbounds float* %tmp15688, i64 1
+ %tmp15690 = getelementptr inbounds float* %tmp15689, i64 1
+ %tmp15691 = getelementptr inbounds float* %tmp15690, i64 1
+ %tmp15692 = getelementptr inbounds float* %tmp15691, i64 1
+ %tmp15693 = getelementptr inbounds float* %tmp15692, i64 1
+ %tmp15694 = getelementptr inbounds float* %tmp15693, i64 1
+ %tmp15695 = getelementptr inbounds float* %tmp15694, i64 1
+ %tmp15696 = getelementptr inbounds float* %tmp15695, i64 1
+ %tmp15697 = getelementptr inbounds float* %tmp15696, i64 1
+ %tmp15698 = getelementptr inbounds float* %tmp15697, i64 1
+ %tmp15699 = getelementptr inbounds float* %tmp15698, i64 1
+ %tmp15700 = getelementptr inbounds float* %tmp15699, i64 1
+ %tmp15701 = getelementptr inbounds float* %tmp15700, i64 1
+ %tmp15702 = getelementptr inbounds float* %tmp15701, i64 1
+ %tmp15703 = getelementptr inbounds float* %tmp15702, i64 1
+ %tmp15704 = getelementptr inbounds float* %tmp15703, i64 1
+ %tmp15705 = getelementptr inbounds float* %tmp15704, i64 1
+ %tmp15706 = getelementptr inbounds float* %tmp15705, i64 1
+ %tmp15707 = getelementptr inbounds float* %tmp15706, i64 1
+ %tmp15708 = getelementptr inbounds float* %tmp15707, i64 1
+ %tmp15709 = getelementptr inbounds float* %tmp15708, i64 1
+ %tmp15710 = getelementptr inbounds float* %tmp15709, i64 1
+ %tmp15711 = getelementptr inbounds float* %tmp15710, i64 1
+ %tmp15712 = getelementptr inbounds float* %tmp15711, i64 1
+ %tmp15713 = getelementptr inbounds float* %tmp15712, i64 1
+ %tmp15714 = getelementptr inbounds float* %tmp15713, i64 1
+ %tmp15715 = getelementptr inbounds float* %tmp15714, i64 1
+ %tmp15716 = getelementptr inbounds float* %tmp15715, i64 1
+ %tmp15717 = getelementptr inbounds float* %tmp15716, i64 1
+ %tmp15718 = getelementptr inbounds float* %tmp15717, i64 1
+ %tmp15719 = getelementptr inbounds float* %tmp15718, i64 1
+ %tmp15720 = getelementptr inbounds float* %tmp15719, i64 1
+ %tmp15721 = getelementptr inbounds float* %tmp15720, i64 1
+ %tmp15722 = getelementptr inbounds float* %tmp15721, i64 1
+ %tmp15723 = getelementptr inbounds float* %tmp15722, i64 1
+ %tmp15724 = getelementptr inbounds float* %tmp15723, i64 1
+ %tmp15725 = getelementptr inbounds float* %tmp15724, i64 1
+ %tmp15726 = getelementptr inbounds float* %tmp15725, i64 1
+ %tmp15727 = getelementptr inbounds float* %tmp15726, i64 1
+ %tmp15728 = getelementptr inbounds float* %tmp15727, i64 1
+ %tmp15729 = getelementptr inbounds float* %tmp15728, i64 1
+ %tmp15730 = getelementptr inbounds float* %tmp15729, i64 1
+ %tmp15731 = getelementptr inbounds float* %tmp15730, i64 1
+ %tmp15732 = getelementptr inbounds float* %tmp15731, i64 1
+ %tmp15733 = getelementptr inbounds float* %tmp15732, i64 1
+ %tmp15734 = getelementptr inbounds float* %tmp15733, i64 1
+ %tmp15735 = getelementptr inbounds float* %tmp15734, i64 1
+ %tmp15736 = getelementptr inbounds float* %tmp15735, i64 1
+ %tmp15737 = getelementptr inbounds float* %tmp15736, i64 1
+ %tmp15738 = getelementptr inbounds float* %tmp15737, i64 1
+ %tmp15739 = getelementptr inbounds float* %tmp15738, i64 1
+ %tmp15740 = getelementptr inbounds float* %tmp15739, i64 1
+ %tmp15741 = getelementptr inbounds float* %tmp15740, i64 1
+ %tmp15742 = getelementptr inbounds float* %tmp15741, i64 1
+ %tmp15743 = getelementptr inbounds float* %tmp15742, i64 1
+ %tmp15744 = getelementptr inbounds float* %tmp15743, i64 1
+ %tmp15745 = getelementptr inbounds float* %tmp15744, i64 1
+ %tmp15746 = getelementptr inbounds float* %tmp15745, i64 1
+ %tmp15747 = getelementptr inbounds float* %tmp15746, i64 1
+ %tmp15748 = getelementptr inbounds float* %tmp15747, i64 1
+ %tmp15749 = getelementptr inbounds float* %tmp15748, i64 1
+ %tmp15750 = getelementptr inbounds float* %tmp15749, i64 1
+ %tmp15751 = getelementptr inbounds float* %tmp15750, i64 1
+ %tmp15752 = getelementptr inbounds float* %tmp15751, i64 1
+ %tmp15753 = getelementptr inbounds float* %tmp15752, i64 1
+ %tmp15754 = getelementptr inbounds float* %tmp15753, i64 1
+ %tmp15755 = getelementptr inbounds float* %tmp15754, i64 1
+ %tmp15756 = getelementptr inbounds float* %tmp15755, i64 1
+ %tmp15757 = getelementptr inbounds float* %tmp15756, i64 1
+ %tmp15758 = getelementptr inbounds float* %tmp15757, i64 1
+ %tmp15759 = getelementptr inbounds float* %tmp15758, i64 1
+ %tmp15760 = getelementptr inbounds float* %tmp15759, i64 1
+ %tmp15761 = getelementptr inbounds float* %tmp15760, i64 1
+ %tmp15762 = getelementptr inbounds float* %tmp15761, i64 1
+ %tmp15763 = getelementptr inbounds float* %tmp15762, i64 1
+ %tmp15764 = getelementptr inbounds float* %tmp15763, i64 1
+ %tmp15765 = getelementptr inbounds float* %tmp15764, i64 1
+ %tmp15766 = getelementptr inbounds float* %tmp15765, i64 1
+ %tmp15767 = getelementptr inbounds float* %tmp15766, i64 1
+ %tmp15768 = getelementptr inbounds float* %tmp15767, i64 1
+ %tmp15769 = getelementptr inbounds float* %tmp15768, i64 1
+ %tmp15770 = getelementptr inbounds float* %tmp15769, i64 1
+ %tmp15771 = getelementptr inbounds float* %tmp15770, i64 1
+ %tmp15772 = getelementptr inbounds float* %tmp15771, i64 1
+ %tmp15773 = getelementptr inbounds float* %tmp15772, i64 1
+ %tmp15774 = getelementptr inbounds float* %tmp15773, i64 1
+ %tmp15775 = getelementptr inbounds float* %tmp15774, i64 1
+ %tmp15776 = getelementptr inbounds float* %tmp15775, i64 1
+ %tmp15777 = getelementptr inbounds float* %tmp15776, i64 1
+ %tmp15778 = getelementptr inbounds float* %tmp15777, i64 1
+ %tmp15779 = getelementptr inbounds float* %tmp15778, i64 1
+ %tmp15780 = getelementptr inbounds float* %tmp15779, i64 1
+ %tmp15781 = getelementptr inbounds float* %tmp15780, i64 1
+ %tmp15782 = getelementptr inbounds float* %tmp15781, i64 1
+ %tmp15783 = getelementptr inbounds float* %tmp15782, i64 1
+ %tmp15784 = getelementptr inbounds float* %tmp15783, i64 1
+ %tmp15785 = getelementptr inbounds float* %tmp15784, i64 1
+ %tmp15786 = getelementptr inbounds float* %tmp15785, i64 1
+ %tmp15787 = getelementptr inbounds float* %tmp15786, i64 1
+ %tmp15788 = getelementptr inbounds float* %tmp15787, i64 1
+ %tmp15789 = getelementptr inbounds float* %tmp15788, i64 1
+ %tmp15790 = getelementptr inbounds float* %tmp15789, i64 1
+ %tmp15791 = getelementptr inbounds float* %tmp15790, i64 1
+ %tmp15792 = getelementptr inbounds float* %tmp15791, i64 1
+ %tmp15793 = getelementptr inbounds float* %tmp15792, i64 1
+ %tmp15794 = getelementptr inbounds float* %tmp15793, i64 1
+ %tmp15795 = getelementptr inbounds float* %tmp15794, i64 1
+ %tmp15796 = getelementptr inbounds float* %tmp15795, i64 1
+ %tmp15797 = getelementptr inbounds float* %tmp15796, i64 1
+ %tmp15798 = getelementptr inbounds float* %tmp15797, i64 1
+ %tmp15799 = getelementptr inbounds float* %tmp15798, i64 1
+ %tmp15800 = getelementptr inbounds float* %tmp15799, i64 1
+ %tmp15801 = getelementptr inbounds float* %tmp15800, i64 1
+ %tmp15802 = getelementptr inbounds float* %tmp15801, i64 1
+ %tmp15803 = getelementptr inbounds float* %tmp15802, i64 1
+ %tmp15804 = getelementptr inbounds float* %tmp15803, i64 1
+ %tmp15805 = getelementptr inbounds float* %tmp15804, i64 1
+ %tmp15806 = getelementptr inbounds float* %tmp15805, i64 1
+ %tmp15807 = getelementptr inbounds float* %tmp15806, i64 1
+ %tmp15808 = getelementptr inbounds float* %tmp15807, i64 1
+ %tmp15809 = getelementptr inbounds float* %tmp15808, i64 1
+ %tmp15810 = getelementptr inbounds float* %tmp15809, i64 1
+ %tmp15811 = getelementptr inbounds float* %tmp15810, i64 1
+ %tmp15812 = getelementptr inbounds float* %tmp15811, i64 1
+ %tmp15813 = getelementptr inbounds float* %tmp15812, i64 1
+ %tmp15814 = getelementptr inbounds float* %tmp15813, i64 1
+ %tmp15815 = getelementptr inbounds float* %tmp15814, i64 1
+ %tmp15816 = getelementptr inbounds float* %tmp15815, i64 1
+ %tmp15817 = getelementptr inbounds float* %tmp15816, i64 1
+ %tmp15818 = getelementptr inbounds float* %tmp15817, i64 1
+ %tmp15819 = getelementptr inbounds float* %tmp15818, i64 1
+ %tmp15820 = getelementptr inbounds float* %tmp15819, i64 1
+ %tmp15821 = getelementptr inbounds float* %tmp15820, i64 1
+ %tmp15822 = getelementptr inbounds float* %tmp15821, i64 1
+ %tmp15823 = getelementptr inbounds float* %tmp15822, i64 1
+ %tmp15824 = getelementptr inbounds float* %tmp15823, i64 1
+ %tmp15825 = getelementptr inbounds float* %tmp15824, i64 1
+ %tmp15826 = getelementptr inbounds float* %tmp15825, i64 1
+ %tmp15827 = getelementptr inbounds float* %tmp15826, i64 1
+ %tmp15828 = getelementptr inbounds float* %tmp15827, i64 1
+ %tmp15829 = getelementptr inbounds float* %tmp15828, i64 1
+ %tmp15830 = getelementptr inbounds float* %tmp15829, i64 1
+ %tmp15831 = getelementptr inbounds float* %tmp15830, i64 1
+ %tmp15832 = getelementptr inbounds float* %tmp15831, i64 1
+ %tmp15833 = getelementptr inbounds float* %tmp15832, i64 1
+ %tmp15834 = getelementptr inbounds float* %tmp15833, i64 1
+ %tmp15835 = getelementptr inbounds float* %tmp15834, i64 1
+ %tmp15836 = getelementptr inbounds float* %tmp15835, i64 1
+ %tmp15837 = getelementptr inbounds float* %tmp15836, i64 1
+ %tmp15838 = getelementptr inbounds float* %tmp15837, i64 1
+ %tmp15839 = getelementptr inbounds float* %tmp15838, i64 1
+ %tmp15840 = getelementptr inbounds float* %tmp15839, i64 1
+ %tmp15841 = getelementptr inbounds float* %tmp15840, i64 1
+ %tmp15842 = getelementptr inbounds float* %tmp15841, i64 1
+ %tmp15843 = getelementptr inbounds float* %tmp15842, i64 1
+ %tmp15844 = getelementptr inbounds float* %tmp15843, i64 1
+ %tmp15845 = getelementptr inbounds float* %tmp15844, i64 1
+ %tmp15846 = getelementptr inbounds float* %tmp15845, i64 1
+ %tmp15847 = getelementptr inbounds float* %tmp15846, i64 1
+ %tmp15848 = getelementptr inbounds float* %tmp15847, i64 1
+ %tmp15849 = getelementptr inbounds float* %tmp15848, i64 1
+ %tmp15850 = getelementptr inbounds float* %tmp15849, i64 1
+ %tmp15851 = getelementptr inbounds float* %tmp15850, i64 1
+ %tmp15852 = getelementptr inbounds float* %tmp15851, i64 1
+ %tmp15853 = getelementptr inbounds float* %tmp15852, i64 1
+ %tmp15854 = getelementptr inbounds float* %tmp15853, i64 1
+ %tmp15855 = getelementptr inbounds float* %tmp15854, i64 1
+ %tmp15856 = getelementptr inbounds float* %tmp15855, i64 1
+ %tmp15857 = getelementptr inbounds float* %tmp15856, i64 1
+ %tmp15858 = getelementptr inbounds float* %tmp15857, i64 1
+ %tmp15859 = getelementptr inbounds float* %tmp15858, i64 1
+ %tmp15860 = getelementptr inbounds float* %tmp15859, i64 1
+ %tmp15861 = getelementptr inbounds float* %tmp15860, i64 1
+ %tmp15862 = getelementptr inbounds float* %tmp15861, i64 1
+ %tmp15863 = getelementptr inbounds float* %tmp15862, i64 1
+ %tmp15864 = getelementptr inbounds float* %tmp15863, i64 1
+ %tmp15865 = getelementptr inbounds float* %tmp15864, i64 1
+ %tmp15866 = getelementptr inbounds float* %tmp15865, i64 1
+ %tmp15867 = getelementptr inbounds float* %tmp15866, i64 1
+ %tmp15868 = getelementptr inbounds float* %tmp15867, i64 1
+ %tmp15869 = getelementptr inbounds float* %tmp15868, i64 1
+ %tmp15870 = getelementptr inbounds float* %tmp15869, i64 1
+ %tmp15871 = getelementptr inbounds float* %tmp15870, i64 1
+ %tmp15872 = getelementptr inbounds float* %tmp15871, i64 1
+ %tmp15873 = getelementptr inbounds float* %tmp15872, i64 1
+ %tmp15874 = getelementptr inbounds float* %tmp15873, i64 1
+ %tmp15875 = getelementptr inbounds float* %tmp15874, i64 1
+ %tmp15876 = getelementptr inbounds float* %tmp15875, i64 1
+ %tmp15877 = getelementptr inbounds float* %tmp15876, i64 1
+ %tmp15878 = getelementptr inbounds float* %tmp15877, i64 1
+ %tmp15879 = getelementptr inbounds float* %tmp15878, i64 1
+ %tmp15880 = getelementptr inbounds float* %tmp15879, i64 1
+ %tmp15881 = getelementptr inbounds float* %tmp15880, i64 1
+ %tmp15882 = getelementptr inbounds float* %tmp15881, i64 1
+ %tmp15883 = getelementptr inbounds float* %tmp15882, i64 1
+ %tmp15884 = getelementptr inbounds float* %tmp15883, i64 1
+ %tmp15885 = getelementptr inbounds float* %tmp15884, i64 1
+ %tmp15886 = getelementptr inbounds float* %tmp15885, i64 1
+ %tmp15887 = getelementptr inbounds float* %tmp15886, i64 1
+ %tmp15888 = getelementptr inbounds float* %tmp15887, i64 1
+ %tmp15889 = getelementptr inbounds float* %tmp15888, i64 1
+ %tmp15890 = getelementptr inbounds float* %tmp15889, i64 1
+ %tmp15891 = getelementptr inbounds float* %tmp15890, i64 1
+ %tmp15892 = getelementptr inbounds float* %tmp15891, i64 1
+ %tmp15893 = getelementptr inbounds float* %tmp15892, i64 1
+ %tmp15894 = getelementptr inbounds float* %tmp15893, i64 1
+ %tmp15895 = getelementptr inbounds float* %tmp15894, i64 1
+ %tmp15896 = getelementptr inbounds float* %tmp15895, i64 1
+ %tmp15897 = getelementptr inbounds float* %tmp15896, i64 1
+ %tmp15898 = getelementptr inbounds float* %tmp15897, i64 1
+ %tmp15899 = getelementptr inbounds float* %tmp15898, i64 1
+ %tmp15900 = getelementptr inbounds float* %tmp15899, i64 1
+ %tmp15901 = getelementptr inbounds float* %tmp15900, i64 1
+ %tmp15902 = getelementptr inbounds float* %tmp15901, i64 1
+ %tmp15903 = getelementptr inbounds float* %tmp15902, i64 1
+ %tmp15904 = getelementptr inbounds float* %tmp15903, i64 1
+ %tmp15905 = getelementptr inbounds float* %tmp15904, i64 1
+ %tmp15906 = getelementptr inbounds float* %tmp15905, i64 1
+ %tmp15907 = getelementptr inbounds float* %tmp15906, i64 1
+ %tmp15908 = getelementptr inbounds float* %tmp15907, i64 1
+ %tmp15909 = getelementptr inbounds float* %tmp15908, i64 1
+ %tmp15910 = getelementptr inbounds float* %tmp15909, i64 1
+ %tmp15911 = getelementptr inbounds float* %tmp15910, i64 1
+ %tmp15912 = getelementptr inbounds float* %tmp15911, i64 1
+ %tmp15913 = getelementptr inbounds float* %tmp15912, i64 1
+ %tmp15914 = getelementptr inbounds float* %tmp15913, i64 1
+ %tmp15915 = getelementptr inbounds float* %tmp15914, i64 1
+ %tmp15916 = getelementptr inbounds float* %tmp15915, i64 1
+ %tmp15917 = getelementptr inbounds float* %tmp15916, i64 1
+ %tmp15918 = getelementptr inbounds float* %tmp15917, i64 1
+ %tmp15919 = getelementptr inbounds float* %tmp15918, i64 1
+ %tmp15920 = getelementptr inbounds float* %tmp15919, i64 1
+ %tmp15921 = getelementptr inbounds float* %tmp15920, i64 1
+ %tmp15922 = getelementptr inbounds float* %tmp15921, i64 1
+ %tmp15923 = getelementptr inbounds float* %tmp15922, i64 1
+ %tmp15924 = getelementptr inbounds float* %tmp15923, i64 1
+ %tmp15925 = getelementptr inbounds float* %tmp15924, i64 1
+ %tmp15926 = getelementptr inbounds float* %tmp15925, i64 1
+ %tmp15927 = getelementptr inbounds float* %tmp15926, i64 1
+ %tmp15928 = getelementptr inbounds float* %tmp15927, i64 1
+ %tmp15929 = getelementptr inbounds float* %tmp15928, i64 1
+ %tmp15930 = getelementptr inbounds float* %tmp15929, i64 1
+ %tmp15931 = getelementptr inbounds float* %tmp15930, i64 1
+ %tmp15932 = getelementptr inbounds float* %tmp15931, i64 1
+ %tmp15933 = getelementptr inbounds float* %tmp15932, i64 1
+ %tmp15934 = getelementptr inbounds float* %tmp15933, i64 1
+ %tmp15935 = getelementptr inbounds float* %tmp15934, i64 1
+ %tmp15936 = getelementptr inbounds float* %tmp15935, i64 1
+ %tmp15937 = getelementptr inbounds float* %tmp15936, i64 1
+ %tmp15938 = getelementptr inbounds float* %tmp15937, i64 1
+ %tmp15939 = getelementptr inbounds float* %tmp15938, i64 1
+ %tmp15940 = getelementptr inbounds float* %tmp15939, i64 1
+ %tmp15941 = getelementptr inbounds float* %tmp15940, i64 1
+ %tmp15942 = getelementptr inbounds float* %tmp15941, i64 1
+ %tmp15943 = getelementptr inbounds float* %tmp15942, i64 1
+ %tmp15944 = getelementptr inbounds float* %tmp15943, i64 1
+ %tmp15945 = getelementptr inbounds float* %tmp15944, i64 1
+ %tmp15946 = getelementptr inbounds float* %tmp15945, i64 1
+ %tmp15947 = getelementptr inbounds float* %tmp15946, i64 1
+ %tmp15948 = getelementptr inbounds float* %tmp15947, i64 1
+ %tmp15949 = getelementptr inbounds float* %tmp15948, i64 1
+ %tmp15950 = getelementptr inbounds float* %tmp15949, i64 1
+ %tmp15951 = getelementptr inbounds float* %tmp15950, i64 1
+ %tmp15952 = getelementptr inbounds float* %tmp15951, i64 1
+ %tmp15953 = getelementptr inbounds float* %tmp15952, i64 1
+ %tmp15954 = getelementptr inbounds float* %tmp15953, i64 1
+ %tmp15955 = getelementptr inbounds float* %tmp15954, i64 1
+ %tmp15956 = getelementptr inbounds float* %tmp15955, i64 1
+ %tmp15957 = getelementptr inbounds float* %tmp15956, i64 1
+ %tmp15958 = getelementptr inbounds float* %tmp15957, i64 1
+ %tmp15959 = getelementptr inbounds float* %tmp15958, i64 1
+ %tmp15960 = getelementptr inbounds float* %tmp15959, i64 1
+ %tmp15961 = getelementptr inbounds float* %tmp15960, i64 1
+ %tmp15962 = getelementptr inbounds float* %tmp15961, i64 1
+ %tmp15963 = getelementptr inbounds float* %tmp15962, i64 1
+ %tmp15964 = getelementptr inbounds float* %tmp15963, i64 1
+ %tmp15965 = getelementptr inbounds float* %tmp15964, i64 1
+ %tmp15966 = getelementptr inbounds float* %tmp15965, i64 1
+ %tmp15967 = getelementptr inbounds float* %tmp15966, i64 1
+ %tmp15968 = getelementptr inbounds float* %tmp15967, i64 1
+ %tmp15969 = getelementptr inbounds float* %tmp15968, i64 1
+ %tmp15970 = getelementptr inbounds float* %tmp15969, i64 1
+ %tmp15971 = getelementptr inbounds float* %tmp15970, i64 1
+ %tmp15972 = getelementptr inbounds float* %tmp15971, i64 1
+ %tmp15973 = getelementptr inbounds float* %tmp15972, i64 1
+ %tmp15974 = getelementptr inbounds float* %tmp15973, i64 1
+ %tmp15975 = getelementptr inbounds float* %tmp15974, i64 1
+ %tmp15976 = getelementptr inbounds float* %tmp15975, i64 1
+ %tmp15977 = getelementptr inbounds float* %tmp15976, i64 1
+ %tmp15978 = getelementptr inbounds float* %tmp15977, i64 1
+ %tmp15979 = getelementptr inbounds float* %tmp15978, i64 1
+ %tmp15980 = getelementptr inbounds float* %tmp15979, i64 1
+ %tmp15981 = getelementptr inbounds float* %tmp15980, i64 1
+ %tmp15982 = getelementptr inbounds float* %tmp15981, i64 1
+ %tmp15983 = getelementptr inbounds float* %tmp15982, i64 1
+ %tmp15984 = getelementptr inbounds float* %tmp15983, i64 1
+ %tmp15985 = getelementptr inbounds float* %tmp15984, i64 1
+ %tmp15986 = getelementptr inbounds float* %tmp15985, i64 1
+ %tmp15987 = getelementptr inbounds float* %tmp15986, i64 1
+ %tmp15988 = getelementptr inbounds float* %tmp15987, i64 1
+ %tmp15989 = getelementptr inbounds float* %tmp15988, i64 1
+ %tmp15990 = getelementptr inbounds float* %tmp15989, i64 1
+ %tmp15991 = getelementptr inbounds float* %tmp15990, i64 1
+ %tmp15992 = getelementptr inbounds float* %tmp15991, i64 1
+ %tmp15993 = getelementptr inbounds float* %tmp15992, i64 1
+ %tmp15994 = getelementptr inbounds float* %tmp15993, i64 1
+ %tmp15995 = getelementptr inbounds float* %tmp15994, i64 1
+ %tmp15996 = getelementptr inbounds float* %tmp15995, i64 1
+ %tmp15997 = getelementptr inbounds float* %tmp15996, i64 1
+ %tmp15998 = getelementptr inbounds float* %tmp15997, i64 1
+ %tmp15999 = getelementptr inbounds float* %tmp15998, i64 1
+ %tmp16000 = getelementptr inbounds float* %tmp15999, i64 1
+ %tmp16001 = getelementptr inbounds float* %tmp16000, i64 1
+ %tmp16002 = getelementptr inbounds float* %tmp16001, i64 1
+ %tmp16003 = getelementptr inbounds float* %tmp16002, i64 1
+ %tmp16004 = getelementptr inbounds float* %tmp16003, i64 1
+ %tmp16005 = getelementptr inbounds float* %tmp16004, i64 1
+ %tmp16006 = getelementptr inbounds float* %tmp16005, i64 1
+ %tmp16007 = getelementptr inbounds float* %tmp16006, i64 1
+ %tmp16008 = getelementptr inbounds float* %tmp16007, i64 1
+ %tmp16009 = getelementptr inbounds float* %tmp16008, i64 1
+ %tmp16010 = getelementptr inbounds float* %tmp16009, i64 1
+ %tmp16011 = getelementptr inbounds float* %tmp16010, i64 1
+ %tmp16012 = getelementptr inbounds float* %tmp16011, i64 1
+ %tmp16013 = getelementptr inbounds float* %tmp16012, i64 1
+ %tmp16014 = getelementptr inbounds float* %tmp16013, i64 1
+ %tmp16015 = getelementptr inbounds float* %tmp16014, i64 1
+ %tmp16016 = getelementptr inbounds float* %tmp16015, i64 1
+ %tmp16017 = getelementptr inbounds float* %tmp16016, i64 1
+ %tmp16018 = getelementptr inbounds float* %tmp16017, i64 1
+ %tmp16019 = getelementptr inbounds float* %tmp16018, i64 1
+ %tmp16020 = getelementptr inbounds float* %tmp16019, i64 1
+ %tmp16021 = getelementptr inbounds float* %tmp16020, i64 1
+ %tmp16022 = getelementptr inbounds float* %tmp16021, i64 1
+ %tmp16023 = getelementptr inbounds float* %tmp16022, i64 1
+ %tmp16024 = getelementptr inbounds float* %tmp16023, i64 1
+ %tmp16025 = getelementptr inbounds float* %tmp16024, i64 1
+ %tmp16026 = getelementptr inbounds float* %tmp16025, i64 1
+ %tmp16027 = getelementptr inbounds float* %tmp16026, i64 1
+ %tmp16028 = getelementptr inbounds float* %tmp16027, i64 1
+ %tmp16029 = getelementptr inbounds float* %tmp16028, i64 1
+ %tmp16030 = getelementptr inbounds float* %tmp16029, i64 1
+ %tmp16031 = getelementptr inbounds float* %tmp16030, i64 1
+ %tmp16032 = getelementptr inbounds float* %tmp16031, i64 1
+ %tmp16033 = getelementptr inbounds float* %tmp16032, i64 1
+ %tmp16034 = getelementptr inbounds float* %tmp16033, i64 1
+ %tmp16035 = getelementptr inbounds float* %tmp16034, i64 1
+ %tmp16036 = getelementptr inbounds float* %tmp16035, i64 1
+ %tmp16037 = getelementptr inbounds float* %tmp16036, i64 1
+ %tmp16038 = getelementptr inbounds float* %tmp16037, i64 1
+ %tmp16039 = getelementptr inbounds float* %tmp16038, i64 1
+ %tmp16040 = getelementptr inbounds float* %tmp16039, i64 1
+ %tmp16041 = getelementptr inbounds float* %tmp16040, i64 1
+ %tmp16042 = getelementptr inbounds float* %tmp16041, i64 1
+ %tmp16043 = getelementptr inbounds float* %tmp16042, i64 1
+ %tmp16044 = getelementptr inbounds float* %tmp16043, i64 1
+ %tmp16045 = getelementptr inbounds float* %tmp16044, i64 1
+ %tmp16046 = getelementptr inbounds float* %tmp16045, i64 1
+ %tmp16047 = getelementptr inbounds float* %tmp16046, i64 1
+ %tmp16048 = getelementptr inbounds float* %tmp16047, i64 1
+ %tmp16049 = getelementptr inbounds float* %tmp16048, i64 1
+ %tmp16050 = getelementptr inbounds float* %tmp16049, i64 1
+ %tmp16051 = getelementptr inbounds float* %tmp16050, i64 1
+ %tmp16052 = getelementptr inbounds float* %tmp16051, i64 1
+ %tmp16053 = getelementptr inbounds float* %tmp16052, i64 1
+ %tmp16054 = getelementptr inbounds float* %tmp16053, i64 1
+ %tmp16055 = getelementptr inbounds float* %tmp16054, i64 1
+ %tmp16056 = getelementptr inbounds float* %tmp16055, i64 1
+ %tmp16057 = getelementptr inbounds float* %tmp16056, i64 1
+ %tmp16058 = getelementptr inbounds float* %tmp16057, i64 1
+ %tmp16059 = getelementptr inbounds float* %tmp16058, i64 1
+ %tmp16060 = getelementptr inbounds float* %tmp16059, i64 1
+ %tmp16061 = getelementptr inbounds float* %tmp16060, i64 1
+ %tmp16062 = getelementptr inbounds float* %tmp16061, i64 1
+ %tmp16063 = getelementptr inbounds float* %tmp16062, i64 1
+ %tmp16064 = getelementptr inbounds float* %tmp16063, i64 1
+ %tmp16065 = getelementptr inbounds float* %tmp16064, i64 1
+ %tmp16066 = getelementptr inbounds float* %tmp16065, i64 1
+ %tmp16067 = getelementptr inbounds float* %tmp16066, i64 1
+ %tmp16068 = getelementptr inbounds float* %tmp16067, i64 1
+ %tmp16069 = getelementptr inbounds float* %tmp16068, i64 1
+ %tmp16070 = getelementptr inbounds float* %tmp16069, i64 1
+ %tmp16071 = getelementptr inbounds float* %tmp16070, i64 1
+ %tmp16072 = getelementptr inbounds float* %tmp16071, i64 1
+ %tmp16073 = getelementptr inbounds float* %tmp16072, i64 1
+ %tmp16074 = getelementptr inbounds float* %tmp16073, i64 1
+ %tmp16075 = getelementptr inbounds float* %tmp16074, i64 1
+ %tmp16076 = getelementptr inbounds float* %tmp16075, i64 1
+ %tmp16077 = getelementptr inbounds float* %tmp16076, i64 1
+ %tmp16078 = getelementptr inbounds float* %tmp16077, i64 1
+ %tmp16079 = getelementptr inbounds float* %tmp16078, i64 1
+ %tmp16080 = getelementptr inbounds float* %tmp16079, i64 1
+ %tmp16081 = getelementptr inbounds float* %tmp16080, i64 1
+ %tmp16082 = getelementptr inbounds float* %tmp16081, i64 1
+ %tmp16083 = getelementptr inbounds float* %tmp16082, i64 1
+ %tmp16084 = getelementptr inbounds float* %tmp16083, i64 1
+ %tmp16085 = getelementptr inbounds float* %tmp16084, i64 1
+ %tmp16086 = getelementptr inbounds float* %tmp16085, i64 1
+ %tmp16087 = getelementptr inbounds float* %tmp16086, i64 1
+ %tmp16088 = getelementptr inbounds float* %tmp16087, i64 1
+ %tmp16089 = getelementptr inbounds float* %tmp16088, i64 1
+ %tmp16090 = getelementptr inbounds float* %tmp16089, i64 1
+ %tmp16091 = getelementptr inbounds float* %tmp16090, i64 1
+ %tmp16092 = getelementptr inbounds float* %tmp16091, i64 1
+ %tmp16093 = getelementptr inbounds float* %tmp16092, i64 1
+ %tmp16094 = getelementptr inbounds float* %tmp16093, i64 1
+ %tmp16095 = getelementptr inbounds float* %tmp16094, i64 1
+ %tmp16096 = getelementptr inbounds float* %tmp16095, i64 1
+ %tmp16097 = getelementptr inbounds float* %tmp16096, i64 1
+ %tmp16098 = getelementptr inbounds float* %tmp16097, i64 1
+ %tmp16099 = getelementptr inbounds float* %tmp16098, i64 1
+ %tmp16100 = getelementptr inbounds float* %tmp16099, i64 1
+ %tmp16101 = getelementptr inbounds float* %tmp16100, i64 1
+ %tmp16102 = getelementptr inbounds float* %tmp16101, i64 1
+ %tmp16103 = getelementptr inbounds float* %tmp16102, i64 1
+ %tmp16104 = getelementptr inbounds float* %tmp16103, i64 1
+ %tmp16105 = getelementptr inbounds float* %tmp16104, i64 1
+ %tmp16106 = getelementptr inbounds float* %tmp16105, i64 1
+ %tmp16107 = getelementptr inbounds float* %tmp16106, i64 1
+ %tmp16108 = getelementptr inbounds float* %tmp16107, i64 1
+ %tmp16109 = getelementptr inbounds float* %tmp16108, i64 1
+ %tmp16110 = getelementptr inbounds float* %tmp16109, i64 1
+ %tmp16111 = getelementptr inbounds float* %tmp16110, i64 1
+ %tmp16112 = getelementptr inbounds float* %tmp16111, i64 1
+ %tmp16113 = getelementptr inbounds float* %tmp16112, i64 1
+ %tmp16114 = getelementptr inbounds float* %tmp16113, i64 1
+ %tmp16115 = getelementptr inbounds float* %tmp16114, i64 1
+ %tmp16116 = getelementptr inbounds float* %tmp16115, i64 1
+ %tmp16117 = getelementptr inbounds float* %tmp16116, i64 1
+ %tmp16118 = getelementptr inbounds float* %tmp16117, i64 1
+ %tmp16119 = getelementptr inbounds float* %tmp16118, i64 1
+ %tmp16120 = getelementptr inbounds float* %tmp16119, i64 1
+ %tmp16121 = getelementptr inbounds float* %tmp16120, i64 1
+ %tmp16122 = getelementptr inbounds float* %tmp16121, i64 1
+ %tmp16123 = getelementptr inbounds float* %tmp16122, i64 1
+ %tmp16124 = getelementptr inbounds float* %tmp16123, i64 1
+ %tmp16125 = getelementptr inbounds float* %tmp16124, i64 1
+ %tmp16126 = getelementptr inbounds float* %tmp16125, i64 1
+ %tmp16127 = getelementptr inbounds float* %tmp16126, i64 1
+ %tmp16128 = getelementptr inbounds float* %tmp16127, i64 1
+ %tmp16129 = getelementptr inbounds float* %tmp16128, i64 1
+ %tmp16130 = getelementptr inbounds float* %tmp16129, i64 1
+ %tmp16131 = getelementptr inbounds float* %tmp16130, i64 1
+ %tmp16132 = getelementptr inbounds float* %tmp16131, i64 1
+ %tmp16133 = getelementptr inbounds float* %tmp16132, i64 1
+ %tmp16134 = getelementptr inbounds float* %tmp16133, i64 1
+ %tmp16135 = getelementptr inbounds float* %tmp16134, i64 1
+ %tmp16136 = getelementptr inbounds float* %tmp16135, i64 1
+ %tmp16137 = getelementptr inbounds float* %tmp16136, i64 1
+ %tmp16138 = getelementptr inbounds float* %tmp16137, i64 1
+ %tmp16139 = getelementptr inbounds float* %tmp16138, i64 1
+ %tmp16140 = getelementptr inbounds float* %tmp16139, i64 1
+ %tmp16141 = getelementptr inbounds float* %tmp16140, i64 1
+ %tmp16142 = getelementptr inbounds float* %tmp16141, i64 1
+ %tmp16143 = getelementptr inbounds float* %tmp16142, i64 1
+ %tmp16144 = getelementptr inbounds float* %tmp16143, i64 1
+ %tmp16145 = getelementptr inbounds float* %tmp16144, i64 1
+ %tmp16146 = getelementptr inbounds float* %tmp16145, i64 1
+ %tmp16147 = getelementptr inbounds float* %tmp16146, i64 1
+ %tmp16148 = getelementptr inbounds float* %tmp16147, i64 1
+ %tmp16149 = getelementptr inbounds float* %tmp16148, i64 1
+ %tmp16150 = getelementptr inbounds float* %tmp16149, i64 1
+ %tmp16151 = getelementptr inbounds float* %tmp16150, i64 1
+ %tmp16152 = getelementptr inbounds float* %tmp16151, i64 1
+ %tmp16153 = getelementptr inbounds float* %tmp16152, i64 1
+ %tmp16154 = getelementptr inbounds float* %tmp16153, i64 1
+ %tmp16155 = getelementptr inbounds float* %tmp16154, i64 1
+ %tmp16156 = getelementptr inbounds float* %tmp16155, i64 1
+ %tmp16157 = getelementptr inbounds float* %tmp16156, i64 1
+ %tmp16158 = getelementptr inbounds float* %tmp16157, i64 1
+ %tmp16159 = getelementptr inbounds float* %tmp16158, i64 1
+ %tmp16160 = getelementptr inbounds float* %tmp16159, i64 1
+ %tmp16161 = getelementptr inbounds float* %tmp16160, i64 1
+ %tmp16162 = getelementptr inbounds float* %tmp16161, i64 1
+ %tmp16163 = getelementptr inbounds float* %tmp16162, i64 1
+ %tmp16164 = getelementptr inbounds float* %tmp16163, i64 1
+ %tmp16165 = getelementptr inbounds float* %tmp16164, i64 1
+ %tmp16166 = getelementptr inbounds float* %tmp16165, i64 1
+ %tmp16167 = getelementptr inbounds float* %tmp16166, i64 1
+ %tmp16168 = getelementptr inbounds float* %tmp16167, i64 1
+ %tmp16169 = getelementptr inbounds float* %tmp16168, i64 1
+ %tmp16170 = getelementptr inbounds float* %tmp16169, i64 1
+ %tmp16171 = getelementptr inbounds float* %tmp16170, i64 1
+ %tmp16172 = getelementptr inbounds float* %tmp16171, i64 1
+ %tmp16173 = getelementptr inbounds float* %tmp16172, i64 1
+ %tmp16174 = getelementptr inbounds float* %tmp16173, i64 1
+ %tmp16175 = getelementptr inbounds float* %tmp16174, i64 1
+ %tmp16176 = getelementptr inbounds float* %tmp16175, i64 1
+ %tmp16177 = getelementptr inbounds float* %tmp16176, i64 1
+ %tmp16178 = getelementptr inbounds float* %tmp16177, i64 1
+ %tmp16179 = getelementptr inbounds float* %tmp16178, i64 1
+ %tmp16180 = getelementptr inbounds float* %tmp16179, i64 1
+ %tmp16181 = getelementptr inbounds float* %tmp16180, i64 1
+ %tmp16182 = getelementptr inbounds float* %tmp16181, i64 1
+ %tmp16183 = getelementptr inbounds float* %tmp16182, i64 1
+ %tmp16184 = getelementptr inbounds float* %tmp16183, i64 1
+ %tmp16185 = getelementptr inbounds float* %tmp16184, i64 1
+ %tmp16186 = getelementptr inbounds float* %tmp16185, i64 1
+ %tmp16187 = getelementptr inbounds float* %tmp16186, i64 1
+ %tmp16188 = getelementptr inbounds float* %tmp16187, i64 1
+ %tmp16189 = getelementptr inbounds float* %tmp16188, i64 1
+ %tmp16190 = getelementptr inbounds float* %tmp16189, i64 1
+ %tmp16191 = getelementptr inbounds float* %tmp16190, i64 1
+ %tmp16192 = getelementptr inbounds float* %tmp16191, i64 1
+ %tmp16193 = getelementptr inbounds float* %tmp16192, i64 1
+ %tmp16194 = getelementptr inbounds float* %tmp16193, i64 1
+ %tmp16195 = getelementptr inbounds float* %tmp16194, i64 1
+ %tmp16196 = getelementptr inbounds float* %tmp16195, i64 1
+ %tmp16197 = getelementptr inbounds float* %tmp16196, i64 1
+ %tmp16198 = getelementptr inbounds float* %tmp16197, i64 1
+ %tmp16199 = getelementptr inbounds float* %tmp16198, i64 1
+ %tmp16200 = getelementptr inbounds float* %tmp16199, i64 1
+ %tmp16201 = getelementptr inbounds float* %tmp16200, i64 1
+ %tmp16202 = getelementptr inbounds float* %tmp16201, i64 1
+ %tmp16203 = getelementptr inbounds float* %tmp16202, i64 1
+ %tmp16204 = getelementptr inbounds float* %tmp16203, i64 1
+ %tmp16205 = getelementptr inbounds float* %tmp16204, i64 1
+ %tmp16206 = getelementptr inbounds float* %tmp16205, i64 1
+ %tmp16207 = getelementptr inbounds float* %tmp16206, i64 1
+ %tmp16208 = getelementptr inbounds float* %tmp16207, i64 1
+ %tmp16209 = getelementptr inbounds float* %tmp16208, i64 1
+ %tmp16210 = getelementptr inbounds float* %tmp16209, i64 1
+ %tmp16211 = getelementptr inbounds float* %tmp16210, i64 1
+ %tmp16212 = getelementptr inbounds float* %tmp16211, i64 1
+ %tmp16213 = getelementptr inbounds float* %tmp16212, i64 1
+ %tmp16214 = getelementptr inbounds float* %tmp16213, i64 1
+ %tmp16215 = getelementptr inbounds float* %tmp16214, i64 1
+ %tmp16216 = getelementptr inbounds float* %tmp16215, i64 1
+ %tmp16217 = getelementptr inbounds float* %tmp16216, i64 1
+ %tmp16218 = getelementptr inbounds float* %tmp16217, i64 1
+ %tmp16219 = getelementptr inbounds float* %tmp16218, i64 1
+ %tmp16220 = getelementptr inbounds float* %tmp16219, i64 1
+ %tmp16221 = getelementptr inbounds float* %tmp16220, i64 1
+ %tmp16222 = getelementptr inbounds float* %tmp16221, i64 1
+ %tmp16223 = getelementptr inbounds float* %tmp16222, i64 1
+ %tmp16224 = getelementptr inbounds float* %tmp16223, i64 1
+ %tmp16225 = getelementptr inbounds float* %tmp16224, i64 1
+ %tmp16226 = getelementptr inbounds float* %tmp16225, i64 1
+ %tmp16227 = getelementptr inbounds float* %tmp16226, i64 1
+ %tmp16228 = getelementptr inbounds float* %tmp16227, i64 1
+ %tmp16229 = getelementptr inbounds float* %tmp16228, i64 1
+ %tmp16230 = getelementptr inbounds float* %tmp16229, i64 1
+ %tmp16231 = getelementptr inbounds float* %tmp16230, i64 1
+ %tmp16232 = getelementptr inbounds float* %tmp16231, i64 1
+ %tmp16233 = getelementptr inbounds float* %tmp16232, i64 1
+ %tmp16234 = getelementptr inbounds float* %tmp16233, i64 1
+ %tmp16235 = getelementptr inbounds float* %tmp16234, i64 1
+ %tmp16236 = getelementptr inbounds float* %tmp16235, i64 1
+ %tmp16237 = getelementptr inbounds float* %tmp16236, i64 1
+ %tmp16238 = getelementptr inbounds float* %tmp16237, i64 1
+ %tmp16239 = getelementptr inbounds float* %tmp16238, i64 1
+ %tmp16240 = getelementptr inbounds float* %tmp16239, i64 1
+ %tmp16241 = getelementptr inbounds float* %tmp16240, i64 1
+ %tmp16242 = getelementptr inbounds float* %tmp16241, i64 1
+ %tmp16243 = getelementptr inbounds float* %tmp16242, i64 1
+ %tmp16244 = getelementptr inbounds float* %tmp16243, i64 1
+ %tmp16245 = getelementptr inbounds float* %tmp16244, i64 1
+ %tmp16246 = getelementptr inbounds float* %tmp16245, i64 1
+ %tmp16247 = getelementptr inbounds float* %tmp16246, i64 1
+ %tmp16248 = getelementptr inbounds float* %tmp16247, i64 1
+ %tmp16249 = getelementptr inbounds float* %tmp16248, i64 1
+ %tmp16250 = getelementptr inbounds float* %tmp16249, i64 1
+ %tmp16251 = getelementptr inbounds float* %tmp16250, i64 1
+ %tmp16252 = getelementptr inbounds float* %tmp16251, i64 1
+ %tmp16253 = getelementptr inbounds float* %tmp16252, i64 1
+ %tmp16254 = getelementptr inbounds float* %tmp16253, i64 1
+ %tmp16255 = getelementptr inbounds float* %tmp16254, i64 1
+ %tmp16256 = getelementptr inbounds float* %tmp16255, i64 1
+ %tmp16257 = getelementptr inbounds float* %tmp16256, i64 1
+ %tmp16258 = getelementptr inbounds float* %tmp16257, i64 1
+ %tmp16259 = getelementptr inbounds float* %tmp16258, i64 1
+ %tmp16260 = getelementptr inbounds float* %tmp16259, i64 1
+ %tmp16261 = getelementptr inbounds float* %tmp16260, i64 1
+ %tmp16262 = getelementptr inbounds float* %tmp16261, i64 1
+ %tmp16263 = getelementptr inbounds float* %tmp16262, i64 1
+ %tmp16264 = getelementptr inbounds float* %tmp16263, i64 1
+ %tmp16265 = getelementptr inbounds float* %tmp16264, i64 1
+ %tmp16266 = getelementptr inbounds float* %tmp16265, i64 1
+ %tmp16267 = getelementptr inbounds float* %tmp16266, i64 1
+ %tmp16268 = getelementptr inbounds float* %tmp16267, i64 1
+ %tmp16269 = getelementptr inbounds float* %tmp16268, i64 1
+ %tmp16270 = getelementptr inbounds float* %tmp16269, i64 1
+ %tmp16271 = getelementptr inbounds float* %tmp16270, i64 1
+ %tmp16272 = getelementptr inbounds float* %tmp16271, i64 1
+ %tmp16273 = getelementptr inbounds float* %tmp16272, i64 1
+ %tmp16274 = getelementptr inbounds float* %tmp16273, i64 1
+ %tmp16275 = getelementptr inbounds float* %tmp16274, i64 1
+ %tmp16276 = getelementptr inbounds float* %tmp16275, i64 1
+ %tmp16277 = getelementptr inbounds float* %tmp16276, i64 1
+ %tmp16278 = getelementptr inbounds float* %tmp16277, i64 1
+ %tmp16279 = getelementptr inbounds float* %tmp16278, i64 1
+ %tmp16280 = getelementptr inbounds float* %tmp16279, i64 1
+ %tmp16281 = getelementptr inbounds float* %tmp16280, i64 1
+ %tmp16282 = getelementptr inbounds float* %tmp16281, i64 1
+ %tmp16283 = getelementptr inbounds float* %tmp16282, i64 1
+ %tmp16284 = getelementptr inbounds float* %tmp16283, i64 1
+ %tmp16285 = getelementptr inbounds float* %tmp16284, i64 1
+ %tmp16286 = getelementptr inbounds float* %tmp16285, i64 1
+ %tmp16287 = getelementptr inbounds float* %tmp16286, i64 1
+ %tmp16288 = getelementptr inbounds float* %tmp16287, i64 1
+ %tmp16289 = getelementptr inbounds float* %tmp16288, i64 1
+ %tmp16290 = getelementptr inbounds float* %tmp16289, i64 1
+ %tmp16291 = getelementptr inbounds float* %tmp16290, i64 1
+ %tmp16292 = getelementptr inbounds float* %tmp16291, i64 1
+ %tmp16293 = getelementptr inbounds float* %tmp16292, i64 1
+ %tmp16294 = getelementptr inbounds float* %tmp16293, i64 1
+ %tmp16295 = getelementptr inbounds float* %tmp16294, i64 1
+ %tmp16296 = getelementptr inbounds float* %tmp16295, i64 1
+ %tmp16297 = getelementptr inbounds float* %tmp16296, i64 1
+ %tmp16298 = getelementptr inbounds float* %tmp16297, i64 1
+ %tmp16299 = getelementptr inbounds float* %tmp16298, i64 1
+ %tmp16300 = getelementptr inbounds float* %tmp16299, i64 1
+ %tmp16301 = getelementptr inbounds float* %tmp16300, i64 1
+ %tmp16302 = getelementptr inbounds float* %tmp16301, i64 1
+ %tmp16303 = getelementptr inbounds float* %tmp16302, i64 1
+ %tmp16304 = getelementptr inbounds float* %tmp16303, i64 1
+ %tmp16305 = getelementptr inbounds float* %tmp16304, i64 1
+ %tmp16306 = getelementptr inbounds float* %tmp16305, i64 1
+ %tmp16307 = getelementptr inbounds float* %tmp16306, i64 1
+ %tmp16308 = getelementptr inbounds float* %tmp16307, i64 1
+ %tmp16309 = getelementptr inbounds float* %tmp16308, i64 1
+ %tmp16310 = getelementptr inbounds float* %tmp16309, i64 1
+ %tmp16311 = getelementptr inbounds float* %tmp16310, i64 1
+ %tmp16312 = getelementptr inbounds float* %tmp16311, i64 1
+ %tmp16313 = getelementptr inbounds float* %tmp16312, i64 1
+ %tmp16314 = getelementptr inbounds float* %tmp16313, i64 1
+ %tmp16315 = getelementptr inbounds float* %tmp16314, i64 1
+ %tmp16316 = getelementptr inbounds float* %tmp16315, i64 1
+ %tmp16317 = getelementptr inbounds float* %tmp16316, i64 1
+ %tmp16318 = getelementptr inbounds float* %tmp16317, i64 1
+ %tmp16319 = getelementptr inbounds float* %tmp16318, i64 1
+ %tmp16320 = getelementptr inbounds float* %tmp16319, i64 1
+ %tmp16321 = getelementptr inbounds float* %tmp16320, i64 1
+ %tmp16322 = getelementptr inbounds float* %tmp16321, i64 1
+ %tmp16323 = getelementptr inbounds float* %tmp16322, i64 1
+ %tmp16324 = getelementptr inbounds float* %tmp16323, i64 1
+ %tmp16325 = getelementptr inbounds float* %tmp16324, i64 1
+ %tmp16326 = getelementptr inbounds float* %tmp16325, i64 1
+ %tmp16327 = getelementptr inbounds float* %tmp16326, i64 1
+ %tmp16328 = getelementptr inbounds float* %tmp16327, i64 1
+ %tmp16329 = getelementptr inbounds float* %tmp16328, i64 1
+ %tmp16330 = getelementptr inbounds float* %tmp16329, i64 1
+ %tmp16331 = getelementptr inbounds float* %tmp16330, i64 1
+ %tmp16332 = getelementptr inbounds float* %tmp16331, i64 1
+ %tmp16333 = getelementptr inbounds float* %tmp16332, i64 1
+ %tmp16334 = getelementptr inbounds float* %tmp16333, i64 1
+ %tmp16335 = getelementptr inbounds float* %tmp16334, i64 1
+ %tmp16336 = getelementptr inbounds float* %tmp16335, i64 1
+ %tmp16337 = getelementptr inbounds float* %tmp16336, i64 1
+ %tmp16338 = getelementptr inbounds float* %tmp16337, i64 1
+ %tmp16339 = getelementptr inbounds float* %tmp16338, i64 1
+ %tmp16340 = getelementptr inbounds float* %tmp16339, i64 1
+ %tmp16341 = getelementptr inbounds float* %tmp16340, i64 1
+ %tmp16342 = getelementptr inbounds float* %tmp16341, i64 1
+ %tmp16343 = getelementptr inbounds float* %tmp16342, i64 1
+ %tmp16344 = getelementptr inbounds float* %tmp16343, i64 1
+ %tmp16345 = getelementptr inbounds float* %tmp16344, i64 1
+ %tmp16346 = getelementptr inbounds float* %tmp16345, i64 1
+ %tmp16347 = getelementptr inbounds float* %tmp16346, i64 1
+ %tmp16348 = getelementptr inbounds float* %tmp16347, i64 1
+ %tmp16349 = getelementptr inbounds float* %tmp16348, i64 1
+ %tmp16350 = getelementptr inbounds float* %tmp16349, i64 1
+ %tmp16351 = getelementptr inbounds float* %tmp16350, i64 1
+ %tmp16352 = getelementptr inbounds float* %tmp16351, i64 1
+ %tmp16353 = getelementptr inbounds float* %tmp16352, i64 1
+ %tmp16354 = getelementptr inbounds float* %tmp16353, i64 1
+ %tmp16355 = getelementptr inbounds float* %tmp16354, i64 1
+ %tmp16356 = getelementptr inbounds float* %tmp16355, i64 1
+ %tmp16357 = getelementptr inbounds float* %tmp16356, i64 1
+ %tmp16358 = getelementptr inbounds float* %tmp16357, i64 1
+ %tmp16359 = getelementptr inbounds float* %tmp16358, i64 1
+ %tmp16360 = getelementptr inbounds float* %tmp16359, i64 1
+ %tmp16361 = getelementptr inbounds float* %tmp16360, i64 1
+ %tmp16362 = getelementptr inbounds float* %tmp16361, i64 1
+ %tmp16363 = getelementptr inbounds float* %tmp16362, i64 1
+ %tmp16364 = getelementptr inbounds float* %tmp16363, i64 1
+ %tmp16365 = getelementptr inbounds float* %tmp16364, i64 1
+ %tmp16366 = getelementptr inbounds float* %tmp16365, i64 1
+ %tmp16367 = getelementptr inbounds float* %tmp16366, i64 1
+ %tmp16368 = getelementptr inbounds float* %tmp16367, i64 1
+ %tmp16369 = getelementptr inbounds float* %tmp16368, i64 1
+ %tmp16370 = getelementptr inbounds float* %tmp16369, i64 1
+ %tmp16371 = getelementptr inbounds float* %tmp16370, i64 1
+ %tmp16372 = getelementptr inbounds float* %tmp16371, i64 1
+ %tmp16373 = getelementptr inbounds float* %tmp16372, i64 1
+ %tmp16374 = getelementptr inbounds float* %tmp16373, i64 1
+ %tmp16375 = getelementptr inbounds float* %tmp16374, i64 1
+ %tmp16376 = getelementptr inbounds float* %tmp16375, i64 1
+ %tmp16377 = getelementptr inbounds float* %tmp16376, i64 1
+ %tmp16378 = getelementptr inbounds float* %tmp16377, i64 1
+ %tmp16379 = getelementptr inbounds float* %tmp16378, i64 1
+ %tmp16380 = getelementptr inbounds float* %tmp16379, i64 1
+ %tmp16381 = getelementptr inbounds float* %tmp16380, i64 1
+ %tmp16382 = getelementptr inbounds float* %tmp16381, i64 1
+ %tmp16383 = getelementptr inbounds float* %tmp16382, i64 1
+ %tmp16384 = getelementptr inbounds float* %tmp16383, i64 1
+ %tmp16385 = getelementptr inbounds float* %tmp16384, i64 1
+ %tmp16386 = getelementptr inbounds float* %tmp16385, i64 1
+ %tmp16387 = getelementptr inbounds float* %tmp16386, i64 1
+ %tmp16388 = getelementptr inbounds float* %tmp16387, i64 1
+ %tmp16389 = getelementptr inbounds float* %tmp16388, i64 1
+ %tmp16390 = getelementptr inbounds float* %tmp16389, i64 1
+ %tmp16391 = getelementptr inbounds float* %tmp16390, i64 1
+ %tmp16392 = getelementptr inbounds float* %tmp16391, i64 1
+ %tmp16393 = getelementptr inbounds float* %tmp16392, i64 1
+ %tmp16394 = getelementptr inbounds float* %tmp16393, i64 1
+ %tmp16395 = getelementptr inbounds float* %tmp16394, i64 1
+ %tmp16396 = getelementptr inbounds float* %tmp16395, i64 1
+ %tmp16397 = getelementptr inbounds float* %tmp16396, i64 1
+ %tmp16398 = getelementptr inbounds float* %tmp16397, i64 1
+ %tmp16399 = getelementptr inbounds float* %tmp16398, i64 1
+ %tmp16400 = getelementptr inbounds float* %tmp16399, i64 1
+ %tmp16401 = getelementptr inbounds float* %tmp16400, i64 1
+ %tmp16402 = getelementptr inbounds float* %tmp16401, i64 1
+ %tmp16403 = getelementptr inbounds float* %tmp16402, i64 1
+ %tmp16404 = getelementptr inbounds float* %tmp16403, i64 1
+ %tmp16405 = getelementptr inbounds float* %tmp16404, i64 1
+ %tmp16406 = getelementptr inbounds float* %tmp16405, i64 1
+ %tmp16407 = getelementptr inbounds float* %tmp16406, i64 1
+ %tmp16408 = getelementptr inbounds float* %tmp16407, i64 1
+ %tmp16409 = getelementptr inbounds float* %tmp16408, i64 1
+ %tmp16410 = getelementptr inbounds float* %tmp16409, i64 1
+ %tmp16411 = getelementptr inbounds float* %tmp16410, i64 1
+ %tmp16412 = getelementptr inbounds float* %tmp16411, i64 1
+ %tmp16413 = getelementptr inbounds float* %tmp16412, i64 1
+ %tmp16414 = getelementptr inbounds float* %tmp16413, i64 1
+ %tmp16415 = getelementptr inbounds float* %tmp16414, i64 1
+ %tmp16416 = getelementptr inbounds float* %tmp16415, i64 1
+ %tmp16417 = getelementptr inbounds float* %tmp16416, i64 1
+ %tmp16418 = getelementptr inbounds float* %tmp16417, i64 1
+ %tmp16419 = getelementptr inbounds float* %tmp16418, i64 1
+ %tmp16420 = getelementptr inbounds float* %tmp16419, i64 1
+ %tmp16421 = getelementptr inbounds float* %tmp16420, i64 1
+ %tmp16422 = getelementptr inbounds float* %tmp16421, i64 1
+ %tmp16423 = getelementptr inbounds float* %tmp16422, i64 1
+ %tmp16424 = getelementptr inbounds float* %tmp16423, i64 1
+ %tmp16425 = getelementptr inbounds float* %tmp16424, i64 1
+ %tmp16426 = getelementptr inbounds float* %tmp16425, i64 1
+ %tmp16427 = getelementptr inbounds float* %tmp16426, i64 1
+ %tmp16428 = getelementptr inbounds float* %tmp16427, i64 1
+ %tmp16429 = getelementptr inbounds float* %tmp16428, i64 1
+ %tmp16430 = getelementptr inbounds float* %tmp16429, i64 1
+ %tmp16431 = getelementptr inbounds float* %tmp16430, i64 1
+ %tmp16432 = getelementptr inbounds float* %tmp16431, i64 1
+ %tmp16433 = getelementptr inbounds float* %tmp16432, i64 1
+ %tmp16434 = getelementptr inbounds float* %tmp16433, i64 1
+ %tmp16435 = getelementptr inbounds float* %tmp16434, i64 1
+ %tmp16436 = getelementptr inbounds float* %tmp16435, i64 1
+ %tmp16437 = getelementptr inbounds float* %tmp16436, i64 1
+ %tmp16438 = getelementptr inbounds float* %tmp16437, i64 1
+ %tmp16439 = getelementptr inbounds float* %tmp16438, i64 1
+ %tmp16440 = getelementptr inbounds float* %tmp16439, i64 1
+ %tmp16441 = getelementptr inbounds float* %tmp16440, i64 1
+ %tmp16442 = getelementptr inbounds float* %tmp16441, i64 1
+ %tmp16443 = getelementptr inbounds float* %tmp16442, i64 1
+ %tmp16444 = getelementptr inbounds float* %tmp16443, i64 1
+ %tmp16445 = getelementptr inbounds float* %tmp16444, i64 1
+ %tmp16446 = getelementptr inbounds float* %tmp16445, i64 1
+ %tmp16447 = getelementptr inbounds float* %tmp16446, i64 1
+ %tmp16448 = getelementptr inbounds float* %tmp16447, i64 1
+ %tmp16449 = getelementptr inbounds float* %tmp16448, i64 1
+ %tmp16450 = getelementptr inbounds float* %tmp16449, i64 1
+ %tmp16451 = getelementptr inbounds float* %tmp16450, i64 1
+ %tmp16452 = getelementptr inbounds float* %tmp16451, i64 1
+ %tmp16453 = getelementptr inbounds float* %tmp16452, i64 1
+ %tmp16454 = getelementptr inbounds float* %tmp16453, i64 1
+ %tmp16455 = getelementptr inbounds float* %tmp16454, i64 1
+ %tmp16456 = getelementptr inbounds float* %tmp16455, i64 1
+ %tmp16457 = getelementptr inbounds float* %tmp16456, i64 1
+ %tmp16458 = getelementptr inbounds float* %tmp16457, i64 1
+ %tmp16459 = getelementptr inbounds float* %tmp16458, i64 1
+ %tmp16460 = getelementptr inbounds float* %tmp16459, i64 1
+ %tmp16461 = getelementptr inbounds float* %tmp16460, i64 1
+ %tmp16462 = getelementptr inbounds float* %tmp16461, i64 1
+ %tmp16463 = getelementptr inbounds float* %tmp16462, i64 1
+ %tmp16464 = getelementptr inbounds float* %tmp16463, i64 1
+ %tmp16465 = getelementptr inbounds float* %tmp16464, i64 1
+ %tmp16466 = getelementptr inbounds float* %tmp16465, i64 1
+ %tmp16467 = getelementptr inbounds float* %tmp16466, i64 1
+ %tmp16468 = getelementptr inbounds float* %tmp16467, i64 1
+ %tmp16469 = getelementptr inbounds float* %tmp16468, i64 1
+ %tmp16470 = getelementptr inbounds float* %tmp16469, i64 1
+ %tmp16471 = getelementptr inbounds float* %tmp16470, i64 1
+ %tmp16472 = getelementptr inbounds float* %tmp16471, i64 1
+ %tmp16473 = getelementptr inbounds float* %tmp16472, i64 1
+ %tmp16474 = getelementptr inbounds float* %tmp16473, i64 1
+ %tmp16475 = getelementptr inbounds float* %tmp16474, i64 1
+ %tmp16476 = getelementptr inbounds float* %tmp16475, i64 1
+ %tmp16477 = getelementptr inbounds float* %tmp16476, i64 1
+ %tmp16478 = getelementptr inbounds float* %tmp16477, i64 1
+ %tmp16479 = getelementptr inbounds float* %tmp16478, i64 1
+ %tmp16480 = getelementptr inbounds float* %tmp16479, i64 1
+ %tmp16481 = getelementptr inbounds float* %tmp16480, i64 1
+ %tmp16482 = getelementptr inbounds float* %tmp16481, i64 1
+ %tmp16483 = getelementptr inbounds float* %tmp16482, i64 1
+ %tmp16484 = getelementptr inbounds float* %tmp16483, i64 1
+ %tmp16485 = getelementptr inbounds float* %tmp16484, i64 1
+ %tmp16486 = getelementptr inbounds float* %tmp16485, i64 1
+ %tmp16487 = getelementptr inbounds float* %tmp16486, i64 1
+ %tmp16488 = getelementptr inbounds float* %tmp16487, i64 1
+ %tmp16489 = getelementptr inbounds float* %tmp16488, i64 1
+ %tmp16490 = getelementptr inbounds float* %tmp16489, i64 1
+ %tmp16491 = getelementptr inbounds float* %tmp16490, i64 1
+ %tmp16492 = getelementptr inbounds float* %tmp16491, i64 1
+ %tmp16493 = getelementptr inbounds float* %tmp16492, i64 1
+ %tmp16494 = getelementptr inbounds float* %tmp16493, i64 1
+ %tmp16495 = getelementptr inbounds float* %tmp16494, i64 1
+ %tmp16496 = getelementptr inbounds float* %tmp16495, i64 1
+ %tmp16497 = getelementptr inbounds float* %tmp16496, i64 1
+ %tmp16498 = getelementptr inbounds float* %tmp16497, i64 1
+ %tmp16499 = getelementptr inbounds float* %tmp16498, i64 1
+ %tmp16500 = getelementptr inbounds float* %tmp16499, i64 1
+ %tmp16501 = getelementptr inbounds float* %tmp16500, i64 1
+ %tmp16502 = getelementptr inbounds float* %tmp16501, i64 1
+ %tmp16503 = getelementptr inbounds float* %tmp16502, i64 1
+ %tmp16504 = getelementptr inbounds float* %tmp16503, i64 1
+ %tmp16505 = getelementptr inbounds float* %tmp16504, i64 1
+ %tmp16506 = getelementptr inbounds float* %tmp16505, i64 1
+ %tmp16507 = getelementptr inbounds float* %tmp16506, i64 1
+ %tmp16508 = getelementptr inbounds float* %tmp16507, i64 1
+ %tmp16509 = getelementptr inbounds float* %tmp16508, i64 1
+ %tmp16510 = getelementptr inbounds float* %tmp16509, i64 1
+ %tmp16511 = getelementptr inbounds float* %tmp16510, i64 1
+ %tmp16512 = getelementptr inbounds float* %tmp16511, i64 1
+ %tmp16513 = getelementptr inbounds float* %tmp16512, i64 1
+ %tmp16514 = getelementptr inbounds float* %tmp16513, i64 1
+ %tmp16515 = getelementptr inbounds float* %tmp16514, i64 1
+ %tmp16516 = getelementptr inbounds float* %tmp16515, i64 1
+ %tmp16517 = getelementptr inbounds float* %tmp16516, i64 1
+ %tmp16518 = getelementptr inbounds float* %tmp16517, i64 1
+ %tmp16519 = getelementptr inbounds float* %tmp16518, i64 1
+ %tmp16520 = getelementptr inbounds float* %tmp16519, i64 1
+ %tmp16521 = getelementptr inbounds float* %tmp16520, i64 1
+ %tmp16522 = getelementptr inbounds float* %tmp16521, i64 1
+ %tmp16523 = getelementptr inbounds float* %tmp16522, i64 1
+ %tmp16524 = getelementptr inbounds float* %tmp16523, i64 1
+ %tmp16525 = getelementptr inbounds float* %tmp16524, i64 1
+ %tmp16526 = getelementptr inbounds float* %tmp16525, i64 1
+ %tmp16527 = getelementptr inbounds float* %tmp16526, i64 1
+ %tmp16528 = getelementptr inbounds float* %tmp16527, i64 1
+ %tmp16529 = getelementptr inbounds float* %tmp16528, i64 1
+ %tmp16530 = getelementptr inbounds float* %tmp16529, i64 1
+ %tmp16531 = getelementptr inbounds float* %tmp16530, i64 1
+ %tmp16532 = getelementptr inbounds float* %tmp16531, i64 1
+ %tmp16533 = getelementptr inbounds float* %tmp16532, i64 1
+ %tmp16534 = getelementptr inbounds float* %tmp16533, i64 1
+ %tmp16535 = getelementptr inbounds float* %tmp16534, i64 1
+ %tmp16536 = getelementptr inbounds float* %tmp16535, i64 1
+ %tmp16537 = getelementptr inbounds float* %tmp16536, i64 1
+ %tmp16538 = getelementptr inbounds float* %tmp16537, i64 1
+ %tmp16539 = getelementptr inbounds float* %tmp16538, i64 1
+ %tmp16540 = getelementptr inbounds float* %tmp16539, i64 1
+ %tmp16541 = getelementptr inbounds float* %tmp16540, i64 1
+ %tmp16542 = getelementptr inbounds float* %tmp16541, i64 1
+ %tmp16543 = getelementptr inbounds float* %tmp16542, i64 1
+ %tmp16544 = getelementptr inbounds float* %tmp16543, i64 1
+ %tmp16545 = getelementptr inbounds float* %tmp16544, i64 1
+ %tmp16546 = getelementptr inbounds float* %tmp16545, i64 1
+ %tmp16547 = getelementptr inbounds float* %tmp16546, i64 1
+ %tmp16548 = getelementptr inbounds float* %tmp16547, i64 1
+ %tmp16549 = getelementptr inbounds float* %tmp16548, i64 1
+ %tmp16550 = getelementptr inbounds float* %tmp16549, i64 1
+ %tmp16551 = getelementptr inbounds float* %tmp16550, i64 1
+ %tmp16552 = getelementptr inbounds float* %tmp16551, i64 1
+ %tmp16553 = getelementptr inbounds float* %tmp16552, i64 1
+ %tmp16554 = getelementptr inbounds float* %tmp16553, i64 1
+ %tmp16555 = getelementptr inbounds float* %tmp16554, i64 1
+ %tmp16556 = getelementptr inbounds float* %tmp16555, i64 1
+ %tmp16557 = getelementptr inbounds float* %tmp16556, i64 1
+ %tmp16558 = getelementptr inbounds float* %tmp16557, i64 1
+ %tmp16559 = getelementptr inbounds float* %tmp16558, i64 1
+ %tmp16560 = getelementptr inbounds float* %tmp16559, i64 1
+ %tmp16561 = getelementptr inbounds float* %tmp16560, i64 1
+ %tmp16562 = getelementptr inbounds float* %tmp16561, i64 1
+ %tmp16563 = getelementptr inbounds float* %tmp16562, i64 1
+ %tmp16564 = getelementptr inbounds float* %tmp16563, i64 1
+ %tmp16565 = getelementptr inbounds float* %tmp16564, i64 1
+ %tmp16566 = getelementptr inbounds float* %tmp16565, i64 1
+ %tmp16567 = getelementptr inbounds float* %tmp16566, i64 1
+ %tmp16568 = getelementptr inbounds float* %tmp16567, i64 1
+ %tmp16569 = getelementptr inbounds float* %tmp16568, i64 1
+ %tmp16570 = getelementptr inbounds float* %tmp16569, i64 1
+ %tmp16571 = getelementptr inbounds float* %tmp16570, i64 1
+ %tmp16572 = getelementptr inbounds float* %tmp16571, i64 1
+ %tmp16573 = getelementptr inbounds float* %tmp16572, i64 1
+ %tmp16574 = getelementptr inbounds float* %tmp16573, i64 1
+ %tmp16575 = getelementptr inbounds float* %tmp16574, i64 1
+ %tmp16576 = getelementptr inbounds float* %tmp16575, i64 1
+ %tmp16577 = getelementptr inbounds float* %tmp16576, i64 1
+ %tmp16578 = getelementptr inbounds float* %tmp16577, i64 1
+ %tmp16579 = getelementptr inbounds float* %tmp16578, i64 1
+ %tmp16580 = getelementptr inbounds float* %tmp16579, i64 1
+ %tmp16581 = getelementptr inbounds float* %tmp16580, i64 1
+ %tmp16582 = getelementptr inbounds float* %tmp16581, i64 1
+ %tmp16583 = getelementptr inbounds float* %tmp16582, i64 1
+ %tmp16584 = getelementptr inbounds float* %tmp16583, i64 1
+ %tmp16585 = getelementptr inbounds float* %tmp16584, i64 1
+ %tmp16586 = getelementptr inbounds float* %tmp16585, i64 1
+ %tmp16587 = getelementptr inbounds float* %tmp16586, i64 1
+ %tmp16588 = getelementptr inbounds float* %tmp16587, i64 1
+ %tmp16589 = getelementptr inbounds float* %tmp16588, i64 1
+ %tmp16590 = getelementptr inbounds float* %tmp16589, i64 1
+ %tmp16591 = getelementptr inbounds float* %tmp16590, i64 1
+ %tmp16592 = getelementptr inbounds float* %tmp16591, i64 1
+ %tmp16593 = getelementptr inbounds float* %tmp16592, i64 1
+ %tmp16594 = getelementptr inbounds float* %tmp16593, i64 1
+ %tmp16595 = getelementptr inbounds float* %tmp16594, i64 1
+ %tmp16596 = getelementptr inbounds float* %tmp16595, i64 1
+ %tmp16597 = getelementptr inbounds float* %tmp16596, i64 1
+ %tmp16598 = getelementptr inbounds float* %tmp16597, i64 1
+ %tmp16599 = getelementptr inbounds float* %tmp16598, i64 1
+ %tmp16600 = getelementptr inbounds float* %tmp16599, i64 1
+ %tmp16601 = getelementptr inbounds float* %tmp16600, i64 1
+ %tmp16602 = getelementptr inbounds float* %tmp16601, i64 1
+ %tmp16603 = getelementptr inbounds float* %tmp16602, i64 1
+ %tmp16604 = getelementptr inbounds float* %tmp16603, i64 1
+ %tmp16605 = getelementptr inbounds float* %tmp16604, i64 1
+ %tmp16606 = getelementptr inbounds float* %tmp16605, i64 1
+ %tmp16607 = getelementptr inbounds float* %tmp16606, i64 1
+ %tmp16608 = getelementptr inbounds float* %tmp16607, i64 1
+ %tmp16609 = getelementptr inbounds float* %tmp16608, i64 1
+ %tmp16610 = getelementptr inbounds float* %tmp16609, i64 1
+ %tmp16611 = getelementptr inbounds float* %tmp16610, i64 1
+ %tmp16612 = getelementptr inbounds float* %tmp16611, i64 1
+ %tmp16613 = getelementptr inbounds float* %tmp16612, i64 1
+ %tmp16614 = getelementptr inbounds float* %tmp16613, i64 1
+ %tmp16615 = getelementptr inbounds float* %tmp16614, i64 1
+ %tmp16616 = getelementptr inbounds float* %tmp16615, i64 1
+ %tmp16617 = getelementptr inbounds float* %tmp16616, i64 1
+ %tmp16618 = getelementptr inbounds float* %tmp16617, i64 1
+ %tmp16619 = getelementptr inbounds float* %tmp16618, i64 1
+ %tmp16620 = getelementptr inbounds float* %tmp16619, i64 1
+ %tmp16621 = getelementptr inbounds float* %tmp16620, i64 1
+ %tmp16622 = getelementptr inbounds float* %tmp16621, i64 1
+ %tmp16623 = getelementptr inbounds float* %tmp16622, i64 1
+ %tmp16624 = getelementptr inbounds float* %tmp16623, i64 1
+ %tmp16625 = getelementptr inbounds float* %tmp16624, i64 1
+ %tmp16626 = getelementptr inbounds float* %tmp16625, i64 1
+ %tmp16627 = getelementptr inbounds float* %tmp16626, i64 1
+ %tmp16628 = getelementptr inbounds float* %tmp16627, i64 1
+ %tmp16629 = getelementptr inbounds float* %tmp16628, i64 1
+ %tmp16630 = getelementptr inbounds float* %tmp16629, i64 1
+ %tmp16631 = getelementptr inbounds float* %tmp16630, i64 1
+ %tmp16632 = getelementptr inbounds float* %tmp16631, i64 1
+ %tmp16633 = getelementptr inbounds float* %tmp16632, i64 1
+ %tmp16634 = getelementptr inbounds float* %tmp16633, i64 1
+ %tmp16635 = getelementptr inbounds float* %tmp16634, i64 1
+ %tmp16636 = getelementptr inbounds float* %tmp16635, i64 1
+ %tmp16637 = getelementptr inbounds float* %tmp16636, i64 1
+ %tmp16638 = getelementptr inbounds float* %tmp16637, i64 1
+ %tmp16639 = getelementptr inbounds float* %tmp16638, i64 1
+ %tmp16640 = getelementptr inbounds float* %tmp16639, i64 1
+ %tmp16641 = getelementptr inbounds float* %tmp16640, i64 1
+ %tmp16642 = getelementptr inbounds float* %tmp16641, i64 1
+ %tmp16643 = getelementptr inbounds float* %tmp16642, i64 1
+ %tmp16644 = getelementptr inbounds float* %tmp16643, i64 1
+ %tmp16645 = getelementptr inbounds float* %tmp16644, i64 1
+ %tmp16646 = getelementptr inbounds float* %tmp16645, i64 1
+ %tmp16647 = getelementptr inbounds float* %tmp16646, i64 1
+ %tmp16648 = getelementptr inbounds float* %tmp16647, i64 1
+ %tmp16649 = getelementptr inbounds float* %tmp16648, i64 1
+ %tmp16650 = getelementptr inbounds float* %tmp16649, i64 1
+ %tmp16651 = getelementptr inbounds float* %tmp16650, i64 1
+ %tmp16652 = getelementptr inbounds float* %tmp16651, i64 1
+ %tmp16653 = getelementptr inbounds float* %tmp16652, i64 1
+ %tmp16654 = getelementptr inbounds float* %tmp16653, i64 1
+ %tmp16655 = getelementptr inbounds float* %tmp16654, i64 1
+ %tmp16656 = getelementptr inbounds float* %tmp16655, i64 1
+ %tmp16657 = getelementptr inbounds float* %tmp16656, i64 1
+ %tmp16658 = getelementptr inbounds float* %tmp16657, i64 1
+ %tmp16659 = getelementptr inbounds float* %tmp16658, i64 1
+ %tmp16660 = getelementptr inbounds float* %tmp16659, i64 1
+ %tmp16661 = getelementptr inbounds float* %tmp16660, i64 1
+ %tmp16662 = getelementptr inbounds float* %tmp16661, i64 1
+ %tmp16663 = getelementptr inbounds float* %tmp16662, i64 1
+ %tmp16664 = getelementptr inbounds float* %tmp16663, i64 1
+ %tmp16665 = getelementptr inbounds float* %tmp16664, i64 1
+ %tmp16666 = getelementptr inbounds float* %tmp16665, i64 1
+ %tmp16667 = getelementptr inbounds float* %tmp16666, i64 1
+ %tmp16668 = getelementptr inbounds float* %tmp16667, i64 1
+ %tmp16669 = getelementptr inbounds float* %tmp16668, i64 1
+ %tmp16670 = getelementptr inbounds float* %tmp16669, i64 1
+ %tmp16671 = getelementptr inbounds float* %tmp16670, i64 1
+ %tmp16672 = getelementptr inbounds float* %tmp16671, i64 1
+ %tmp16673 = getelementptr inbounds float* %tmp16672, i64 1
+ %tmp16674 = getelementptr inbounds float* %tmp16673, i64 1
+ %tmp16675 = getelementptr inbounds float* %tmp16674, i64 1
+ %tmp16676 = getelementptr inbounds float* %tmp16675, i64 1
+ %tmp16677 = getelementptr inbounds float* %tmp16676, i64 1
+ %tmp16678 = getelementptr inbounds float* %tmp16677, i64 1
+ %tmp16679 = getelementptr inbounds float* %tmp16678, i64 1
+ %tmp16680 = getelementptr inbounds float* %tmp16679, i64 1
+ %tmp16681 = getelementptr inbounds float* %tmp16680, i64 1
+ %tmp16682 = getelementptr inbounds float* %tmp16681, i64 1
+ %tmp16683 = getelementptr inbounds float* %tmp16682, i64 1
+ %tmp16684 = getelementptr inbounds float* %tmp16683, i64 1
+ %tmp16685 = getelementptr inbounds float* %tmp16684, i64 1
+ %tmp16686 = getelementptr inbounds float* %tmp16685, i64 1
+ %tmp16687 = getelementptr inbounds float* %tmp16686, i64 1
+ %tmp16688 = getelementptr inbounds float* %tmp16687, i64 1
+ %tmp16689 = getelementptr inbounds float* %tmp16688, i64 1
+ %tmp16690 = getelementptr inbounds float* %tmp16689, i64 1
+ %tmp16691 = getelementptr inbounds float* %tmp16690, i64 1
+ %tmp16692 = getelementptr inbounds float* %tmp16691, i64 1
+ %tmp16693 = getelementptr inbounds float* %tmp16692, i64 1
+ %tmp16694 = getelementptr inbounds float* %tmp16693, i64 1
+ %tmp16695 = getelementptr inbounds float* %tmp16694, i64 1
+ %tmp16696 = getelementptr inbounds float* %tmp16695, i64 1
+ %tmp16697 = getelementptr inbounds float* %tmp16696, i64 1
+ %tmp16698 = getelementptr inbounds float* %tmp16697, i64 1
+ %tmp16699 = getelementptr inbounds float* %tmp16698, i64 1
+ %tmp16700 = getelementptr inbounds float* %tmp16699, i64 1
+ %tmp16701 = getelementptr inbounds float* %tmp16700, i64 1
+ %tmp16702 = getelementptr inbounds float* %tmp16701, i64 1
+ %tmp16703 = getelementptr inbounds float* %tmp16702, i64 1
+ %tmp16704 = getelementptr inbounds float* %tmp16703, i64 1
+ %tmp16705 = getelementptr inbounds float* %tmp16704, i64 1
+ %tmp16706 = getelementptr inbounds float* %tmp16705, i64 1
+ %tmp16707 = getelementptr inbounds float* %tmp16706, i64 1
+ %tmp16708 = getelementptr inbounds float* %tmp16707, i64 1
+ %tmp16709 = getelementptr inbounds float* %tmp16708, i64 1
+ %tmp16710 = getelementptr inbounds float* %tmp16709, i64 1
+ %tmp16711 = getelementptr inbounds float* %tmp16710, i64 1
+ %tmp16712 = getelementptr inbounds float* %tmp16711, i64 1
+ %tmp16713 = getelementptr inbounds float* %tmp16712, i64 1
+ %tmp16714 = getelementptr inbounds float* %tmp16713, i64 1
+ %tmp16715 = getelementptr inbounds float* %tmp16714, i64 1
+ %tmp16716 = getelementptr inbounds float* %tmp16715, i64 1
+ %tmp16717 = getelementptr inbounds float* %tmp16716, i64 1
+ %tmp16718 = getelementptr inbounds float* %tmp16717, i64 1
+ %tmp16719 = getelementptr inbounds float* %tmp16718, i64 1
+ %tmp16720 = getelementptr inbounds float* %tmp16719, i64 1
+ %tmp16721 = getelementptr inbounds float* %tmp16720, i64 1
+ %tmp16722 = getelementptr inbounds float* %tmp16721, i64 1
+ %tmp16723 = getelementptr inbounds float* %tmp16722, i64 1
+ %tmp16724 = getelementptr inbounds float* %tmp16723, i64 1
+ %tmp16725 = getelementptr inbounds float* %tmp16724, i64 1
+ %tmp16726 = getelementptr inbounds float* %tmp16725, i64 1
+ %tmp16727 = getelementptr inbounds float* %tmp16726, i64 1
+ %tmp16728 = getelementptr inbounds float* %tmp16727, i64 1
+ %tmp16729 = getelementptr inbounds float* %tmp16728, i64 1
+ %tmp16730 = getelementptr inbounds float* %tmp16729, i64 1
+ %tmp16731 = getelementptr inbounds float* %tmp16730, i64 1
+ %tmp16732 = getelementptr inbounds float* %tmp16731, i64 1
+ %tmp16733 = getelementptr inbounds float* %tmp16732, i64 1
+ %tmp16734 = getelementptr inbounds float* %tmp16733, i64 1
+ %tmp16735 = getelementptr inbounds float* %tmp16734, i64 1
+ %tmp16736 = getelementptr inbounds float* %tmp16735, i64 1
+ %tmp16737 = getelementptr inbounds float* %tmp16736, i64 1
+ %tmp16738 = getelementptr inbounds float* %tmp16737, i64 1
+ %tmp16739 = getelementptr inbounds float* %tmp16738, i64 1
+ %tmp16740 = getelementptr inbounds float* %tmp16739, i64 1
+ %tmp16741 = getelementptr inbounds float* %tmp16740, i64 1
+ %tmp16742 = getelementptr inbounds float* %tmp16741, i64 1
+ %tmp16743 = getelementptr inbounds float* %tmp16742, i64 1
+ %tmp16744 = getelementptr inbounds float* %tmp16743, i64 1
+ %tmp16745 = getelementptr inbounds float* %tmp16744, i64 1
+ %tmp16746 = getelementptr inbounds float* %tmp16745, i64 1
+ %tmp16747 = getelementptr inbounds float* %tmp16746, i64 1
+ %tmp16748 = getelementptr inbounds float* %tmp16747, i64 1
+ %tmp16749 = getelementptr inbounds float* %tmp16748, i64 1
+ %tmp16750 = getelementptr inbounds float* %tmp16749, i64 1
+ %tmp16751 = getelementptr inbounds float* %tmp16750, i64 1
+ %tmp16752 = getelementptr inbounds float* %tmp16751, i64 1
+ %tmp16753 = getelementptr inbounds float* %tmp16752, i64 1
+ %tmp16754 = getelementptr inbounds float* %tmp16753, i64 1
+ %tmp16755 = getelementptr inbounds float* %tmp16754, i64 1
+ %tmp16756 = getelementptr inbounds float* %tmp16755, i64 1
+ %tmp16757 = getelementptr inbounds float* %tmp16756, i64 1
+ %tmp16758 = getelementptr inbounds float* %tmp16757, i64 1
+ %tmp16759 = getelementptr inbounds float* %tmp16758, i64 1
+ %tmp16760 = getelementptr inbounds float* %tmp16759, i64 1
+ %tmp16761 = getelementptr inbounds float* %tmp16760, i64 1
+ %tmp16762 = getelementptr inbounds float* %tmp16761, i64 1
+ %tmp16763 = getelementptr inbounds float* %tmp16762, i64 1
+ %tmp16764 = getelementptr inbounds float* %tmp16763, i64 1
+ %tmp16765 = getelementptr inbounds float* %tmp16764, i64 1
+ %tmp16766 = getelementptr inbounds float* %tmp16765, i64 1
+ %tmp16767 = getelementptr inbounds float* %tmp16766, i64 1
+ %tmp16768 = getelementptr inbounds float* %tmp16767, i64 1
+ %tmp16769 = getelementptr inbounds float* %tmp16768, i64 1
+ %tmp16770 = getelementptr inbounds float* %tmp16769, i64 1
+ %tmp16771 = getelementptr inbounds float* %tmp16770, i64 1
+ %tmp16772 = getelementptr inbounds float* %tmp16771, i64 1
+ %tmp16773 = getelementptr inbounds float* %tmp16772, i64 1
+ %tmp16774 = getelementptr inbounds float* %tmp16773, i64 1
+ %tmp16775 = getelementptr inbounds float* %tmp16774, i64 1
+ %tmp16776 = getelementptr inbounds float* %tmp16775, i64 1
+ %tmp16777 = getelementptr inbounds float* %tmp16776, i64 1
+ %tmp16778 = getelementptr inbounds float* %tmp16777, i64 1
+ %tmp16779 = getelementptr inbounds float* %tmp16778, i64 1
+ %tmp16780 = getelementptr inbounds float* %tmp16779, i64 1
+ %tmp16781 = getelementptr inbounds float* %tmp16780, i64 1
+ %tmp16782 = getelementptr inbounds float* %tmp16781, i64 1
+ %tmp16783 = getelementptr inbounds float* %tmp16782, i64 1
+ %tmp16784 = getelementptr inbounds float* %tmp16783, i64 1
+ %tmp16785 = getelementptr inbounds float* %tmp16784, i64 1
+ %tmp16786 = getelementptr inbounds float* %tmp16785, i64 1
+ %tmp16787 = getelementptr inbounds float* %tmp16786, i64 1
+ %tmp16788 = getelementptr inbounds float* %tmp16787, i64 1
+ %tmp16789 = getelementptr inbounds float* %tmp16788, i64 1
+ %tmp16790 = getelementptr inbounds float* %tmp16789, i64 1
+ %tmp16791 = getelementptr inbounds float* %tmp16790, i64 1
+ %tmp16792 = getelementptr inbounds float* %tmp16791, i64 1
+ %tmp16793 = getelementptr inbounds float* %tmp16792, i64 1
+ %tmp16794 = getelementptr inbounds float* %tmp16793, i64 1
+ %tmp16795 = getelementptr inbounds float* %tmp16794, i64 1
+ %tmp16796 = getelementptr inbounds float* %tmp16795, i64 1
+ %tmp16797 = getelementptr inbounds float* %tmp16796, i64 1
+ %tmp16798 = getelementptr inbounds float* %tmp16797, i64 1
+ %tmp16799 = getelementptr inbounds float* %tmp16798, i64 1
+ %tmp16800 = getelementptr inbounds float* %tmp16799, i64 1
+ %tmp16801 = getelementptr inbounds float* %tmp16800, i64 1
+ %tmp16802 = getelementptr inbounds float* %tmp16801, i64 1
+ %tmp16803 = getelementptr inbounds float* %tmp16802, i64 1
+ %tmp16804 = getelementptr inbounds float* %tmp16803, i64 1
+ %tmp16805 = getelementptr inbounds float* %tmp16804, i64 1
+ %tmp16806 = getelementptr inbounds float* %tmp16805, i64 1
+ %tmp16807 = getelementptr inbounds float* %tmp16806, i64 1
+ %tmp16808 = getelementptr inbounds float* %tmp16807, i64 1
+ %tmp16809 = getelementptr inbounds float* %tmp16808, i64 1
+ %tmp16810 = getelementptr inbounds float* %tmp16809, i64 1
+ %tmp16811 = getelementptr inbounds float* %tmp16810, i64 1
+ %tmp16812 = getelementptr inbounds float* %tmp16811, i64 1
+ %tmp16813 = getelementptr inbounds float* %tmp16812, i64 1
+ %tmp16814 = getelementptr inbounds float* %tmp16813, i64 1
+ %tmp16815 = getelementptr inbounds float* %tmp16814, i64 1
+ %tmp16816 = getelementptr inbounds float* %tmp16815, i64 1
+ %tmp16817 = getelementptr inbounds float* %tmp16816, i64 1
+ %tmp16818 = getelementptr inbounds float* %tmp16817, i64 1
+ %tmp16819 = getelementptr inbounds float* %tmp16818, i64 1
+ %tmp16820 = getelementptr inbounds float* %tmp16819, i64 1
+ %tmp16821 = getelementptr inbounds float* %tmp16820, i64 1
+ %tmp16822 = getelementptr inbounds float* %tmp16821, i64 1
+ %tmp16823 = getelementptr inbounds float* %tmp16822, i64 1
+ %tmp16824 = getelementptr inbounds float* %tmp16823, i64 1
+ %tmp16825 = getelementptr inbounds float* %tmp16824, i64 1
+ %tmp16826 = getelementptr inbounds float* %tmp16825, i64 1
+ %tmp16827 = getelementptr inbounds float* %tmp16826, i64 1
+ %tmp16828 = getelementptr inbounds float* %tmp16827, i64 1
+ %tmp16829 = getelementptr inbounds float* %tmp16828, i64 1
+ %tmp16830 = getelementptr inbounds float* %tmp16829, i64 1
+ %tmp16831 = getelementptr inbounds float* %tmp16830, i64 1
+ %tmp16832 = getelementptr inbounds float* %tmp16831, i64 1
+ %tmp16833 = getelementptr inbounds float* %tmp16832, i64 1
+ %tmp16834 = getelementptr inbounds float* %tmp16833, i64 1
+ %tmp16835 = getelementptr inbounds float* %tmp16834, i64 1
+ %tmp16836 = getelementptr inbounds float* %tmp16835, i64 1
+ %tmp16837 = getelementptr inbounds float* %tmp16836, i64 1
+ %tmp16838 = getelementptr inbounds float* %tmp16837, i64 1
+ %tmp16839 = getelementptr inbounds float* %tmp16838, i64 1
+ %tmp16840 = getelementptr inbounds float* %tmp16839, i64 1
+ %tmp16841 = getelementptr inbounds float* %tmp16840, i64 1
+ %tmp16842 = getelementptr inbounds float* %tmp16841, i64 1
+ %tmp16843 = getelementptr inbounds float* %tmp16842, i64 1
+ %tmp16844 = getelementptr inbounds float* %tmp16843, i64 1
+ %tmp16845 = getelementptr inbounds float* %tmp16844, i64 1
+ %tmp16846 = getelementptr inbounds float* %tmp16845, i64 1
+ %tmp16847 = getelementptr inbounds float* %tmp16846, i64 1
+ %tmp16848 = getelementptr inbounds float* %tmp16847, i64 1
+ %tmp16849 = getelementptr inbounds float* %tmp16848, i64 1
+ %tmp16850 = getelementptr inbounds float* %tmp16849, i64 1
+ %tmp16851 = getelementptr inbounds float* %tmp16850, i64 1
+ %tmp16852 = getelementptr inbounds float* %tmp16851, i64 1
+ %tmp16853 = getelementptr inbounds float* %tmp16852, i64 1
+ %tmp16854 = getelementptr inbounds float* %tmp16853, i64 1
+ %tmp16855 = getelementptr inbounds float* %tmp16854, i64 1
+ %tmp16856 = getelementptr inbounds float* %tmp16855, i64 1
+ %tmp16857 = getelementptr inbounds float* %tmp16856, i64 1
+ %tmp16858 = getelementptr inbounds float* %tmp16857, i64 1
+ %tmp16859 = getelementptr inbounds float* %tmp16858, i64 1
+ %tmp16860 = getelementptr inbounds float* %tmp16859, i64 1
+ %tmp16861 = getelementptr inbounds float* %tmp16860, i64 1
+ %tmp16862 = getelementptr inbounds float* %tmp16861, i64 1
+ %tmp16863 = getelementptr inbounds float* %tmp16862, i64 1
+ %tmp16864 = getelementptr inbounds float* %tmp16863, i64 1
+ %tmp16865 = getelementptr inbounds float* %tmp16864, i64 1
+ %tmp16866 = getelementptr inbounds float* %tmp16865, i64 1
+ %tmp16867 = getelementptr inbounds float* %tmp16866, i64 1
+ %tmp16868 = getelementptr inbounds float* %tmp16867, i64 1
+ %tmp16869 = getelementptr inbounds float* %tmp16868, i64 1
+ %tmp16870 = getelementptr inbounds float* %tmp16869, i64 1
+ %tmp16871 = getelementptr inbounds float* %tmp16870, i64 1
+ %tmp16872 = getelementptr inbounds float* %tmp16871, i64 1
+ %tmp16873 = getelementptr inbounds float* %tmp16872, i64 1
+ %tmp16874 = getelementptr inbounds float* %tmp16873, i64 1
+ %tmp16875 = getelementptr inbounds float* %tmp16874, i64 1
+ %tmp16876 = getelementptr inbounds float* %tmp16875, i64 1
+ %tmp16877 = getelementptr inbounds float* %tmp16876, i64 1
+ %tmp16878 = getelementptr inbounds float* %tmp16877, i64 1
+ %tmp16879 = getelementptr inbounds float* %tmp16878, i64 1
+ %tmp16880 = getelementptr inbounds float* %tmp16879, i64 1
+ %tmp16881 = getelementptr inbounds float* %tmp16880, i64 1
+ %tmp16882 = getelementptr inbounds float* %tmp16881, i64 1
+ %tmp16883 = getelementptr inbounds float* %tmp16882, i64 1
+ %tmp16884 = getelementptr inbounds float* %tmp16883, i64 1
+ %tmp16885 = getelementptr inbounds float* %tmp16884, i64 1
+ %tmp16886 = getelementptr inbounds float* %tmp16885, i64 1
+ %tmp16887 = getelementptr inbounds float* %tmp16886, i64 1
+ %tmp16888 = getelementptr inbounds float* %tmp16887, i64 1
+ %tmp16889 = getelementptr inbounds float* %tmp16888, i64 1
+ %tmp16890 = getelementptr inbounds float* %tmp16889, i64 1
+ %tmp16891 = getelementptr inbounds float* %tmp16890, i64 1
+ %tmp16892 = getelementptr inbounds float* %tmp16891, i64 1
+ %tmp16893 = getelementptr inbounds float* %tmp16892, i64 1
+ %tmp16894 = getelementptr inbounds float* %tmp16893, i64 1
+ %tmp16895 = getelementptr inbounds float* %tmp16894, i64 1
+ %tmp16896 = getelementptr inbounds float* %tmp16895, i64 1
+ %tmp16897 = getelementptr inbounds float* %tmp16896, i64 1
+ %tmp16898 = getelementptr inbounds float* %tmp16897, i64 1
+ %tmp16899 = getelementptr inbounds float* %tmp16898, i64 1
+ %tmp16900 = getelementptr inbounds float* %tmp16899, i64 1
+ %tmp16901 = getelementptr inbounds float* %tmp16900, i64 1
+ %tmp16902 = getelementptr inbounds float* %tmp16901, i64 1
+ %tmp16903 = getelementptr inbounds float* %tmp16902, i64 1
+ %tmp16904 = getelementptr inbounds float* %tmp16903, i64 1
+ %tmp16905 = getelementptr inbounds float* %tmp16904, i64 1
+ %tmp16906 = getelementptr inbounds float* %tmp16905, i64 1
+ %tmp16907 = getelementptr inbounds float* %tmp16906, i64 1
+ %tmp16908 = getelementptr inbounds float* %tmp16907, i64 1
+ %tmp16909 = getelementptr inbounds float* %tmp16908, i64 1
+ %tmp16910 = getelementptr inbounds float* %tmp16909, i64 1
+ %tmp16911 = getelementptr inbounds float* %tmp16910, i64 1
+ %tmp16912 = getelementptr inbounds float* %tmp16911, i64 1
+ %tmp16913 = getelementptr inbounds float* %tmp16912, i64 1
+ %tmp16914 = getelementptr inbounds float* %tmp16913, i64 1
+ %tmp16915 = getelementptr inbounds float* %tmp16914, i64 1
+ %tmp16916 = getelementptr inbounds float* %tmp16915, i64 1
+ %tmp16917 = getelementptr inbounds float* %tmp16916, i64 1
+ %tmp16918 = getelementptr inbounds float* %tmp16917, i64 1
+ %tmp16919 = getelementptr inbounds float* %tmp16918, i64 1
+ %tmp16920 = getelementptr inbounds float* %tmp16919, i64 1
+ %tmp16921 = getelementptr inbounds float* %tmp16920, i64 1
+ %tmp16922 = getelementptr inbounds float* %tmp16921, i64 1
+ %tmp16923 = getelementptr inbounds float* %tmp16922, i64 1
+ %tmp16924 = getelementptr inbounds float* %tmp16923, i64 1
+ %tmp16925 = getelementptr inbounds float* %tmp16924, i64 1
+ %tmp16926 = getelementptr inbounds float* %tmp16925, i64 1
+ %tmp16927 = getelementptr inbounds float* %tmp16926, i64 1
+ %tmp16928 = getelementptr inbounds float* %tmp16927, i64 1
+ %tmp16929 = getelementptr inbounds float* %tmp16928, i64 1
+ %tmp16930 = getelementptr inbounds float* %tmp16929, i64 1
+ %tmp16931 = getelementptr inbounds float* %tmp16930, i64 1
+ %tmp16932 = getelementptr inbounds float* %tmp16931, i64 1
+ %tmp16933 = getelementptr inbounds float* %tmp16932, i64 1
+ %tmp16934 = getelementptr inbounds float* %tmp16933, i64 1
+ %tmp16935 = getelementptr inbounds float* %tmp16934, i64 1
+ %tmp16936 = getelementptr inbounds float* %tmp16935, i64 1
+ %tmp16937 = getelementptr inbounds float* %tmp16936, i64 1
+ %tmp16938 = getelementptr inbounds float* %tmp16937, i64 1
+ %tmp16939 = getelementptr inbounds float* %tmp16938, i64 1
+ %tmp16940 = getelementptr inbounds float* %tmp16939, i64 1
+ %tmp16941 = getelementptr inbounds float* %tmp16940, i64 1
+ %tmp16942 = getelementptr inbounds float* %tmp16941, i64 1
+ %tmp16943 = getelementptr inbounds float* %tmp16942, i64 1
+ %tmp16944 = getelementptr inbounds float* %tmp16943, i64 1
+ %tmp16945 = getelementptr inbounds float* %tmp16944, i64 1
+ %tmp16946 = getelementptr inbounds float* %tmp16945, i64 1
+ %tmp16947 = getelementptr inbounds float* %tmp16946, i64 1
+ %tmp16948 = getelementptr inbounds float* %tmp16947, i64 1
+ %tmp16949 = getelementptr inbounds float* %tmp16948, i64 1
+ %tmp16950 = getelementptr inbounds float* %tmp16949, i64 1
+ %tmp16951 = getelementptr inbounds float* %tmp16950, i64 1
+ %tmp16952 = getelementptr inbounds float* %tmp16951, i64 1
+ %tmp16953 = getelementptr inbounds float* %tmp16952, i64 1
+ %tmp16954 = getelementptr inbounds float* %tmp16953, i64 1
+ %tmp16955 = getelementptr inbounds float* %tmp16954, i64 1
+ %tmp16956 = getelementptr inbounds float* %tmp16955, i64 1
+ %tmp16957 = getelementptr inbounds float* %tmp16956, i64 1
+ %tmp16958 = getelementptr inbounds float* %tmp16957, i64 1
+ %tmp16959 = getelementptr inbounds float* %tmp16958, i64 1
+ %tmp16960 = getelementptr inbounds float* %tmp16959, i64 1
+ %tmp16961 = getelementptr inbounds float* %tmp16960, i64 1
+ %tmp16962 = getelementptr inbounds float* %tmp16961, i64 1
+ %tmp16963 = getelementptr inbounds float* %tmp16962, i64 1
+ %tmp16964 = getelementptr inbounds float* %tmp16963, i64 1
+ %tmp16965 = getelementptr inbounds float* %tmp16964, i64 1
+ %tmp16966 = getelementptr inbounds float* %tmp16965, i64 1
+ %tmp16967 = getelementptr inbounds float* %tmp16966, i64 1
+ %tmp16968 = getelementptr inbounds float* %tmp16967, i64 1
+ %tmp16969 = getelementptr inbounds float* %tmp16968, i64 1
+ %tmp16970 = getelementptr inbounds float* %tmp16969, i64 1
+ %tmp16971 = getelementptr inbounds float* %tmp16970, i64 1
+ %tmp16972 = getelementptr inbounds float* %tmp16971, i64 1
+ %tmp16973 = getelementptr inbounds float* %tmp16972, i64 1
+ %tmp16974 = getelementptr inbounds float* %tmp16973, i64 1
+ %tmp16975 = getelementptr inbounds float* %tmp16974, i64 1
+ %tmp16976 = getelementptr inbounds float* %tmp16975, i64 1
+ %tmp16977 = getelementptr inbounds float* %tmp16976, i64 1
+ %tmp16978 = getelementptr inbounds float* %tmp16977, i64 1
+ %tmp16979 = getelementptr inbounds float* %tmp16978, i64 1
+ %tmp16980 = getelementptr inbounds float* %tmp16979, i64 1
+ %tmp16981 = getelementptr inbounds float* %tmp16980, i64 1
+ %tmp16982 = getelementptr inbounds float* %tmp16981, i64 1
+ %tmp16983 = getelementptr inbounds float* %tmp16982, i64 1
+ %tmp16984 = getelementptr inbounds float* %tmp16983, i64 1
+ %tmp16985 = getelementptr inbounds float* %tmp16984, i64 1
+ %tmp16986 = getelementptr inbounds float* %tmp16985, i64 1
+ %tmp16987 = getelementptr inbounds float* %tmp16986, i64 1
+ %tmp16988 = getelementptr inbounds float* %tmp16987, i64 1
+ %tmp16989 = getelementptr inbounds float* %tmp16988, i64 1
+ %tmp16990 = getelementptr inbounds float* %tmp16989, i64 1
+ %tmp16991 = getelementptr inbounds float* %tmp16990, i64 1
+ %tmp16992 = getelementptr inbounds float* %tmp16991, i64 1
+ %tmp16993 = getelementptr inbounds float* %tmp16992, i64 1
+ %tmp16994 = getelementptr inbounds float* %tmp16993, i64 1
+ %tmp16995 = getelementptr inbounds float* %tmp16994, i64 1
+ %tmp16996 = getelementptr inbounds float* %tmp16995, i64 1
+ %tmp16997 = getelementptr inbounds float* %tmp16996, i64 1
+ %tmp16998 = getelementptr inbounds float* %tmp16997, i64 1
+ %tmp16999 = getelementptr inbounds float* %tmp16998, i64 1
+ %tmp17000 = getelementptr inbounds float* %tmp16999, i64 1
+ %tmp17001 = getelementptr inbounds float* %tmp17000, i64 1
+ %tmp17002 = getelementptr inbounds float* %tmp17001, i64 1
+ %tmp17003 = getelementptr inbounds float* %tmp17002, i64 1
+ %tmp17004 = getelementptr inbounds float* %tmp17003, i64 1
+ %tmp17005 = getelementptr inbounds float* %tmp17004, i64 1
+ %tmp17006 = getelementptr inbounds float* %tmp17005, i64 1
+ %tmp17007 = getelementptr inbounds float* %tmp17006, i64 1
+ %tmp17008 = getelementptr inbounds float* %tmp17007, i64 1
+ %tmp17009 = getelementptr inbounds float* %tmp17008, i64 1
+ %tmp17010 = getelementptr inbounds float* %tmp17009, i64 1
+ %tmp17011 = getelementptr inbounds float* %tmp17010, i64 1
+ %tmp17012 = getelementptr inbounds float* %tmp17011, i64 1
+ %tmp17013 = getelementptr inbounds float* %tmp17012, i64 1
+ %tmp17014 = getelementptr inbounds float* %tmp17013, i64 1
+ %tmp17015 = getelementptr inbounds float* %tmp17014, i64 1
+ %tmp17016 = getelementptr inbounds float* %tmp17015, i64 1
+ %tmp17017 = getelementptr inbounds float* %tmp17016, i64 1
+ %tmp17018 = getelementptr inbounds float* %tmp17017, i64 1
+ %tmp17019 = getelementptr inbounds float* %tmp17018, i64 1
+ %tmp17020 = getelementptr inbounds float* %tmp17019, i64 1
+ %tmp17021 = getelementptr inbounds float* %tmp17020, i64 1
+ %tmp17022 = getelementptr inbounds float* %tmp17021, i64 1
+ %tmp17023 = getelementptr inbounds float* %tmp17022, i64 1
+ %tmp17024 = getelementptr inbounds float* %tmp17023, i64 1
+ %tmp17025 = getelementptr inbounds float* %tmp17024, i64 1
+ %tmp17026 = getelementptr inbounds float* %tmp17025, i64 1
+ %tmp17027 = getelementptr inbounds float* %tmp17026, i64 1
+ %tmp17028 = getelementptr inbounds float* %tmp17027, i64 1
+ %tmp17029 = getelementptr inbounds float* %tmp17028, i64 1
+ %tmp17030 = getelementptr inbounds float* %tmp17029, i64 1
+ %tmp17031 = getelementptr inbounds float* %tmp17030, i64 1
+ %tmp17032 = getelementptr inbounds float* %tmp17031, i64 1
+ %tmp17033 = getelementptr inbounds float* %tmp17032, i64 1
+ %tmp17034 = getelementptr inbounds float* %tmp17033, i64 1
+ %tmp17035 = getelementptr inbounds float* %tmp17034, i64 1
+ %tmp17036 = getelementptr inbounds float* %tmp17035, i64 1
+ %tmp17037 = getelementptr inbounds float* %tmp17036, i64 1
+ %tmp17038 = getelementptr inbounds float* %tmp17037, i64 1
+ %tmp17039 = getelementptr inbounds float* %tmp17038, i64 1
+ %tmp17040 = getelementptr inbounds float* %tmp17039, i64 1
+ %tmp17041 = getelementptr inbounds float* %tmp17040, i64 1
+ %tmp17042 = getelementptr inbounds float* %tmp17041, i64 1
+ %tmp17043 = getelementptr inbounds float* %tmp17042, i64 1
+ %tmp17044 = getelementptr inbounds float* %tmp17043, i64 1
+ %tmp17045 = getelementptr inbounds float* %tmp17044, i64 1
+ %tmp17046 = getelementptr inbounds float* %tmp17045, i64 1
+ %tmp17047 = getelementptr inbounds float* %tmp17046, i64 1
+ %tmp17048 = getelementptr inbounds float* %tmp17047, i64 1
+ %tmp17049 = getelementptr inbounds float* %tmp17048, i64 1
+ %tmp17050 = getelementptr inbounds float* %tmp17049, i64 1
+ %tmp17051 = getelementptr inbounds float* %tmp17050, i64 1
+ %tmp17052 = getelementptr inbounds float* %tmp17051, i64 1
+ %tmp17053 = getelementptr inbounds float* %tmp17052, i64 1
+ %tmp17054 = getelementptr inbounds float* %tmp17053, i64 1
+ %tmp17055 = getelementptr inbounds float* %tmp17054, i64 1
+ %tmp17056 = getelementptr inbounds float* %tmp17055, i64 1
+ %tmp17057 = getelementptr inbounds float* %tmp17056, i64 1
+ %tmp17058 = getelementptr inbounds float* %tmp17057, i64 1
+ %tmp17059 = getelementptr inbounds float* %tmp17058, i64 1
+ %tmp17060 = getelementptr inbounds float* %tmp17059, i64 1
+ %tmp17061 = getelementptr inbounds float* %tmp17060, i64 1
+ %tmp17062 = getelementptr inbounds float* %tmp17061, i64 1
+ %tmp17063 = getelementptr inbounds float* %tmp17062, i64 1
+ %tmp17064 = getelementptr inbounds float* %tmp17063, i64 1
+ %tmp17065 = getelementptr inbounds float* %tmp17064, i64 1
+ %tmp17066 = getelementptr inbounds float* %tmp17065, i64 1
+ %tmp17067 = getelementptr inbounds float* %tmp17066, i64 1
+ %tmp17068 = getelementptr inbounds float* %tmp17067, i64 1
+ %tmp17069 = getelementptr inbounds float* %tmp17068, i64 1
+ %tmp17070 = getelementptr inbounds float* %tmp17069, i64 1
+ %tmp17071 = getelementptr inbounds float* %tmp17070, i64 1
+ %tmp17072 = getelementptr inbounds float* %tmp17071, i64 1
+ %tmp17073 = getelementptr inbounds float* %tmp17072, i64 1
+ %tmp17074 = getelementptr inbounds float* %tmp17073, i64 1
+ %tmp17075 = getelementptr inbounds float* %tmp17074, i64 1
+ %tmp17076 = getelementptr inbounds float* %tmp17075, i64 1
+ %tmp17077 = getelementptr inbounds float* %tmp17076, i64 1
+ %tmp17078 = getelementptr inbounds float* %tmp17077, i64 1
+ %tmp17079 = getelementptr inbounds float* %tmp17078, i64 1
+ %tmp17080 = getelementptr inbounds float* %tmp17079, i64 1
+ %tmp17081 = getelementptr inbounds float* %tmp17080, i64 1
+ %tmp17082 = getelementptr inbounds float* %tmp17081, i64 1
+ %tmp17083 = getelementptr inbounds float* %tmp17082, i64 1
+ %tmp17084 = getelementptr inbounds float* %tmp17083, i64 1
+ %tmp17085 = getelementptr inbounds float* %tmp17084, i64 1
+ %tmp17086 = getelementptr inbounds float* %tmp17085, i64 1
+ %tmp17087 = getelementptr inbounds float* %tmp17086, i64 1
+ %tmp17088 = getelementptr inbounds float* %tmp17087, i64 1
+ %tmp17089 = getelementptr inbounds float* %tmp17088, i64 1
+ %tmp17090 = getelementptr inbounds float* %tmp17089, i64 1
+ %tmp17091 = getelementptr inbounds float* %tmp17090, i64 1
+ %tmp17092 = getelementptr inbounds float* %tmp17091, i64 1
+ %tmp17093 = getelementptr inbounds float* %tmp17092, i64 1
+ %tmp17094 = getelementptr inbounds float* %tmp17093, i64 1
+ %tmp17095 = getelementptr inbounds float* %tmp17094, i64 1
+ %tmp17096 = getelementptr inbounds float* %tmp17095, i64 1
+ %tmp17097 = getelementptr inbounds float* %tmp17096, i64 1
+ %tmp17098 = getelementptr inbounds float* %tmp17097, i64 1
+ %tmp17099 = getelementptr inbounds float* %tmp17098, i64 1
+ %tmp17100 = getelementptr inbounds float* %tmp17099, i64 1
+ %tmp17101 = getelementptr inbounds float* %tmp17100, i64 1
+ %tmp17102 = getelementptr inbounds float* %tmp17101, i64 1
+ %tmp17103 = getelementptr inbounds float* %tmp17102, i64 1
+ %tmp17104 = getelementptr inbounds float* %tmp17103, i64 1
+ %tmp17105 = getelementptr inbounds float* %tmp17104, i64 1
+ %tmp17106 = getelementptr inbounds float* %tmp17105, i64 1
+ %tmp17107 = getelementptr inbounds float* %tmp17106, i64 1
+ %tmp17108 = getelementptr inbounds float* %tmp17107, i64 1
+ %tmp17109 = getelementptr inbounds float* %tmp17108, i64 1
+ %tmp17110 = getelementptr inbounds float* %tmp17109, i64 1
+ %tmp17111 = getelementptr inbounds float* %tmp17110, i64 1
+ %tmp17112 = getelementptr inbounds float* %tmp17111, i64 1
+ %tmp17113 = getelementptr inbounds float* %tmp17112, i64 1
+ %tmp17114 = getelementptr inbounds float* %tmp17113, i64 1
+ %tmp17115 = getelementptr inbounds float* %tmp17114, i64 1
+ %tmp17116 = getelementptr inbounds float* %tmp17115, i64 1
+ %tmp17117 = getelementptr inbounds float* %tmp17116, i64 1
+ %tmp17118 = getelementptr inbounds float* %tmp17117, i64 1
+ %tmp17119 = getelementptr inbounds float* %tmp17118, i64 1
+ %tmp17120 = getelementptr inbounds float* %tmp17119, i64 1
+ %tmp17121 = getelementptr inbounds float* %tmp17120, i64 1
+ %tmp17122 = getelementptr inbounds float* %tmp17121, i64 1
+ %tmp17123 = getelementptr inbounds float* %tmp17122, i64 1
+ %tmp17124 = getelementptr inbounds float* %tmp17123, i64 1
+ %tmp17125 = getelementptr inbounds float* %tmp17124, i64 1
+ %tmp17126 = getelementptr inbounds float* %tmp17125, i64 1
+ %tmp17127 = getelementptr inbounds float* %tmp17126, i64 1
+ %tmp17128 = getelementptr inbounds float* %tmp17127, i64 1
+ %tmp17129 = getelementptr inbounds float* %tmp17128, i64 1
+ %tmp17130 = getelementptr inbounds float* %tmp17129, i64 1
+ %tmp17131 = getelementptr inbounds float* %tmp17130, i64 1
+ %tmp17132 = getelementptr inbounds float* %tmp17131, i64 1
+ %tmp17133 = getelementptr inbounds float* %tmp17132, i64 1
+ %tmp17134 = getelementptr inbounds float* %tmp17133, i64 1
+ %tmp17135 = getelementptr inbounds float* %tmp17134, i64 1
+ %tmp17136 = getelementptr inbounds float* %tmp17135, i64 1
+ %tmp17137 = getelementptr inbounds float* %tmp17136, i64 1
+ %tmp17138 = getelementptr inbounds float* %tmp17137, i64 1
+ %tmp17139 = getelementptr inbounds float* %tmp17138, i64 1
+ %tmp17140 = getelementptr inbounds float* %tmp17139, i64 1
+ %tmp17141 = getelementptr inbounds float* %tmp17140, i64 1
+ %tmp17142 = getelementptr inbounds float* %tmp17141, i64 1
+ %tmp17143 = getelementptr inbounds float* %tmp17142, i64 1
+ %tmp17144 = getelementptr inbounds float* %tmp17143, i64 1
+ %tmp17145 = getelementptr inbounds float* %tmp17144, i64 1
+ %tmp17146 = getelementptr inbounds float* %tmp17145, i64 1
+ %tmp17147 = getelementptr inbounds float* %tmp17146, i64 1
+ %tmp17148 = getelementptr inbounds float* %tmp17147, i64 1
+ %tmp17149 = getelementptr inbounds float* %tmp17148, i64 1
+ %tmp17150 = getelementptr inbounds float* %tmp17149, i64 1
+ %tmp17151 = getelementptr inbounds float* %tmp17150, i64 1
+ %tmp17152 = getelementptr inbounds float* %tmp17151, i64 1
+ %tmp17153 = getelementptr inbounds float* %tmp17152, i64 1
+ %tmp17154 = getelementptr inbounds float* %tmp17153, i64 1
+ %tmp17155 = getelementptr inbounds float* %tmp17154, i64 1
+ %tmp17156 = getelementptr inbounds float* %tmp17155, i64 1
+ %tmp17157 = getelementptr inbounds float* %tmp17156, i64 1
+ %tmp17158 = getelementptr inbounds float* %tmp17157, i64 1
+ %tmp17159 = getelementptr inbounds float* %tmp17158, i64 1
+ %tmp17160 = getelementptr inbounds float* %tmp17159, i64 1
+ %tmp17161 = getelementptr inbounds float* %tmp17160, i64 1
+ %tmp17162 = getelementptr inbounds float* %tmp17161, i64 1
+ %tmp17163 = getelementptr inbounds float* %tmp17162, i64 1
+ %tmp17164 = getelementptr inbounds float* %tmp17163, i64 1
+ %tmp17165 = getelementptr inbounds float* %tmp17164, i64 1
+ %tmp17166 = getelementptr inbounds float* %tmp17165, i64 1
+ %tmp17167 = getelementptr inbounds float* %tmp17166, i64 1
+ %tmp17168 = getelementptr inbounds float* %tmp17167, i64 1
+ %tmp17169 = getelementptr inbounds float* %tmp17168, i64 1
+ %tmp17170 = getelementptr inbounds float* %tmp17169, i64 1
+ %tmp17171 = getelementptr inbounds float* %tmp17170, i64 1
+ %tmp17172 = getelementptr inbounds float* %tmp17171, i64 1
+ %tmp17173 = getelementptr inbounds float* %tmp17172, i64 1
+ %tmp17174 = getelementptr inbounds float* %tmp17173, i64 1
+ %tmp17175 = getelementptr inbounds float* %tmp17174, i64 1
+ %tmp17176 = getelementptr inbounds float* %tmp17175, i64 1
+ %tmp17177 = getelementptr inbounds float* %tmp17176, i64 1
+ %tmp17178 = getelementptr inbounds float* %tmp17177, i64 1
+ %tmp17179 = getelementptr inbounds float* %tmp17178, i64 1
+ %tmp17180 = getelementptr inbounds float* %tmp17179, i64 1
+ %tmp17181 = getelementptr inbounds float* %tmp17180, i64 1
+ %tmp17182 = getelementptr inbounds float* %tmp17181, i64 1
+ %tmp17183 = getelementptr inbounds float* %tmp17182, i64 1
+ %tmp17184 = getelementptr inbounds float* %tmp17183, i64 1
+ %tmp17185 = getelementptr inbounds float* %tmp17184, i64 1
+ %tmp17186 = getelementptr inbounds float* %tmp17185, i64 1
+ %tmp17187 = getelementptr inbounds float* %tmp17186, i64 1
+ %tmp17188 = getelementptr inbounds float* %tmp17187, i64 1
+ %tmp17189 = getelementptr inbounds float* %tmp17188, i64 1
+ %tmp17190 = getelementptr inbounds float* %tmp17189, i64 1
+ %tmp17191 = getelementptr inbounds float* %tmp17190, i64 1
+ %tmp17192 = getelementptr inbounds float* %tmp17191, i64 1
+ %tmp17193 = getelementptr inbounds float* %tmp17192, i64 1
+ %tmp17194 = getelementptr inbounds float* %tmp17193, i64 1
+ %tmp17195 = getelementptr inbounds float* %tmp17194, i64 1
+ %tmp17196 = getelementptr inbounds float* %tmp17195, i64 1
+ %tmp17197 = getelementptr inbounds float* %tmp17196, i64 1
+ %tmp17198 = getelementptr inbounds float* %tmp17197, i64 1
+ %tmp17199 = getelementptr inbounds float* %tmp17198, i64 1
+ %tmp17200 = getelementptr inbounds float* %tmp17199, i64 1
+ %tmp17201 = getelementptr inbounds float* %tmp17200, i64 1
+ %tmp17202 = getelementptr inbounds float* %tmp17201, i64 1
+ %tmp17203 = getelementptr inbounds float* %tmp17202, i64 1
+ %tmp17204 = getelementptr inbounds float* %tmp17203, i64 1
+ %tmp17205 = getelementptr inbounds float* %tmp17204, i64 1
+ %tmp17206 = getelementptr inbounds float* %tmp17205, i64 1
+ %tmp17207 = getelementptr inbounds float* %tmp17206, i64 1
+ %tmp17208 = getelementptr inbounds float* %tmp17207, i64 1
+ %tmp17209 = getelementptr inbounds float* %tmp17208, i64 1
+ %tmp17210 = getelementptr inbounds float* %tmp17209, i64 1
+ %tmp17211 = getelementptr inbounds float* %tmp17210, i64 1
+ %tmp17212 = getelementptr inbounds float* %tmp17211, i64 1
+ %tmp17213 = getelementptr inbounds float* %tmp17212, i64 1
+ %tmp17214 = getelementptr inbounds float* %tmp17213, i64 1
+ %tmp17215 = getelementptr inbounds float* %tmp17214, i64 1
+ %tmp17216 = getelementptr inbounds float* %tmp17215, i64 1
+ %tmp17217 = getelementptr inbounds float* %tmp17216, i64 1
+ %tmp17218 = getelementptr inbounds float* %tmp17217, i64 1
+ %tmp17219 = getelementptr inbounds float* %tmp17218, i64 1
+ %tmp17220 = getelementptr inbounds float* %tmp17219, i64 1
+ %tmp17221 = getelementptr inbounds float* %tmp17220, i64 1
+ %tmp17222 = getelementptr inbounds float* %tmp17221, i64 1
+ %tmp17223 = getelementptr inbounds float* %tmp17222, i64 1
+ %tmp17224 = getelementptr inbounds float* %tmp17223, i64 1
+ %tmp17225 = getelementptr inbounds float* %tmp17224, i64 1
+ %tmp17226 = getelementptr inbounds float* %tmp17225, i64 1
+ %tmp17227 = getelementptr inbounds float* %tmp17226, i64 1
+ %tmp17228 = getelementptr inbounds float* %tmp17227, i64 1
+ %tmp17229 = getelementptr inbounds float* %tmp17228, i64 1
+ %tmp17230 = getelementptr inbounds float* %tmp17229, i64 1
+ %tmp17231 = getelementptr inbounds float* %tmp17230, i64 1
+ %tmp17232 = getelementptr inbounds float* %tmp17231, i64 1
+ %tmp17233 = getelementptr inbounds float* %tmp17232, i64 1
+ %tmp17234 = getelementptr inbounds float* %tmp17233, i64 1
+ %tmp17235 = getelementptr inbounds float* %tmp17234, i64 1
+ %tmp17236 = getelementptr inbounds float* %tmp17235, i64 1
+ %tmp17237 = getelementptr inbounds float* %tmp17236, i64 1
+ %tmp17238 = getelementptr inbounds float* %tmp17237, i64 1
+ %tmp17239 = getelementptr inbounds float* %tmp17238, i64 1
+ %tmp17240 = getelementptr inbounds float* %tmp17239, i64 1
+ %tmp17241 = getelementptr inbounds float* %tmp17240, i64 1
+ %tmp17242 = getelementptr inbounds float* %tmp17241, i64 1
+ %tmp17243 = getelementptr inbounds float* %tmp17242, i64 1
+ %tmp17244 = getelementptr inbounds float* %tmp17243, i64 1
+ %tmp17245 = getelementptr inbounds float* %tmp17244, i64 1
+ %tmp17246 = getelementptr inbounds float* %tmp17245, i64 1
+ %tmp17247 = getelementptr inbounds float* %tmp17246, i64 1
+ %tmp17248 = getelementptr inbounds float* %tmp17247, i64 1
+ %tmp17249 = getelementptr inbounds float* %tmp17248, i64 1
+ %tmp17250 = getelementptr inbounds float* %tmp17249, i64 1
+ %tmp17251 = getelementptr inbounds float* %tmp17250, i64 1
+ %tmp17252 = getelementptr inbounds float* %tmp17251, i64 1
+ %tmp17253 = getelementptr inbounds float* %tmp17252, i64 1
+ %tmp17254 = getelementptr inbounds float* %tmp17253, i64 1
+ %tmp17255 = getelementptr inbounds float* %tmp17254, i64 1
+ %tmp17256 = getelementptr inbounds float* %tmp17255, i64 1
+ %tmp17257 = getelementptr inbounds float* %tmp17256, i64 1
+ %tmp17258 = getelementptr inbounds float* %tmp17257, i64 1
+ %tmp17259 = getelementptr inbounds float* %tmp17258, i64 1
+ %tmp17260 = getelementptr inbounds float* %tmp17259, i64 1
+ %tmp17261 = getelementptr inbounds float* %tmp17260, i64 1
+ %tmp17262 = getelementptr inbounds float* %tmp17261, i64 1
+ %tmp17263 = getelementptr inbounds float* %tmp17262, i64 1
+ %tmp17264 = getelementptr inbounds float* %tmp17263, i64 1
+ %tmp17265 = getelementptr inbounds float* %tmp17264, i64 1
+ %tmp17266 = getelementptr inbounds float* %tmp17265, i64 1
+ %tmp17267 = getelementptr inbounds float* %tmp17266, i64 1
+ %tmp17268 = getelementptr inbounds float* %tmp17267, i64 1
+ %tmp17269 = getelementptr inbounds float* %tmp17268, i64 1
+ %tmp17270 = getelementptr inbounds float* %tmp17269, i64 1
+ %tmp17271 = getelementptr inbounds float* %tmp17270, i64 1
+ %tmp17272 = getelementptr inbounds float* %tmp17271, i64 1
+ %tmp17273 = getelementptr inbounds float* %tmp17272, i64 1
+ %tmp17274 = getelementptr inbounds float* %tmp17273, i64 1
+ %tmp17275 = getelementptr inbounds float* %tmp17274, i64 1
+ %tmp17276 = getelementptr inbounds float* %tmp17275, i64 1
+ %tmp17277 = getelementptr inbounds float* %tmp17276, i64 1
+ %tmp17278 = getelementptr inbounds float* %tmp17277, i64 1
+ %tmp17279 = getelementptr inbounds float* %tmp17278, i64 1
+ %tmp17280 = getelementptr inbounds float* %tmp17279, i64 1
+ %tmp17281 = getelementptr inbounds float* %tmp17280, i64 1
+ %tmp17282 = getelementptr inbounds float* %tmp17281, i64 1
+ %tmp17283 = getelementptr inbounds float* %tmp17282, i64 1
+ %tmp17284 = getelementptr inbounds float* %tmp17283, i64 1
+ %tmp17285 = getelementptr inbounds float* %tmp17284, i64 1
+ %tmp17286 = getelementptr inbounds float* %tmp17285, i64 1
+ %tmp17287 = getelementptr inbounds float* %tmp17286, i64 1
+ %tmp17288 = getelementptr inbounds float* %tmp17287, i64 1
+ %tmp17289 = getelementptr inbounds float* %tmp17288, i64 1
+ %tmp17290 = getelementptr inbounds float* %tmp17289, i64 1
+ %tmp17291 = getelementptr inbounds float* %tmp17290, i64 1
+ %tmp17292 = getelementptr inbounds float* %tmp17291, i64 1
+ %tmp17293 = getelementptr inbounds float* %tmp17292, i64 1
+ %tmp17294 = getelementptr inbounds float* %tmp17293, i64 1
+ %tmp17295 = getelementptr inbounds float* %tmp17294, i64 1
+ %tmp17296 = getelementptr inbounds float* %tmp17295, i64 1
+ %tmp17297 = getelementptr inbounds float* %tmp17296, i64 1
+ %tmp17298 = getelementptr inbounds float* %tmp17297, i64 1
+ %tmp17299 = getelementptr inbounds float* %tmp17298, i64 1
+ %tmp17300 = getelementptr inbounds float* %tmp17299, i64 1
+ %tmp17301 = getelementptr inbounds float* %tmp17300, i64 1
+ %tmp17302 = getelementptr inbounds float* %tmp17301, i64 1
+ %tmp17303 = getelementptr inbounds float* %tmp17302, i64 1
+ %tmp17304 = getelementptr inbounds float* %tmp17303, i64 1
+ %tmp17305 = getelementptr inbounds float* %tmp17304, i64 1
+ %tmp17306 = getelementptr inbounds float* %tmp17305, i64 1
+ %tmp17307 = getelementptr inbounds float* %tmp17306, i64 1
+ %tmp17308 = getelementptr inbounds float* %tmp17307, i64 1
+ %tmp17309 = getelementptr inbounds float* %tmp17308, i64 1
+ %tmp17310 = getelementptr inbounds float* %tmp17309, i64 1
+ %tmp17311 = getelementptr inbounds float* %tmp17310, i64 1
+ %tmp17312 = getelementptr inbounds float* %tmp17311, i64 1
+ %tmp17313 = getelementptr inbounds float* %tmp17312, i64 1
+ %tmp17314 = getelementptr inbounds float* %tmp17313, i64 1
+ %tmp17315 = getelementptr inbounds float* %tmp17314, i64 1
+ %tmp17316 = getelementptr inbounds float* %tmp17315, i64 1
+ %tmp17317 = getelementptr inbounds float* %tmp17316, i64 1
+ %tmp17318 = getelementptr inbounds float* %tmp17317, i64 1
+ %tmp17319 = getelementptr inbounds float* %tmp17318, i64 1
+ %tmp17320 = getelementptr inbounds float* %tmp17319, i64 1
+ %tmp17321 = getelementptr inbounds float* %tmp17320, i64 1
+ %tmp17322 = getelementptr inbounds float* %tmp17321, i64 1
+ %tmp17323 = getelementptr inbounds float* %tmp17322, i64 1
+ %tmp17324 = getelementptr inbounds float* %tmp17323, i64 1
+ %tmp17325 = getelementptr inbounds float* %tmp17324, i64 1
+ %tmp17326 = getelementptr inbounds float* %tmp17325, i64 1
+ %tmp17327 = getelementptr inbounds float* %tmp17326, i64 1
+ %tmp17328 = getelementptr inbounds float* %tmp17327, i64 1
+ %tmp17329 = getelementptr inbounds float* %tmp17328, i64 1
+ %tmp17330 = getelementptr inbounds float* %tmp17329, i64 1
+ %tmp17331 = getelementptr inbounds float* %tmp17330, i64 1
+ %tmp17332 = getelementptr inbounds float* %tmp17331, i64 1
+ %tmp17333 = getelementptr inbounds float* %tmp17332, i64 1
+ %tmp17334 = getelementptr inbounds float* %tmp17333, i64 1
+ %tmp17335 = getelementptr inbounds float* %tmp17334, i64 1
+ %tmp17336 = getelementptr inbounds float* %tmp17335, i64 1
+ %tmp17337 = getelementptr inbounds float* %tmp17336, i64 1
+ %tmp17338 = getelementptr inbounds float* %tmp17337, i64 1
+ %tmp17339 = getelementptr inbounds float* %tmp17338, i64 1
+ %tmp17340 = getelementptr inbounds float* %tmp17339, i64 1
+ %tmp17341 = getelementptr inbounds float* %tmp17340, i64 1
+ %tmp17342 = getelementptr inbounds float* %tmp17341, i64 1
+ %tmp17343 = getelementptr inbounds float* %tmp17342, i64 1
+ %tmp17344 = getelementptr inbounds float* %tmp17343, i64 1
+ %tmp17345 = getelementptr inbounds float* %tmp17344, i64 1
+ %tmp17346 = getelementptr inbounds float* %tmp17345, i64 1
+ %tmp17347 = getelementptr inbounds float* %tmp17346, i64 1
+ %tmp17348 = getelementptr inbounds float* %tmp17347, i64 1
+ %tmp17349 = getelementptr inbounds float* %tmp17348, i64 1
+ %tmp17350 = getelementptr inbounds float* %tmp17349, i64 1
+ %tmp17351 = getelementptr inbounds float* %tmp17350, i64 1
+ %tmp17352 = getelementptr inbounds float* %tmp17351, i64 1
+ %tmp17353 = getelementptr inbounds float* %tmp17352, i64 1
+ %tmp17354 = getelementptr inbounds float* %tmp17353, i64 1
+ %tmp17355 = getelementptr inbounds float* %tmp17354, i64 1
+ %tmp17356 = getelementptr inbounds float* %tmp17355, i64 1
+ %tmp17357 = getelementptr inbounds float* %tmp17356, i64 1
+ %tmp17358 = getelementptr inbounds float* %tmp17357, i64 1
+ %tmp17359 = getelementptr inbounds float* %tmp17358, i64 1
+ %tmp17360 = getelementptr inbounds float* %tmp17359, i64 1
+ %tmp17361 = getelementptr inbounds float* %tmp17360, i64 1
+ %tmp17362 = getelementptr inbounds float* %tmp17361, i64 1
+ %tmp17363 = getelementptr inbounds float* %tmp17362, i64 1
+ %tmp17364 = getelementptr inbounds float* %tmp17363, i64 1
+ %tmp17365 = getelementptr inbounds float* %tmp17364, i64 1
+ %tmp17366 = getelementptr inbounds float* %tmp17365, i64 1
+ %tmp17367 = getelementptr inbounds float* %tmp17366, i64 1
+ %tmp17368 = getelementptr inbounds float* %tmp17367, i64 1
+ %tmp17369 = getelementptr inbounds float* %tmp17368, i64 1
+ %tmp17370 = getelementptr inbounds float* %tmp17369, i64 1
+ %tmp17371 = getelementptr inbounds float* %tmp17370, i64 1
+ %tmp17372 = getelementptr inbounds float* %tmp17371, i64 1
+ %tmp17373 = getelementptr inbounds float* %tmp17372, i64 1
+ %tmp17374 = getelementptr inbounds float* %tmp17373, i64 1
+ %tmp17375 = getelementptr inbounds float* %tmp17374, i64 1
+ %tmp17376 = getelementptr inbounds float* %tmp17375, i64 1
+ %tmp17377 = getelementptr inbounds float* %tmp17376, i64 1
+ %tmp17378 = getelementptr inbounds float* %tmp17377, i64 1
+ %tmp17379 = getelementptr inbounds float* %tmp17378, i64 1
+ %tmp17380 = getelementptr inbounds float* %tmp17379, i64 1
+ %tmp17381 = getelementptr inbounds float* %tmp17380, i64 1
+ %tmp17382 = getelementptr inbounds float* %tmp17381, i64 1
+ %tmp17383 = getelementptr inbounds float* %tmp17382, i64 1
+ %tmp17384 = getelementptr inbounds float* %tmp17383, i64 1
+ %tmp17385 = getelementptr inbounds float* %tmp17384, i64 1
+ %tmp17386 = getelementptr inbounds float* %tmp17385, i64 1
+ %tmp17387 = getelementptr inbounds float* %tmp17386, i64 1
+ %tmp17388 = getelementptr inbounds float* %tmp17387, i64 1
+ %tmp17389 = getelementptr inbounds float* %tmp17388, i64 1
+ %tmp17390 = getelementptr inbounds float* %tmp17389, i64 1
+ %tmp17391 = getelementptr inbounds float* %tmp17390, i64 1
+ %tmp17392 = getelementptr inbounds float* %tmp17391, i64 1
+ %tmp17393 = getelementptr inbounds float* %tmp17392, i64 1
+ %tmp17394 = getelementptr inbounds float* %tmp17393, i64 1
+ %tmp17395 = getelementptr inbounds float* %tmp17394, i64 1
+ %tmp17396 = getelementptr inbounds float* %tmp17395, i64 1
+ %tmp17397 = getelementptr inbounds float* %tmp17396, i64 1
+ %tmp17398 = getelementptr inbounds float* %tmp17397, i64 1
+ %tmp17399 = getelementptr inbounds float* %tmp17398, i64 1
+ %tmp17400 = getelementptr inbounds float* %tmp17399, i64 1
+ %tmp17401 = getelementptr inbounds float* %tmp17400, i64 1
+ %tmp17402 = getelementptr inbounds float* %tmp17401, i64 1
+ %tmp17403 = getelementptr inbounds float* %tmp17402, i64 1
+ %tmp17404 = getelementptr inbounds float* %tmp17403, i64 1
+ %tmp17405 = getelementptr inbounds float* %tmp17404, i64 1
+ %tmp17406 = getelementptr inbounds float* %tmp17405, i64 1
+ %tmp17407 = getelementptr inbounds float* %tmp17406, i64 1
+ %tmp17408 = getelementptr inbounds float* %tmp17407, i64 1
+ %tmp17409 = getelementptr inbounds float* %tmp17408, i64 1
+ %tmp17410 = getelementptr inbounds float* %tmp17409, i64 1
+ %tmp17411 = getelementptr inbounds float* %tmp17410, i64 1
+ %tmp17412 = getelementptr inbounds float* %tmp17411, i64 1
+ %tmp17413 = getelementptr inbounds float* %tmp17412, i64 1
+ %tmp17414 = getelementptr inbounds float* %tmp17413, i64 1
+ %tmp17415 = getelementptr inbounds float* %tmp17414, i64 1
+ %tmp17416 = getelementptr inbounds float* %tmp17415, i64 1
+ %tmp17417 = getelementptr inbounds float* %tmp17416, i64 1
+ %tmp17418 = getelementptr inbounds float* %tmp17417, i64 1
+ %tmp17419 = getelementptr inbounds float* %tmp17418, i64 1
+ %tmp17420 = getelementptr inbounds float* %tmp17419, i64 1
+ %tmp17421 = getelementptr inbounds float* %tmp17420, i64 1
+ %tmp17422 = getelementptr inbounds float* %tmp17421, i64 1
+ %tmp17423 = getelementptr inbounds float* %tmp17422, i64 1
+ %tmp17424 = getelementptr inbounds float* %tmp17423, i64 1
+ %tmp17425 = getelementptr inbounds float* %tmp17424, i64 1
+ %tmp17426 = getelementptr inbounds float* %tmp17425, i64 1
+ %tmp17427 = getelementptr inbounds float* %tmp17426, i64 1
+ %tmp17428 = getelementptr inbounds float* %tmp17427, i64 1
+ %tmp17429 = getelementptr inbounds float* %tmp17428, i64 1
+ %tmp17430 = getelementptr inbounds float* %tmp17429, i64 1
+ %tmp17431 = getelementptr inbounds float* %tmp17430, i64 1
+ %tmp17432 = getelementptr inbounds float* %tmp17431, i64 1
+ %tmp17433 = getelementptr inbounds float* %tmp17432, i64 1
+ %tmp17434 = getelementptr inbounds float* %tmp17433, i64 1
+ %tmp17435 = getelementptr inbounds float* %tmp17434, i64 1
+ %tmp17436 = getelementptr inbounds float* %tmp17435, i64 1
+ %tmp17437 = getelementptr inbounds float* %tmp17436, i64 1
+ %tmp17438 = getelementptr inbounds float* %tmp17437, i64 1
+ %tmp17439 = getelementptr inbounds float* %tmp17438, i64 1
+ %tmp17440 = getelementptr inbounds float* %tmp17439, i64 1
+ %tmp17441 = getelementptr inbounds float* %tmp17440, i64 1
+ %tmp17442 = getelementptr inbounds float* %tmp17441, i64 1
+ %tmp17443 = getelementptr inbounds float* %tmp17442, i64 1
+ %tmp17444 = getelementptr inbounds float* %tmp17443, i64 1
+ %tmp17445 = getelementptr inbounds float* %tmp17444, i64 1
+ %tmp17446 = getelementptr inbounds float* %tmp17445, i64 1
+ %tmp17447 = getelementptr inbounds float* %tmp17446, i64 1
+ %tmp17448 = getelementptr inbounds float* %tmp17447, i64 1
+ %tmp17449 = getelementptr inbounds float* %tmp17448, i64 1
+ %tmp17450 = getelementptr inbounds float* %tmp17449, i64 1
+ %tmp17451 = getelementptr inbounds float* %tmp17450, i64 1
+ %tmp17452 = getelementptr inbounds float* %tmp17451, i64 1
+ %tmp17453 = getelementptr inbounds float* %tmp17452, i64 1
+ %tmp17454 = getelementptr inbounds float* %tmp17453, i64 1
+ %tmp17455 = getelementptr inbounds float* %tmp17454, i64 1
+ %tmp17456 = getelementptr inbounds float* %tmp17455, i64 1
+ %tmp17457 = getelementptr inbounds float* %tmp17456, i64 1
+ %tmp17458 = getelementptr inbounds float* %tmp17457, i64 1
+ %tmp17459 = getelementptr inbounds float* %tmp17458, i64 1
+ %tmp17460 = getelementptr inbounds float* %tmp17459, i64 1
+ %tmp17461 = getelementptr inbounds float* %tmp17460, i64 1
+ %tmp17462 = getelementptr inbounds float* %tmp17461, i64 1
+ %tmp17463 = getelementptr inbounds float* %tmp17462, i64 1
+ %tmp17464 = getelementptr inbounds float* %tmp17463, i64 1
+ %tmp17465 = getelementptr inbounds float* %tmp17464, i64 1
+ %tmp17466 = getelementptr inbounds float* %tmp17465, i64 1
+ %tmp17467 = getelementptr inbounds float* %tmp17466, i64 1
+ %tmp17468 = getelementptr inbounds float* %tmp17467, i64 1
+ %tmp17469 = getelementptr inbounds float* %tmp17468, i64 1
+ %tmp17470 = getelementptr inbounds float* %tmp17469, i64 1
+ %tmp17471 = getelementptr inbounds float* %tmp17470, i64 1
+ %tmp17472 = getelementptr inbounds float* %tmp17471, i64 1
+ %tmp17473 = getelementptr inbounds float* %tmp17472, i64 1
+ %tmp17474 = getelementptr inbounds float* %tmp17473, i64 1
+ %tmp17475 = getelementptr inbounds float* %tmp17474, i64 1
+ %tmp17476 = getelementptr inbounds float* %tmp17475, i64 1
+ %tmp17477 = getelementptr inbounds float* %tmp17476, i64 1
+ %tmp17478 = getelementptr inbounds float* %tmp17477, i64 1
+ %tmp17479 = getelementptr inbounds float* %tmp17478, i64 1
+ %tmp17480 = getelementptr inbounds float* %tmp17479, i64 1
+ %tmp17481 = getelementptr inbounds float* %tmp17480, i64 1
+ %tmp17482 = getelementptr inbounds float* %tmp17481, i64 1
+ %tmp17483 = getelementptr inbounds float* %tmp17482, i64 1
+ %tmp17484 = getelementptr inbounds float* %tmp17483, i64 1
+ %tmp17485 = getelementptr inbounds float* %tmp17484, i64 1
+ %tmp17486 = getelementptr inbounds float* %tmp17485, i64 1
+ %tmp17487 = getelementptr inbounds float* %tmp17486, i64 1
+ %tmp17488 = getelementptr inbounds float* %tmp17487, i64 1
+ %tmp17489 = getelementptr inbounds float* %tmp17488, i64 1
+ %tmp17490 = getelementptr inbounds float* %tmp17489, i64 1
+ %tmp17491 = getelementptr inbounds float* %tmp17490, i64 1
+ %tmp17492 = getelementptr inbounds float* %tmp17491, i64 1
+ %tmp17493 = getelementptr inbounds float* %tmp17492, i64 1
+ %tmp17494 = getelementptr inbounds float* %tmp17493, i64 1
+ %tmp17495 = getelementptr inbounds float* %tmp17494, i64 1
+ %tmp17496 = getelementptr inbounds float* %tmp17495, i64 1
+ %tmp17497 = getelementptr inbounds float* %tmp17496, i64 1
+ %tmp17498 = getelementptr inbounds float* %tmp17497, i64 1
+ %tmp17499 = getelementptr inbounds float* %tmp17498, i64 1
+ %tmp17500 = getelementptr inbounds float* %tmp17499, i64 1
+ %tmp17501 = getelementptr inbounds float* %tmp17500, i64 1
+ %tmp17502 = getelementptr inbounds float* %tmp17501, i64 1
+ %tmp17503 = getelementptr inbounds float* %tmp17502, i64 1
+ %tmp17504 = getelementptr inbounds float* %tmp17503, i64 1
+ %tmp17505 = getelementptr inbounds float* %tmp17504, i64 1
+ %tmp17506 = getelementptr inbounds float* %tmp17505, i64 1
+ %tmp17507 = getelementptr inbounds float* %tmp17506, i64 1
+ %tmp17508 = getelementptr inbounds float* %tmp17507, i64 1
+ %tmp17509 = getelementptr inbounds float* %tmp17508, i64 1
+ %tmp17510 = getelementptr inbounds float* %tmp17509, i64 1
+ %tmp17511 = getelementptr inbounds float* %tmp17510, i64 1
+ %tmp17512 = getelementptr inbounds float* %tmp17511, i64 1
+ %tmp17513 = getelementptr inbounds float* %tmp17512, i64 1
+ %tmp17514 = getelementptr inbounds float* %tmp17513, i64 1
+ %tmp17515 = getelementptr inbounds float* %tmp17514, i64 1
+ %tmp17516 = getelementptr inbounds float* %tmp17515, i64 1
+ %tmp17517 = getelementptr inbounds float* %tmp17516, i64 1
+ %tmp17518 = getelementptr inbounds float* %tmp17517, i64 1
+ %tmp17519 = getelementptr inbounds float* %tmp17518, i64 1
+ %tmp17520 = getelementptr inbounds float* %tmp17519, i64 1
+ %tmp17521 = getelementptr inbounds float* %tmp17520, i64 1
+ %tmp17522 = getelementptr inbounds float* %tmp17521, i64 1
+ %tmp17523 = getelementptr inbounds float* %tmp17522, i64 1
+ %tmp17524 = getelementptr inbounds float* %tmp17523, i64 1
+ %tmp17525 = getelementptr inbounds float* %tmp17524, i64 1
+ %tmp17526 = getelementptr inbounds float* %tmp17525, i64 1
+ %tmp17527 = getelementptr inbounds float* %tmp17526, i64 1
+ %tmp17528 = getelementptr inbounds float* %tmp17527, i64 1
+ %tmp17529 = getelementptr inbounds float* %tmp17528, i64 1
+ %tmp17530 = getelementptr inbounds float* %tmp17529, i64 1
+ %tmp17531 = getelementptr inbounds float* %tmp17530, i64 1
+ %tmp17532 = getelementptr inbounds float* %tmp17531, i64 1
+ %tmp17533 = getelementptr inbounds float* %tmp17532, i64 1
+ %tmp17534 = getelementptr inbounds float* %tmp17533, i64 1
+ %tmp17535 = getelementptr inbounds float* %tmp17534, i64 1
+ %tmp17536 = getelementptr inbounds float* %tmp17535, i64 1
+ %tmp17537 = getelementptr inbounds float* %tmp17536, i64 1
+ %tmp17538 = getelementptr inbounds float* %tmp17537, i64 1
+ %tmp17539 = getelementptr inbounds float* %tmp17538, i64 1
+ %tmp17540 = getelementptr inbounds float* %tmp17539, i64 1
+ %tmp17541 = getelementptr inbounds float* %tmp17540, i64 1
+ %tmp17542 = getelementptr inbounds float* %tmp17541, i64 1
+ %tmp17543 = getelementptr inbounds float* %tmp17542, i64 1
+ %tmp17544 = getelementptr inbounds float* %tmp17543, i64 1
+ %tmp17545 = getelementptr inbounds float* %tmp17544, i64 1
+ %tmp17546 = getelementptr inbounds float* %tmp17545, i64 1
+ %tmp17547 = getelementptr inbounds float* %tmp17546, i64 1
+ %tmp17548 = getelementptr inbounds float* %tmp17547, i64 1
+ %tmp17549 = getelementptr inbounds float* %tmp17548, i64 1
+ %tmp17550 = getelementptr inbounds float* %tmp17549, i64 1
+ %tmp17551 = getelementptr inbounds float* %tmp17550, i64 1
+ %tmp17552 = getelementptr inbounds float* %tmp17551, i64 1
+ %tmp17553 = getelementptr inbounds float* %tmp17552, i64 1
+ %tmp17554 = getelementptr inbounds float* %tmp17553, i64 1
+ %tmp17555 = getelementptr inbounds float* %tmp17554, i64 1
+ %tmp17556 = getelementptr inbounds float* %tmp17555, i64 1
+ %tmp17557 = getelementptr inbounds float* %tmp17556, i64 1
+ %tmp17558 = getelementptr inbounds float* %tmp17557, i64 1
+ %tmp17559 = getelementptr inbounds float* %tmp17558, i64 1
+ %tmp17560 = getelementptr inbounds float* %tmp17559, i64 1
+ %tmp17561 = getelementptr inbounds float* %tmp17560, i64 1
+ %tmp17562 = getelementptr inbounds float* %tmp17561, i64 1
+ %tmp17563 = getelementptr inbounds float* %tmp17562, i64 1
+ %tmp17564 = getelementptr inbounds float* %tmp17563, i64 1
+ %tmp17565 = getelementptr inbounds float* %tmp17564, i64 1
+ %tmp17566 = getelementptr inbounds float* %tmp17565, i64 1
+ %tmp17567 = getelementptr inbounds float* %tmp17566, i64 1
+ %tmp17568 = getelementptr inbounds float* %tmp17567, i64 1
+ %tmp17569 = getelementptr inbounds float* %tmp17568, i64 1
+ %tmp17570 = getelementptr inbounds float* %tmp17569, i64 1
+ %tmp17571 = getelementptr inbounds float* %tmp17570, i64 1
+ %tmp17572 = getelementptr inbounds float* %tmp17571, i64 1
+ %tmp17573 = getelementptr inbounds float* %tmp17572, i64 1
+ %tmp17574 = getelementptr inbounds float* %tmp17573, i64 1
+ %tmp17575 = getelementptr inbounds float* %tmp17574, i64 1
+ %tmp17576 = getelementptr inbounds float* %tmp17575, i64 1
+ %tmp17577 = getelementptr inbounds float* %tmp17576, i64 1
+ %tmp17578 = getelementptr inbounds float* %tmp17577, i64 1
+ %tmp17579 = getelementptr inbounds float* %tmp17578, i64 1
+ %tmp17580 = getelementptr inbounds float* %tmp17579, i64 1
+ %tmp17581 = getelementptr inbounds float* %tmp17580, i64 1
+ %tmp17582 = getelementptr inbounds float* %tmp17581, i64 1
+ %tmp17583 = getelementptr inbounds float* %tmp17582, i64 1
+ %tmp17584 = getelementptr inbounds float* %tmp17583, i64 1
+ %tmp17585 = getelementptr inbounds float* %tmp17584, i64 1
+ %tmp17586 = getelementptr inbounds float* %tmp17585, i64 1
+ %tmp17587 = getelementptr inbounds float* %tmp17586, i64 1
+ %tmp17588 = getelementptr inbounds float* %tmp17587, i64 1
+ %tmp17589 = getelementptr inbounds float* %tmp17588, i64 1
+ %tmp17590 = getelementptr inbounds float* %tmp17589, i64 1
+ %tmp17591 = getelementptr inbounds float* %tmp17590, i64 1
+ %tmp17592 = getelementptr inbounds float* %tmp17591, i64 1
+ %tmp17593 = getelementptr inbounds float* %tmp17592, i64 1
+ %tmp17594 = getelementptr inbounds float* %tmp17593, i64 1
+ %tmp17595 = getelementptr inbounds float* %tmp17594, i64 1
+ %tmp17596 = getelementptr inbounds float* %tmp17595, i64 1
+ %tmp17597 = getelementptr inbounds float* %tmp17596, i64 1
+ %tmp17598 = getelementptr inbounds float* %tmp17597, i64 1
+ %tmp17599 = getelementptr inbounds float* %tmp17598, i64 1
+ %tmp17600 = getelementptr inbounds float* %tmp17599, i64 1
+ %tmp17601 = getelementptr inbounds float* %tmp17600, i64 1
+ %tmp17602 = getelementptr inbounds float* %tmp17601, i64 1
+ %tmp17603 = getelementptr inbounds float* %tmp17602, i64 1
+ %tmp17604 = getelementptr inbounds float* %tmp17603, i64 1
+ %tmp17605 = getelementptr inbounds float* %tmp17604, i64 1
+ %tmp17606 = getelementptr inbounds float* %tmp17605, i64 1
+ %tmp17607 = getelementptr inbounds float* %tmp17606, i64 1
+ %tmp17608 = getelementptr inbounds float* %tmp17607, i64 1
+ %tmp17609 = getelementptr inbounds float* %tmp17608, i64 1
+ %tmp17610 = getelementptr inbounds float* %tmp17609, i64 1
+ %tmp17611 = getelementptr inbounds float* %tmp17610, i64 1
+ %tmp17612 = getelementptr inbounds float* %tmp17611, i64 1
+ %tmp17613 = getelementptr inbounds float* %tmp17612, i64 1
+ %tmp17614 = getelementptr inbounds float* %tmp17613, i64 1
+ %tmp17615 = getelementptr inbounds float* %tmp17614, i64 1
+ %tmp17616 = getelementptr inbounds float* %tmp17615, i64 1
+ %tmp17617 = getelementptr inbounds float* %tmp17616, i64 1
+ %tmp17618 = getelementptr inbounds float* %tmp17617, i64 1
+ %tmp17619 = getelementptr inbounds float* %tmp17618, i64 1
+ %tmp17620 = getelementptr inbounds float* %tmp17619, i64 1
+ %tmp17621 = getelementptr inbounds float* %tmp17620, i64 1
+ %tmp17622 = getelementptr inbounds float* %tmp17621, i64 1
+ %tmp17623 = getelementptr inbounds float* %tmp17622, i64 1
+ %tmp17624 = getelementptr inbounds float* %tmp17623, i64 1
+ %tmp17625 = getelementptr inbounds float* %tmp17624, i64 1
+ %tmp17626 = getelementptr inbounds float* %tmp17625, i64 1
+ %tmp17627 = getelementptr inbounds float* %tmp17626, i64 1
+ %tmp17628 = getelementptr inbounds float* %tmp17627, i64 1
+ %tmp17629 = getelementptr inbounds float* %tmp17628, i64 1
+ %tmp17630 = getelementptr inbounds float* %tmp17629, i64 1
+ %tmp17631 = getelementptr inbounds float* %tmp17630, i64 1
+ %tmp17632 = getelementptr inbounds float* %tmp17631, i64 1
+ %tmp17633 = getelementptr inbounds float* %tmp17632, i64 1
+ %tmp17634 = getelementptr inbounds float* %tmp17633, i64 1
+ %tmp17635 = getelementptr inbounds float* %tmp17634, i64 1
+ %tmp17636 = getelementptr inbounds float* %tmp17635, i64 1
+ %tmp17637 = getelementptr inbounds float* %tmp17636, i64 1
+ %tmp17638 = getelementptr inbounds float* %tmp17637, i64 1
+ %tmp17639 = getelementptr inbounds float* %tmp17638, i64 1
+ %tmp17640 = getelementptr inbounds float* %tmp17639, i64 1
+ %tmp17641 = getelementptr inbounds float* %tmp17640, i64 1
+ %tmp17642 = getelementptr inbounds float* %tmp17641, i64 1
+ %tmp17643 = getelementptr inbounds float* %tmp17642, i64 1
+ %tmp17644 = getelementptr inbounds float* %tmp17643, i64 1
+ %tmp17645 = getelementptr inbounds float* %tmp17644, i64 1
+ %tmp17646 = getelementptr inbounds float* %tmp17645, i64 1
+ %tmp17647 = getelementptr inbounds float* %tmp17646, i64 1
+ %tmp17648 = getelementptr inbounds float* %tmp17647, i64 1
+ %tmp17649 = getelementptr inbounds float* %tmp17648, i64 1
+ %tmp17650 = getelementptr inbounds float* %tmp17649, i64 1
+ %tmp17651 = getelementptr inbounds float* %tmp17650, i64 1
+ %tmp17652 = getelementptr inbounds float* %tmp17651, i64 1
+ %tmp17653 = getelementptr inbounds float* %tmp17652, i64 1
+ %tmp17654 = getelementptr inbounds float* %tmp17653, i64 1
+ %tmp17655 = getelementptr inbounds float* %tmp17654, i64 1
+ %tmp17656 = getelementptr inbounds float* %tmp17655, i64 1
+ %tmp17657 = getelementptr inbounds float* %tmp17656, i64 1
+ %tmp17658 = getelementptr inbounds float* %tmp17657, i64 1
+ %tmp17659 = getelementptr inbounds float* %tmp17658, i64 1
+ %tmp17660 = getelementptr inbounds float* %tmp17659, i64 1
+ %tmp17661 = getelementptr inbounds float* %tmp17660, i64 1
+ %tmp17662 = getelementptr inbounds float* %tmp17661, i64 1
+ %tmp17663 = getelementptr inbounds float* %tmp17662, i64 1
+ %tmp17664 = getelementptr inbounds float* %tmp17663, i64 1
+ %tmp17665 = getelementptr inbounds float* %tmp17664, i64 1
+ %tmp17666 = getelementptr inbounds float* %tmp17665, i64 1
+ %tmp17667 = getelementptr inbounds float* %tmp17666, i64 1
+ %tmp17668 = getelementptr inbounds float* %tmp17667, i64 1
+ %tmp17669 = getelementptr inbounds float* %tmp17668, i64 1
+ %tmp17670 = getelementptr inbounds float* %tmp17669, i64 1
+ %tmp17671 = getelementptr inbounds float* %tmp17670, i64 1
+ %tmp17672 = getelementptr inbounds float* %tmp17671, i64 1
+ %tmp17673 = getelementptr inbounds float* %tmp17672, i64 1
+ %tmp17674 = getelementptr inbounds float* %tmp17673, i64 1
+ %tmp17675 = getelementptr inbounds float* %tmp17674, i64 1
+ %tmp17676 = getelementptr inbounds float* %tmp17675, i64 1
+ %tmp17677 = getelementptr inbounds float* %tmp17676, i64 1
+ %tmp17678 = getelementptr inbounds float* %tmp17677, i64 1
+ %tmp17679 = getelementptr inbounds float* %tmp17678, i64 1
+ %tmp17680 = getelementptr inbounds float* %tmp17679, i64 1
+ %tmp17681 = getelementptr inbounds float* %tmp17680, i64 1
+ %tmp17682 = getelementptr inbounds float* %tmp17681, i64 1
+ %tmp17683 = getelementptr inbounds float* %tmp17682, i64 1
+ %tmp17684 = getelementptr inbounds float* %tmp17683, i64 1
+ %tmp17685 = getelementptr inbounds float* %tmp17684, i64 1
+ %tmp17686 = getelementptr inbounds float* %tmp17685, i64 1
+ %tmp17687 = getelementptr inbounds float* %tmp17686, i64 1
+ %tmp17688 = getelementptr inbounds float* %tmp17687, i64 1
+ %tmp17689 = getelementptr inbounds float* %tmp17688, i64 1
+ %tmp17690 = getelementptr inbounds float* %tmp17689, i64 1
+ %tmp17691 = getelementptr inbounds float* %tmp17690, i64 1
+ %tmp17692 = getelementptr inbounds float* %tmp17691, i64 1
+ %tmp17693 = getelementptr inbounds float* %tmp17692, i64 1
+ %tmp17694 = getelementptr inbounds float* %tmp17693, i64 1
+ %tmp17695 = getelementptr inbounds float* %tmp17694, i64 1
+ %tmp17696 = getelementptr inbounds float* %tmp17695, i64 1
+ %tmp17697 = getelementptr inbounds float* %tmp17696, i64 1
+ %tmp17698 = getelementptr inbounds float* %tmp17697, i64 1
+ %tmp17699 = getelementptr inbounds float* %tmp17698, i64 1
+ %tmp17700 = getelementptr inbounds float* %tmp17699, i64 1
+ %tmp17701 = getelementptr inbounds float* %tmp17700, i64 1
+ %tmp17702 = getelementptr inbounds float* %tmp17701, i64 1
+ %tmp17703 = getelementptr inbounds float* %tmp17702, i64 1
+ %tmp17704 = getelementptr inbounds float* %tmp17703, i64 1
+ %tmp17705 = getelementptr inbounds float* %tmp17704, i64 1
+ %tmp17706 = getelementptr inbounds float* %tmp17705, i64 1
+ %tmp17707 = getelementptr inbounds float* %tmp17706, i64 1
+ %tmp17708 = getelementptr inbounds float* %tmp17707, i64 1
+ %tmp17709 = getelementptr inbounds float* %tmp17708, i64 1
+ %tmp17710 = getelementptr inbounds float* %tmp17709, i64 1
+ %tmp17711 = getelementptr inbounds float* %tmp17710, i64 1
+ %tmp17712 = getelementptr inbounds float* %tmp17711, i64 1
+ %tmp17713 = getelementptr inbounds float* %tmp17712, i64 1
+ %tmp17714 = getelementptr inbounds float* %tmp17713, i64 1
+ %tmp17715 = getelementptr inbounds float* %tmp17714, i64 1
+ %tmp17716 = getelementptr inbounds float* %tmp17715, i64 1
+ %tmp17717 = getelementptr inbounds float* %tmp17716, i64 1
+ %tmp17718 = getelementptr inbounds float* %tmp17717, i64 1
+ %tmp17719 = getelementptr inbounds float* %tmp17718, i64 1
+ %tmp17720 = getelementptr inbounds float* %tmp17719, i64 1
+ %tmp17721 = getelementptr inbounds float* %tmp17720, i64 1
+ %tmp17722 = getelementptr inbounds float* %tmp17721, i64 1
+ %tmp17723 = getelementptr inbounds float* %tmp17722, i64 1
+ %tmp17724 = getelementptr inbounds float* %tmp17723, i64 1
+ %tmp17725 = getelementptr inbounds float* %tmp17724, i64 1
+ %tmp17726 = getelementptr inbounds float* %tmp17725, i64 1
+ %tmp17727 = getelementptr inbounds float* %tmp17726, i64 1
+ %tmp17728 = getelementptr inbounds float* %tmp17727, i64 1
+ %tmp17729 = getelementptr inbounds float* %tmp17728, i64 1
+ %tmp17730 = getelementptr inbounds float* %tmp17729, i64 1
+ %tmp17731 = getelementptr inbounds float* %tmp17730, i64 1
+ %tmp17732 = getelementptr inbounds float* %tmp17731, i64 1
+ %tmp17733 = getelementptr inbounds float* %tmp17732, i64 1
+ %tmp17734 = getelementptr inbounds float* %tmp17733, i64 1
+ %tmp17735 = getelementptr inbounds float* %tmp17734, i64 1
+ %tmp17736 = getelementptr inbounds float* %tmp17735, i64 1
+ %tmp17737 = getelementptr inbounds float* %tmp17736, i64 1
+ %tmp17738 = getelementptr inbounds float* %tmp17737, i64 1
+ %tmp17739 = getelementptr inbounds float* %tmp17738, i64 1
+ %tmp17740 = getelementptr inbounds float* %tmp17739, i64 1
+ %tmp17741 = getelementptr inbounds float* %tmp17740, i64 1
+ %tmp17742 = getelementptr inbounds float* %tmp17741, i64 1
+ %tmp17743 = getelementptr inbounds float* %tmp17742, i64 1
+ %tmp17744 = getelementptr inbounds float* %tmp17743, i64 1
+ %tmp17745 = getelementptr inbounds float* %tmp17744, i64 1
+ %tmp17746 = getelementptr inbounds float* %tmp17745, i64 1
+ %tmp17747 = getelementptr inbounds float* %tmp17746, i64 1
+ %tmp17748 = getelementptr inbounds float* %tmp17747, i64 1
+ %tmp17749 = getelementptr inbounds float* %tmp17748, i64 1
+ %tmp17750 = getelementptr inbounds float* %tmp17749, i64 1
+ %tmp17751 = getelementptr inbounds float* %tmp17750, i64 1
+ %tmp17752 = getelementptr inbounds float* %tmp17751, i64 1
+ %tmp17753 = getelementptr inbounds float* %tmp17752, i64 1
+ %tmp17754 = getelementptr inbounds float* %tmp17753, i64 1
+ %tmp17755 = getelementptr inbounds float* %tmp17754, i64 1
+ %tmp17756 = getelementptr inbounds float* %tmp17755, i64 1
+ %tmp17757 = getelementptr inbounds float* %tmp17756, i64 1
+ %tmp17758 = getelementptr inbounds float* %tmp17757, i64 1
+ %tmp17759 = getelementptr inbounds float* %tmp17758, i64 1
+ %tmp17760 = getelementptr inbounds float* %tmp17759, i64 1
+ %tmp17761 = getelementptr inbounds float* %tmp17760, i64 1
+ %tmp17762 = getelementptr inbounds float* %tmp17761, i64 1
+ %tmp17763 = getelementptr inbounds float* %tmp17762, i64 1
+ %tmp17764 = getelementptr inbounds float* %tmp17763, i64 1
+ %tmp17765 = getelementptr inbounds float* %tmp17764, i64 1
+ %tmp17766 = getelementptr inbounds float* %tmp17765, i64 1
+ %tmp17767 = getelementptr inbounds float* %tmp17766, i64 1
+ %tmp17768 = getelementptr inbounds float* %tmp17767, i64 1
+ %tmp17769 = getelementptr inbounds float* %tmp17768, i64 1
+ %tmp17770 = getelementptr inbounds float* %tmp17769, i64 1
+ %tmp17771 = getelementptr inbounds float* %tmp17770, i64 1
+ %tmp17772 = getelementptr inbounds float* %tmp17771, i64 1
+ %tmp17773 = getelementptr inbounds float* %tmp17772, i64 1
+ %tmp17774 = getelementptr inbounds float* %tmp17773, i64 1
+ %tmp17775 = getelementptr inbounds float* %tmp17774, i64 1
+ %tmp17776 = getelementptr inbounds float* %tmp17775, i64 1
+ %tmp17777 = getelementptr inbounds float* %tmp17776, i64 1
+ %tmp17778 = getelementptr inbounds float* %tmp17777, i64 1
+ %tmp17779 = getelementptr inbounds float* %tmp17778, i64 1
+ %tmp17780 = getelementptr inbounds float* %tmp17779, i64 1
+ %tmp17781 = getelementptr inbounds float* %tmp17780, i64 1
+ %tmp17782 = getelementptr inbounds float* %tmp17781, i64 1
+ %tmp17783 = getelementptr inbounds float* %tmp17782, i64 1
+ %tmp17784 = getelementptr inbounds float* %tmp17783, i64 1
+ %tmp17785 = getelementptr inbounds float* %tmp17784, i64 1
+ %tmp17786 = getelementptr inbounds float* %tmp17785, i64 1
+ %tmp17787 = getelementptr inbounds float* %tmp17786, i64 1
+ %tmp17788 = getelementptr inbounds float* %tmp17787, i64 1
+ %tmp17789 = getelementptr inbounds float* %tmp17788, i64 1
+ %tmp17790 = getelementptr inbounds float* %tmp17789, i64 1
+ %tmp17791 = getelementptr inbounds float* %tmp17790, i64 1
+ %tmp17792 = getelementptr inbounds float* %tmp17791, i64 1
+ %tmp17793 = getelementptr inbounds float* %tmp17792, i64 1
+ %tmp17794 = getelementptr inbounds float* %tmp17793, i64 1
+ %tmp17795 = getelementptr inbounds float* %tmp17794, i64 1
+ %tmp17796 = getelementptr inbounds float* %tmp17795, i64 1
+ %tmp17797 = getelementptr inbounds float* %tmp17796, i64 1
+ %tmp17798 = getelementptr inbounds float* %tmp17797, i64 1
+ %tmp17799 = getelementptr inbounds float* %tmp17798, i64 1
+ %tmp17800 = getelementptr inbounds float* %tmp17799, i64 1
+ %tmp17801 = getelementptr inbounds float* %tmp17800, i64 1
+ %tmp17802 = getelementptr inbounds float* %tmp17801, i64 1
+ %tmp17803 = getelementptr inbounds float* %tmp17802, i64 1
+ %tmp17804 = getelementptr inbounds float* %tmp17803, i64 1
+ %tmp17805 = getelementptr inbounds float* %tmp17804, i64 1
+ %tmp17806 = getelementptr inbounds float* %tmp17805, i64 1
+ %tmp17807 = getelementptr inbounds float* %tmp17806, i64 1
+ %tmp17808 = getelementptr inbounds float* %tmp17807, i64 1
+ %tmp17809 = getelementptr inbounds float* %tmp17808, i64 1
+ %tmp17810 = getelementptr inbounds float* %tmp17809, i64 1
+ %tmp17811 = getelementptr inbounds float* %tmp17810, i64 1
+ %tmp17812 = getelementptr inbounds float* %tmp17811, i64 1
+ %tmp17813 = getelementptr inbounds float* %tmp17812, i64 1
+ %tmp17814 = getelementptr inbounds float* %tmp17813, i64 1
+ %tmp17815 = getelementptr inbounds float* %tmp17814, i64 1
+ %tmp17816 = getelementptr inbounds float* %tmp17815, i64 1
+ %tmp17817 = getelementptr inbounds float* %tmp17816, i64 1
+ %tmp17818 = getelementptr inbounds float* %tmp17817, i64 1
+ %tmp17819 = getelementptr inbounds float* %tmp17818, i64 1
+ %tmp17820 = getelementptr inbounds float* %tmp17819, i64 1
+ %tmp17821 = getelementptr inbounds float* %tmp17820, i64 1
+ %tmp17822 = getelementptr inbounds float* %tmp17821, i64 1
+ %tmp17823 = getelementptr inbounds float* %tmp17822, i64 1
+ %tmp17824 = getelementptr inbounds float* %tmp17823, i64 1
+ %tmp17825 = getelementptr inbounds float* %tmp17824, i64 1
+ %tmp17826 = getelementptr inbounds float* %tmp17825, i64 1
+ %tmp17827 = getelementptr inbounds float* %tmp17826, i64 1
+ %tmp17828 = getelementptr inbounds float* %tmp17827, i64 1
+ %tmp17829 = getelementptr inbounds float* %tmp17828, i64 1
+ %tmp17830 = getelementptr inbounds float* %tmp17829, i64 1
+ %tmp17831 = getelementptr inbounds float* %tmp17830, i64 1
+ %tmp17832 = getelementptr inbounds float* %tmp17831, i64 1
+ %tmp17833 = getelementptr inbounds float* %tmp17832, i64 1
+ %tmp17834 = getelementptr inbounds float* %tmp17833, i64 1
+ %tmp17835 = getelementptr inbounds float* %tmp17834, i64 1
+ %tmp17836 = getelementptr inbounds float* %tmp17835, i64 1
+ %tmp17837 = getelementptr inbounds float* %tmp17836, i64 1
+ %tmp17838 = getelementptr inbounds float* %tmp17837, i64 1
+ %tmp17839 = getelementptr inbounds float* %tmp17838, i64 1
+ %tmp17840 = getelementptr inbounds float* %tmp17839, i64 1
+ %tmp17841 = getelementptr inbounds float* %tmp17840, i64 1
+ %tmp17842 = getelementptr inbounds float* %tmp17841, i64 1
+ %tmp17843 = getelementptr inbounds float* %tmp17842, i64 1
+ %tmp17844 = getelementptr inbounds float* %tmp17843, i64 1
+ %tmp17845 = getelementptr inbounds float* %tmp17844, i64 1
+ %tmp17846 = getelementptr inbounds float* %tmp17845, i64 1
+ %tmp17847 = getelementptr inbounds float* %tmp17846, i64 1
+ %tmp17848 = getelementptr inbounds float* %tmp17847, i64 1
+ %tmp17849 = getelementptr inbounds float* %tmp17848, i64 1
+ %tmp17850 = getelementptr inbounds float* %tmp17849, i64 1
+ %tmp17851 = getelementptr inbounds float* %tmp17850, i64 1
+ %tmp17852 = getelementptr inbounds float* %tmp17851, i64 1
+ %tmp17853 = getelementptr inbounds float* %tmp17852, i64 1
+ %tmp17854 = getelementptr inbounds float* %tmp17853, i64 1
+ %tmp17855 = getelementptr inbounds float* %tmp17854, i64 1
+ %tmp17856 = getelementptr inbounds float* %tmp17855, i64 1
+ %tmp17857 = getelementptr inbounds float* %tmp17856, i64 1
+ %tmp17858 = getelementptr inbounds float* %tmp17857, i64 1
+ %tmp17859 = getelementptr inbounds float* %tmp17858, i64 1
+ %tmp17860 = getelementptr inbounds float* %tmp17859, i64 1
+ %tmp17861 = getelementptr inbounds float* %tmp17860, i64 1
+ %tmp17862 = getelementptr inbounds float* %tmp17861, i64 1
+ %tmp17863 = getelementptr inbounds float* %tmp17862, i64 1
+ %tmp17864 = getelementptr inbounds float* %tmp17863, i64 1
+ %tmp17865 = getelementptr inbounds float* %tmp17864, i64 1
+ %tmp17866 = getelementptr inbounds float* %tmp17865, i64 1
+ %tmp17867 = getelementptr inbounds float* %tmp17866, i64 1
+ %tmp17868 = getelementptr inbounds float* %tmp17867, i64 1
+ %tmp17869 = getelementptr inbounds float* %tmp17868, i64 1
+ %tmp17870 = getelementptr inbounds float* %tmp17869, i64 1
+ %tmp17871 = getelementptr inbounds float* %tmp17870, i64 1
+ %tmp17872 = getelementptr inbounds float* %tmp17871, i64 1
+ %tmp17873 = getelementptr inbounds float* %tmp17872, i64 1
+ %tmp17874 = getelementptr inbounds float* %tmp17873, i64 1
+ %tmp17875 = getelementptr inbounds float* %tmp17874, i64 1
+ %tmp17876 = getelementptr inbounds float* %tmp17875, i64 1
+ %tmp17877 = getelementptr inbounds float* %tmp17876, i64 1
+ %tmp17878 = getelementptr inbounds float* %tmp17877, i64 1
+ %tmp17879 = getelementptr inbounds float* %tmp17878, i64 1
+ %tmp17880 = getelementptr inbounds float* %tmp17879, i64 1
+ %tmp17881 = getelementptr inbounds float* %tmp17880, i64 1
+ %tmp17882 = getelementptr inbounds float* %tmp17881, i64 1
+ %tmp17883 = getelementptr inbounds float* %tmp17882, i64 1
+ %tmp17884 = getelementptr inbounds float* %tmp17883, i64 1
+ %tmp17885 = getelementptr inbounds float* %tmp17884, i64 1
+ %tmp17886 = getelementptr inbounds float* %tmp17885, i64 1
+ %tmp17887 = getelementptr inbounds float* %tmp17886, i64 1
+ %tmp17888 = getelementptr inbounds float* %tmp17887, i64 1
+ %tmp17889 = getelementptr inbounds float* %tmp17888, i64 1
+ %tmp17890 = getelementptr inbounds float* %tmp17889, i64 1
+ %tmp17891 = getelementptr inbounds float* %tmp17890, i64 1
+ %tmp17892 = getelementptr inbounds float* %tmp17891, i64 1
+ %tmp17893 = getelementptr inbounds float* %tmp17892, i64 1
+ %tmp17894 = getelementptr inbounds float* %tmp17893, i64 1
+ %tmp17895 = getelementptr inbounds float* %tmp17894, i64 1
+ %tmp17896 = getelementptr inbounds float* %tmp17895, i64 1
+ %tmp17897 = getelementptr inbounds float* %tmp17896, i64 1
+ %tmp17898 = getelementptr inbounds float* %tmp17897, i64 1
+ %tmp17899 = getelementptr inbounds float* %tmp17898, i64 1
+ %tmp17900 = getelementptr inbounds float* %tmp17899, i64 1
+ %tmp17901 = getelementptr inbounds float* %tmp17900, i64 1
+ %tmp17902 = getelementptr inbounds float* %tmp17901, i64 1
+ %tmp17903 = getelementptr inbounds float* %tmp17902, i64 1
+ %tmp17904 = getelementptr inbounds float* %tmp17903, i64 1
+ %tmp17905 = getelementptr inbounds float* %tmp17904, i64 1
+ %tmp17906 = getelementptr inbounds float* %tmp17905, i64 1
+ %tmp17907 = getelementptr inbounds float* %tmp17906, i64 1
+ %tmp17908 = getelementptr inbounds float* %tmp17907, i64 1
+ %tmp17909 = getelementptr inbounds float* %tmp17908, i64 1
+ %tmp17910 = getelementptr inbounds float* %tmp17909, i64 1
+ %tmp17911 = getelementptr inbounds float* %tmp17910, i64 1
+ %tmp17912 = getelementptr inbounds float* %tmp17911, i64 1
+ %tmp17913 = getelementptr inbounds float* %tmp17912, i64 1
+ %tmp17914 = getelementptr inbounds float* %tmp17913, i64 1
+ %tmp17915 = getelementptr inbounds float* %tmp17914, i64 1
+ %tmp17916 = getelementptr inbounds float* %tmp17915, i64 1
+ %tmp17917 = getelementptr inbounds float* %tmp17916, i64 1
+ %tmp17918 = getelementptr inbounds float* %tmp17917, i64 1
+ %tmp17919 = getelementptr inbounds float* %tmp17918, i64 1
+ %tmp17920 = getelementptr inbounds float* %tmp17919, i64 1
+ %tmp17921 = getelementptr inbounds float* %tmp17920, i64 1
+ %tmp17922 = getelementptr inbounds float* %tmp17921, i64 1
+ %tmp17923 = getelementptr inbounds float* %tmp17922, i64 1
+ %tmp17924 = getelementptr inbounds float* %tmp17923, i64 1
+ %tmp17925 = getelementptr inbounds float* %tmp17924, i64 1
+ %tmp17926 = getelementptr inbounds float* %tmp17925, i64 1
+ %tmp17927 = getelementptr inbounds float* %tmp17926, i64 1
+ %tmp17928 = getelementptr inbounds float* %tmp17927, i64 1
+ %tmp17929 = getelementptr inbounds float* %tmp17928, i64 1
+ %tmp17930 = getelementptr inbounds float* %tmp17929, i64 1
+ %tmp17931 = getelementptr inbounds float* %tmp17930, i64 1
+ %tmp17932 = getelementptr inbounds float* %tmp17931, i64 1
+ %tmp17933 = getelementptr inbounds float* %tmp17932, i64 1
+ %tmp17934 = getelementptr inbounds float* %tmp17933, i64 1
+ %tmp17935 = getelementptr inbounds float* %tmp17934, i64 1
+ %tmp17936 = getelementptr inbounds float* %tmp17935, i64 1
+ %tmp17937 = getelementptr inbounds float* %tmp17936, i64 1
+ %tmp17938 = getelementptr inbounds float* %tmp17937, i64 1
+ %tmp17939 = getelementptr inbounds float* %tmp17938, i64 1
+ %tmp17940 = getelementptr inbounds float* %tmp17939, i64 1
+ %tmp17941 = getelementptr inbounds float* %tmp17940, i64 1
+ %tmp17942 = getelementptr inbounds float* %tmp17941, i64 1
+ %tmp17943 = getelementptr inbounds float* %tmp17942, i64 1
+ %tmp17944 = getelementptr inbounds float* %tmp17943, i64 1
+ %tmp17945 = getelementptr inbounds float* %tmp17944, i64 1
+ %tmp17946 = getelementptr inbounds float* %tmp17945, i64 1
+ %tmp17947 = getelementptr inbounds float* %tmp17946, i64 1
+ %tmp17948 = getelementptr inbounds float* %tmp17947, i64 1
+ %tmp17949 = getelementptr inbounds float* %tmp17948, i64 1
+ %tmp17950 = getelementptr inbounds float* %tmp17949, i64 1
+ %tmp17951 = getelementptr inbounds float* %tmp17950, i64 1
+ %tmp17952 = getelementptr inbounds float* %tmp17951, i64 1
+ %tmp17953 = getelementptr inbounds float* %tmp17952, i64 1
+ %tmp17954 = getelementptr inbounds float* %tmp17953, i64 1
+ %tmp17955 = getelementptr inbounds float* %tmp17954, i64 1
+ %tmp17956 = getelementptr inbounds float* %tmp17955, i64 1
+ %tmp17957 = getelementptr inbounds float* %tmp17956, i64 1
+ %tmp17958 = getelementptr inbounds float* %tmp17957, i64 1
+ %tmp17959 = getelementptr inbounds float* %tmp17958, i64 1
+ %tmp17960 = getelementptr inbounds float* %tmp17959, i64 1
+ %tmp17961 = getelementptr inbounds float* %tmp17960, i64 1
+ %tmp17962 = getelementptr inbounds float* %tmp17961, i64 1
+ %tmp17963 = getelementptr inbounds float* %tmp17962, i64 1
+ %tmp17964 = getelementptr inbounds float* %tmp17963, i64 1
+ %tmp17965 = getelementptr inbounds float* %tmp17964, i64 1
+ %tmp17966 = getelementptr inbounds float* %tmp17965, i64 1
+ %tmp17967 = getelementptr inbounds float* %tmp17966, i64 1
+ %tmp17968 = getelementptr inbounds float* %tmp17967, i64 1
+ %tmp17969 = getelementptr inbounds float* %tmp17968, i64 1
+ %tmp17970 = getelementptr inbounds float* %tmp17969, i64 1
+ %tmp17971 = getelementptr inbounds float* %tmp17970, i64 1
+ %tmp17972 = getelementptr inbounds float* %tmp17971, i64 1
+ %tmp17973 = getelementptr inbounds float* %tmp17972, i64 1
+ %tmp17974 = getelementptr inbounds float* %tmp17973, i64 1
+ %tmp17975 = getelementptr inbounds float* %tmp17974, i64 1
+ %tmp17976 = getelementptr inbounds float* %tmp17975, i64 1
+ %tmp17977 = getelementptr inbounds float* %tmp17976, i64 1
+ %tmp17978 = getelementptr inbounds float* %tmp17977, i64 1
+ %tmp17979 = getelementptr inbounds float* %tmp17978, i64 1
+ %tmp17980 = getelementptr inbounds float* %tmp17979, i64 1
+ %tmp17981 = getelementptr inbounds float* %tmp17980, i64 1
+ %tmp17982 = getelementptr inbounds float* %tmp17981, i64 1
+ %tmp17983 = getelementptr inbounds float* %tmp17982, i64 1
+ %tmp17984 = getelementptr inbounds float* %tmp17983, i64 1
+ %tmp17985 = getelementptr inbounds float* %tmp17984, i64 1
+ %tmp17986 = getelementptr inbounds float* %tmp17985, i64 1
+ %tmp17987 = getelementptr inbounds float* %tmp17986, i64 1
+ %tmp17988 = getelementptr inbounds float* %tmp17987, i64 1
+ %tmp17989 = getelementptr inbounds float* %tmp17988, i64 1
+ %tmp17990 = getelementptr inbounds float* %tmp17989, i64 1
+ %tmp17991 = getelementptr inbounds float* %tmp17990, i64 1
+ %tmp17992 = getelementptr inbounds float* %tmp17991, i64 1
+ %tmp17993 = getelementptr inbounds float* %tmp17992, i64 1
+ %tmp17994 = getelementptr inbounds float* %tmp17993, i64 1
+ %tmp17995 = getelementptr inbounds float* %tmp17994, i64 1
+ %tmp17996 = getelementptr inbounds float* %tmp17995, i64 1
+ %tmp17997 = getelementptr inbounds float* %tmp17996, i64 1
+ %tmp17998 = getelementptr inbounds float* %tmp17997, i64 1
+ %tmp17999 = getelementptr inbounds float* %tmp17998, i64 1
+ %tmp18000 = getelementptr inbounds float* %tmp17999, i64 1
+ %tmp18001 = getelementptr inbounds float* %tmp18000, i64 1
+ %tmp18002 = getelementptr inbounds float* %tmp18001, i64 1
+ %tmp18003 = getelementptr inbounds float* %tmp18002, i64 1
+ %tmp18004 = getelementptr inbounds float* %tmp18003, i64 1
+ %tmp18005 = getelementptr inbounds float* %tmp18004, i64 1
+ %tmp18006 = getelementptr inbounds float* %tmp18005, i64 1
+ %tmp18007 = getelementptr inbounds float* %tmp18006, i64 1
+ %tmp18008 = getelementptr inbounds float* %tmp18007, i64 1
+ %tmp18009 = getelementptr inbounds float* %tmp18008, i64 1
+ %tmp18010 = getelementptr inbounds float* %tmp18009, i64 1
+ %tmp18011 = getelementptr inbounds float* %tmp18010, i64 1
+ %tmp18012 = getelementptr inbounds float* %tmp18011, i64 1
+ %tmp18013 = getelementptr inbounds float* %tmp18012, i64 1
+ %tmp18014 = getelementptr inbounds float* %tmp18013, i64 1
+ %tmp18015 = getelementptr inbounds float* %tmp18014, i64 1
+ %tmp18016 = getelementptr inbounds float* %tmp18015, i64 1
+ %tmp18017 = getelementptr inbounds float* %tmp18016, i64 1
+ %tmp18018 = getelementptr inbounds float* %tmp18017, i64 1
+ %tmp18019 = getelementptr inbounds float* %tmp18018, i64 1
+ %tmp18020 = getelementptr inbounds float* %tmp18019, i64 1
+ %tmp18021 = getelementptr inbounds float* %tmp18020, i64 1
+ %tmp18022 = getelementptr inbounds float* %tmp18021, i64 1
+ %tmp18023 = getelementptr inbounds float* %tmp18022, i64 1
+ %tmp18024 = getelementptr inbounds float* %tmp18023, i64 1
+ %tmp18025 = getelementptr inbounds float* %tmp18024, i64 1
+ %tmp18026 = getelementptr inbounds float* %tmp18025, i64 1
+ %tmp18027 = getelementptr inbounds float* %tmp18026, i64 1
+ %tmp18028 = getelementptr inbounds float* %tmp18027, i64 1
+ %tmp18029 = getelementptr inbounds float* %tmp18028, i64 1
+ %tmp18030 = getelementptr inbounds float* %tmp18029, i64 1
+ %tmp18031 = getelementptr inbounds float* %tmp18030, i64 1
+ %tmp18032 = getelementptr inbounds float* %tmp18031, i64 1
+ %tmp18033 = getelementptr inbounds float* %tmp18032, i64 1
+ %tmp18034 = getelementptr inbounds float* %tmp18033, i64 1
+ %tmp18035 = getelementptr inbounds float* %tmp18034, i64 1
+ %tmp18036 = getelementptr inbounds float* %tmp18035, i64 1
+ %tmp18037 = getelementptr inbounds float* %tmp18036, i64 1
+ %tmp18038 = getelementptr inbounds float* %tmp18037, i64 1
+ %tmp18039 = getelementptr inbounds float* %tmp18038, i64 1
+ %tmp18040 = getelementptr inbounds float* %tmp18039, i64 1
+ %tmp18041 = getelementptr inbounds float* %tmp18040, i64 1
+ %tmp18042 = getelementptr inbounds float* %tmp18041, i64 1
+ %tmp18043 = getelementptr inbounds float* %tmp18042, i64 1
+ %tmp18044 = getelementptr inbounds float* %tmp18043, i64 1
+ %tmp18045 = getelementptr inbounds float* %tmp18044, i64 1
+ %tmp18046 = getelementptr inbounds float* %tmp18045, i64 1
+ %tmp18047 = getelementptr inbounds float* %tmp18046, i64 1
+ %tmp18048 = getelementptr inbounds float* %tmp18047, i64 1
+ %tmp18049 = getelementptr inbounds float* %tmp18048, i64 1
+ %tmp18050 = getelementptr inbounds float* %tmp18049, i64 1
+ %tmp18051 = getelementptr inbounds float* %tmp18050, i64 1
+ %tmp18052 = getelementptr inbounds float* %tmp18051, i64 1
+ %tmp18053 = getelementptr inbounds float* %tmp18052, i64 1
+ %tmp18054 = getelementptr inbounds float* %tmp18053, i64 1
+ %tmp18055 = getelementptr inbounds float* %tmp18054, i64 1
+ %tmp18056 = getelementptr inbounds float* %tmp18055, i64 1
+ %tmp18057 = getelementptr inbounds float* %tmp18056, i64 1
+ %tmp18058 = getelementptr inbounds float* %tmp18057, i64 1
+ %tmp18059 = getelementptr inbounds float* %tmp18058, i64 1
+ %tmp18060 = getelementptr inbounds float* %tmp18059, i64 1
+ %tmp18061 = getelementptr inbounds float* %tmp18060, i64 1
+ %tmp18062 = getelementptr inbounds float* %tmp18061, i64 1
+ %tmp18063 = getelementptr inbounds float* %tmp18062, i64 1
+ %tmp18064 = getelementptr inbounds float* %tmp18063, i64 1
+ %tmp18065 = getelementptr inbounds float* %tmp18064, i64 1
+ %tmp18066 = getelementptr inbounds float* %tmp18065, i64 1
+ %tmp18067 = getelementptr inbounds float* %tmp18066, i64 1
+ %tmp18068 = getelementptr inbounds float* %tmp18067, i64 1
+ %tmp18069 = getelementptr inbounds float* %tmp18068, i64 1
+ %tmp18070 = getelementptr inbounds float* %tmp18069, i64 1
+ %tmp18071 = getelementptr inbounds float* %tmp18070, i64 1
+ %tmp18072 = getelementptr inbounds float* %tmp18071, i64 1
+ %tmp18073 = getelementptr inbounds float* %tmp18072, i64 1
+ %tmp18074 = getelementptr inbounds float* %tmp18073, i64 1
+ %tmp18075 = getelementptr inbounds float* %tmp18074, i64 1
+ %tmp18076 = getelementptr inbounds float* %tmp18075, i64 1
+ %tmp18077 = getelementptr inbounds float* %tmp18076, i64 1
+ %tmp18078 = getelementptr inbounds float* %tmp18077, i64 1
+ %tmp18079 = getelementptr inbounds float* %tmp18078, i64 1
+ %tmp18080 = getelementptr inbounds float* %tmp18079, i64 1
+ %tmp18081 = getelementptr inbounds float* %tmp18080, i64 1
+ %tmp18082 = getelementptr inbounds float* %tmp18081, i64 1
+ %tmp18083 = getelementptr inbounds float* %tmp18082, i64 1
+ %tmp18084 = getelementptr inbounds float* %tmp18083, i64 1
+ %tmp18085 = getelementptr inbounds float* %tmp18084, i64 1
+ %tmp18086 = getelementptr inbounds float* %tmp18085, i64 1
+ %tmp18087 = getelementptr inbounds float* %tmp18086, i64 1
+ %tmp18088 = getelementptr inbounds float* %tmp18087, i64 1
+ %tmp18089 = getelementptr inbounds float* %tmp18088, i64 1
+ %tmp18090 = getelementptr inbounds float* %tmp18089, i64 1
+ %tmp18091 = getelementptr inbounds float* %tmp18090, i64 1
+ %tmp18092 = getelementptr inbounds float* %tmp18091, i64 1
+ %tmp18093 = getelementptr inbounds float* %tmp18092, i64 1
+ %tmp18094 = getelementptr inbounds float* %tmp18093, i64 1
+ %tmp18095 = getelementptr inbounds float* %tmp18094, i64 1
+ %tmp18096 = getelementptr inbounds float* %tmp18095, i64 1
+ %tmp18097 = getelementptr inbounds float* %tmp18096, i64 1
+ %tmp18098 = getelementptr inbounds float* %tmp18097, i64 1
+ %tmp18099 = getelementptr inbounds float* %tmp18098, i64 1
+ %tmp18100 = getelementptr inbounds float* %tmp18099, i64 1
+ %tmp18101 = getelementptr inbounds float* %tmp18100, i64 1
+ %tmp18102 = getelementptr inbounds float* %tmp18101, i64 1
+ %tmp18103 = getelementptr inbounds float* %tmp18102, i64 1
+ %tmp18104 = getelementptr inbounds float* %tmp18103, i64 1
+ %tmp18105 = getelementptr inbounds float* %tmp18104, i64 1
+ %tmp18106 = getelementptr inbounds float* %tmp18105, i64 1
+ %tmp18107 = getelementptr inbounds float* %tmp18106, i64 1
+ %tmp18108 = getelementptr inbounds float* %tmp18107, i64 1
+ %tmp18109 = getelementptr inbounds float* %tmp18108, i64 1
+ %tmp18110 = getelementptr inbounds float* %tmp18109, i64 1
+ %tmp18111 = getelementptr inbounds float* %tmp18110, i64 1
+ %tmp18112 = getelementptr inbounds float* %tmp18111, i64 1
+ %tmp18113 = getelementptr inbounds float* %tmp18112, i64 1
+ %tmp18114 = getelementptr inbounds float* %tmp18113, i64 1
+ %tmp18115 = getelementptr inbounds float* %tmp18114, i64 1
+ %tmp18116 = getelementptr inbounds float* %tmp18115, i64 1
+ %tmp18117 = getelementptr inbounds float* %tmp18116, i64 1
+ %tmp18118 = getelementptr inbounds float* %tmp18117, i64 1
+ %tmp18119 = getelementptr inbounds float* %tmp18118, i64 1
+ %tmp18120 = getelementptr inbounds float* %tmp18119, i64 1
+ %tmp18121 = getelementptr inbounds float* %tmp18120, i64 1
+ %tmp18122 = getelementptr inbounds float* %tmp18121, i64 1
+ %tmp18123 = getelementptr inbounds float* %tmp18122, i64 1
+ %tmp18124 = getelementptr inbounds float* %tmp18123, i64 1
+ %tmp18125 = getelementptr inbounds float* %tmp18124, i64 1
+ %tmp18126 = getelementptr inbounds float* %tmp18125, i64 1
+ %tmp18127 = getelementptr inbounds float* %tmp18126, i64 1
+ %tmp18128 = getelementptr inbounds float* %tmp18127, i64 1
+ %tmp18129 = getelementptr inbounds float* %tmp18128, i64 1
+ %tmp18130 = getelementptr inbounds float* %tmp18129, i64 1
+ %tmp18131 = getelementptr inbounds float* %tmp18130, i64 1
+ %tmp18132 = getelementptr inbounds float* %tmp18131, i64 1
+ %tmp18133 = getelementptr inbounds float* %tmp18132, i64 1
+ %tmp18134 = getelementptr inbounds float* %tmp18133, i64 1
+ %tmp18135 = getelementptr inbounds float* %tmp18134, i64 1
+ %tmp18136 = getelementptr inbounds float* %tmp18135, i64 1
+ %tmp18137 = getelementptr inbounds float* %tmp18136, i64 1
+ %tmp18138 = getelementptr inbounds float* %tmp18137, i64 1
+ %tmp18139 = getelementptr inbounds float* %tmp18138, i64 1
+ %tmp18140 = getelementptr inbounds float* %tmp18139, i64 1
+ %tmp18141 = getelementptr inbounds float* %tmp18140, i64 1
+ %tmp18142 = getelementptr inbounds float* %tmp18141, i64 1
+ %tmp18143 = getelementptr inbounds float* %tmp18142, i64 1
+ %tmp18144 = getelementptr inbounds float* %tmp18143, i64 1
+ %tmp18145 = getelementptr inbounds float* %tmp18144, i64 1
+ %tmp18146 = getelementptr inbounds float* %tmp18145, i64 1
+ %tmp18147 = getelementptr inbounds float* %tmp18146, i64 1
+ %tmp18148 = getelementptr inbounds float* %tmp18147, i64 1
+ %tmp18149 = getelementptr inbounds float* %tmp18148, i64 1
+ %tmp18150 = getelementptr inbounds float* %tmp18149, i64 1
+ %tmp18151 = getelementptr inbounds float* %tmp18150, i64 1
+ %tmp18152 = getelementptr inbounds float* %tmp18151, i64 1
+ %tmp18153 = getelementptr inbounds float* %tmp18152, i64 1
+ %tmp18154 = getelementptr inbounds float* %tmp18153, i64 1
+ %tmp18155 = getelementptr inbounds float* %tmp18154, i64 1
+ %tmp18156 = getelementptr inbounds float* %tmp18155, i64 1
+ %tmp18157 = getelementptr inbounds float* %tmp18156, i64 1
+ %tmp18158 = getelementptr inbounds float* %tmp18157, i64 1
+ %tmp18159 = getelementptr inbounds float* %tmp18158, i64 1
+ %tmp18160 = getelementptr inbounds float* %tmp18159, i64 1
+ %tmp18161 = getelementptr inbounds float* %tmp18160, i64 1
+ %tmp18162 = getelementptr inbounds float* %tmp18161, i64 1
+ %tmp18163 = getelementptr inbounds float* %tmp18162, i64 1
+ %tmp18164 = getelementptr inbounds float* %tmp18163, i64 1
+ %tmp18165 = getelementptr inbounds float* %tmp18164, i64 1
+ %tmp18166 = getelementptr inbounds float* %tmp18165, i64 1
+ %tmp18167 = getelementptr inbounds float* %tmp18166, i64 1
+ %tmp18168 = getelementptr inbounds float* %tmp18167, i64 1
+ %tmp18169 = getelementptr inbounds float* %tmp18168, i64 1
+ %tmp18170 = getelementptr inbounds float* %tmp18169, i64 1
+ %tmp18171 = getelementptr inbounds float* %tmp18170, i64 1
+ %tmp18172 = getelementptr inbounds float* %tmp18171, i64 1
+ %tmp18173 = getelementptr inbounds float* %tmp18172, i64 1
+ %tmp18174 = getelementptr inbounds float* %tmp18173, i64 1
+ %tmp18175 = getelementptr inbounds float* %tmp18174, i64 1
+ %tmp18176 = getelementptr inbounds float* %tmp18175, i64 1
+ %tmp18177 = getelementptr inbounds float* %tmp18176, i64 1
+ %tmp18178 = getelementptr inbounds float* %tmp18177, i64 1
+ %tmp18179 = getelementptr inbounds float* %tmp18178, i64 1
+ %tmp18180 = getelementptr inbounds float* %tmp18179, i64 1
+ %tmp18181 = getelementptr inbounds float* %tmp18180, i64 1
+ %tmp18182 = getelementptr inbounds float* %tmp18181, i64 1
+ %tmp18183 = getelementptr inbounds float* %tmp18182, i64 1
+ %tmp18184 = getelementptr inbounds float* %tmp18183, i64 1
+ %tmp18185 = getelementptr inbounds float* %tmp18184, i64 1
+ %tmp18186 = getelementptr inbounds float* %tmp18185, i64 1
+ %tmp18187 = getelementptr inbounds float* %tmp18186, i64 1
+ %tmp18188 = getelementptr inbounds float* %tmp18187, i64 1
+ %tmp18189 = getelementptr inbounds float* %tmp18188, i64 1
+ %tmp18190 = getelementptr inbounds float* %tmp18189, i64 1
+ %tmp18191 = getelementptr inbounds float* %tmp18190, i64 1
+ %tmp18192 = getelementptr inbounds float* %tmp18191, i64 1
+ %tmp18193 = getelementptr inbounds float* %tmp18192, i64 1
+ %tmp18194 = getelementptr inbounds float* %tmp18193, i64 1
+ %tmp18195 = getelementptr inbounds float* %tmp18194, i64 1
+ %tmp18196 = getelementptr inbounds float* %tmp18195, i64 1
+ %tmp18197 = getelementptr inbounds float* %tmp18196, i64 1
+ %tmp18198 = getelementptr inbounds float* %tmp18197, i64 1
+ %tmp18199 = getelementptr inbounds float* %tmp18198, i64 1
+ %tmp18200 = getelementptr inbounds float* %tmp18199, i64 1
+ %tmp18201 = getelementptr inbounds float* %tmp18200, i64 1
+ %tmp18202 = getelementptr inbounds float* %tmp18201, i64 1
+ %tmp18203 = getelementptr inbounds float* %tmp18202, i64 1
+ %tmp18204 = getelementptr inbounds float* %tmp18203, i64 1
+ %tmp18205 = getelementptr inbounds float* %tmp18204, i64 1
+ %tmp18206 = getelementptr inbounds float* %tmp18205, i64 1
+ %tmp18207 = getelementptr inbounds float* %tmp18206, i64 1
+ %tmp18208 = getelementptr inbounds float* %tmp18207, i64 1
+ %tmp18209 = getelementptr inbounds float* %tmp18208, i64 1
+ %tmp18210 = getelementptr inbounds float* %tmp18209, i64 1
+ %tmp18211 = getelementptr inbounds float* %tmp18210, i64 1
+ %tmp18212 = getelementptr inbounds float* %tmp18211, i64 1
+ %tmp18213 = getelementptr inbounds float* %tmp18212, i64 1
+ %tmp18214 = getelementptr inbounds float* %tmp18213, i64 1
+ %tmp18215 = getelementptr inbounds float* %tmp18214, i64 1
+ %tmp18216 = getelementptr inbounds float* %tmp18215, i64 1
+ %tmp18217 = getelementptr inbounds float* %tmp18216, i64 1
+ %tmp18218 = getelementptr inbounds float* %tmp18217, i64 1
+ %tmp18219 = getelementptr inbounds float* %tmp18218, i64 1
+ %tmp18220 = getelementptr inbounds float* %tmp18219, i64 1
+ %tmp18221 = getelementptr inbounds float* %tmp18220, i64 1
+ %tmp18222 = getelementptr inbounds float* %tmp18221, i64 1
+ %tmp18223 = getelementptr inbounds float* %tmp18222, i64 1
+ %tmp18224 = getelementptr inbounds float* %tmp18223, i64 1
+ %tmp18225 = getelementptr inbounds float* %tmp18224, i64 1
+ %tmp18226 = getelementptr inbounds float* %tmp18225, i64 1
+ %tmp18227 = getelementptr inbounds float* %tmp18226, i64 1
+ %tmp18228 = getelementptr inbounds float* %tmp18227, i64 1
+ %tmp18229 = getelementptr inbounds float* %tmp18228, i64 1
+ %tmp18230 = getelementptr inbounds float* %tmp18229, i64 1
+ %tmp18231 = getelementptr inbounds float* %tmp18230, i64 1
+ %tmp18232 = getelementptr inbounds float* %tmp18231, i64 1
+ %tmp18233 = getelementptr inbounds float* %tmp18232, i64 1
+ %tmp18234 = getelementptr inbounds float* %tmp18233, i64 1
+ %tmp18235 = getelementptr inbounds float* %tmp18234, i64 1
+ %tmp18236 = getelementptr inbounds float* %tmp18235, i64 1
+ %tmp18237 = getelementptr inbounds float* %tmp18236, i64 1
+ %tmp18238 = getelementptr inbounds float* %tmp18237, i64 1
+ %tmp18239 = getelementptr inbounds float* %tmp18238, i64 1
+ %tmp18240 = getelementptr inbounds float* %tmp18239, i64 1
+ %tmp18241 = getelementptr inbounds float* %tmp18240, i64 1
+ %tmp18242 = getelementptr inbounds float* %tmp18241, i64 1
+ %tmp18243 = getelementptr inbounds float* %tmp18242, i64 1
+ %tmp18244 = getelementptr inbounds float* %tmp18243, i64 1
+ %tmp18245 = getelementptr inbounds float* %tmp18244, i64 1
+ %tmp18246 = getelementptr inbounds float* %tmp18245, i64 1
+ %tmp18247 = getelementptr inbounds float* %tmp18246, i64 1
+ %tmp18248 = getelementptr inbounds float* %tmp18247, i64 1
+ %tmp18249 = getelementptr inbounds float* %tmp18248, i64 1
+ %tmp18250 = getelementptr inbounds float* %tmp18249, i64 1
+ %tmp18251 = getelementptr inbounds float* %tmp18250, i64 1
+ %tmp18252 = getelementptr inbounds float* %tmp18251, i64 1
+ %tmp18253 = getelementptr inbounds float* %tmp18252, i64 1
+ %tmp18254 = getelementptr inbounds float* %tmp18253, i64 1
+ %tmp18255 = getelementptr inbounds float* %tmp18254, i64 1
+ %tmp18256 = getelementptr inbounds float* %tmp18255, i64 1
+ %tmp18257 = getelementptr inbounds float* %tmp18256, i64 1
+ %tmp18258 = getelementptr inbounds float* %tmp18257, i64 1
+ %tmp18259 = getelementptr inbounds float* %tmp18258, i64 1
+ %tmp18260 = getelementptr inbounds float* %tmp18259, i64 1
+ %tmp18261 = getelementptr inbounds float* %tmp18260, i64 1
+ %tmp18262 = getelementptr inbounds float* %tmp18261, i64 1
+ %tmp18263 = getelementptr inbounds float* %tmp18262, i64 1
+ %tmp18264 = getelementptr inbounds float* %tmp18263, i64 1
+ %tmp18265 = getelementptr inbounds float* %tmp18264, i64 1
+ %tmp18266 = getelementptr inbounds float* %tmp18265, i64 1
+ %tmp18267 = getelementptr inbounds float* %tmp18266, i64 1
+ %tmp18268 = getelementptr inbounds float* %tmp18267, i64 1
+ %tmp18269 = getelementptr inbounds float* %tmp18268, i64 1
+ %tmp18270 = getelementptr inbounds float* %tmp18269, i64 1
+ %tmp18271 = getelementptr inbounds float* %tmp18270, i64 1
+ %tmp18272 = getelementptr inbounds float* %tmp18271, i64 1
+ %tmp18273 = getelementptr inbounds float* %tmp18272, i64 1
+ %tmp18274 = getelementptr inbounds float* %tmp18273, i64 1
+ %tmp18275 = getelementptr inbounds float* %tmp18274, i64 1
+ %tmp18276 = getelementptr inbounds float* %tmp18275, i64 1
+ %tmp18277 = getelementptr inbounds float* %tmp18276, i64 1
+ %tmp18278 = getelementptr inbounds float* %tmp18277, i64 1
+ %tmp18279 = getelementptr inbounds float* %tmp18278, i64 1
+ %tmp18280 = getelementptr inbounds float* %tmp18279, i64 1
+ %tmp18281 = getelementptr inbounds float* %tmp18280, i64 1
+ %tmp18282 = getelementptr inbounds float* %tmp18281, i64 1
+ %tmp18283 = getelementptr inbounds float* %tmp18282, i64 1
+ %tmp18284 = getelementptr inbounds float* %tmp18283, i64 1
+ %tmp18285 = getelementptr inbounds float* %tmp18284, i64 1
+ %tmp18286 = getelementptr inbounds float* %tmp18285, i64 1
+ %tmp18287 = getelementptr inbounds float* %tmp18286, i64 1
+ %tmp18288 = getelementptr inbounds float* %tmp18287, i64 1
+ %tmp18289 = getelementptr inbounds float* %tmp18288, i64 1
+ %tmp18290 = getelementptr inbounds float* %tmp18289, i64 1
+ %tmp18291 = getelementptr inbounds float* %tmp18290, i64 1
+ %tmp18292 = getelementptr inbounds float* %tmp18291, i64 1
+ %tmp18293 = getelementptr inbounds float* %tmp18292, i64 1
+ %tmp18294 = getelementptr inbounds float* %tmp18293, i64 1
+ %tmp18295 = getelementptr inbounds float* %tmp18294, i64 1
+ %tmp18296 = getelementptr inbounds float* %tmp18295, i64 1
+ %tmp18297 = getelementptr inbounds float* %tmp18296, i64 1
+ %tmp18298 = getelementptr inbounds float* %tmp18297, i64 1
+ %tmp18299 = getelementptr inbounds float* %tmp18298, i64 1
+ %tmp18300 = getelementptr inbounds float* %tmp18299, i64 1
+ %tmp18301 = getelementptr inbounds float* %tmp18300, i64 1
+ %tmp18302 = getelementptr inbounds float* %tmp18301, i64 1
+ %tmp18303 = getelementptr inbounds float* %tmp18302, i64 1
+ %tmp18304 = getelementptr inbounds float* %tmp18303, i64 1
+ %tmp18305 = getelementptr inbounds float* %tmp18304, i64 1
+ %tmp18306 = getelementptr inbounds float* %tmp18305, i64 1
+ %tmp18307 = getelementptr inbounds float* %tmp18306, i64 1
+ %tmp18308 = getelementptr inbounds float* %tmp18307, i64 1
+ %tmp18309 = getelementptr inbounds float* %tmp18308, i64 1
+ %tmp18310 = getelementptr inbounds float* %tmp18309, i64 1
+ %tmp18311 = getelementptr inbounds float* %tmp18310, i64 1
+ %tmp18312 = getelementptr inbounds float* %tmp18311, i64 1
+ %tmp18313 = getelementptr inbounds float* %tmp18312, i64 1
+ %tmp18314 = getelementptr inbounds float* %tmp18313, i64 1
+ %tmp18315 = getelementptr inbounds float* %tmp18314, i64 1
+ %tmp18316 = getelementptr inbounds float* %tmp18315, i64 1
+ %tmp18317 = getelementptr inbounds float* %tmp18316, i64 1
+ %tmp18318 = getelementptr inbounds float* %tmp18317, i64 1
+ %tmp18319 = getelementptr inbounds float* %tmp18318, i64 1
+ %tmp18320 = getelementptr inbounds float* %tmp18319, i64 1
+ %tmp18321 = getelementptr inbounds float* %tmp18320, i64 1
+ %tmp18322 = getelementptr inbounds float* %tmp18321, i64 1
+ %tmp18323 = getelementptr inbounds float* %tmp18322, i64 1
+ %tmp18324 = getelementptr inbounds float* %tmp18323, i64 1
+ %tmp18325 = getelementptr inbounds float* %tmp18324, i64 1
+ %tmp18326 = getelementptr inbounds float* %tmp18325, i64 1
+ %tmp18327 = getelementptr inbounds float* %tmp18326, i64 1
+ %tmp18328 = getelementptr inbounds float* %tmp18327, i64 1
+ %tmp18329 = getelementptr inbounds float* %tmp18328, i64 1
+ %tmp18330 = getelementptr inbounds float* %tmp18329, i64 1
+ %tmp18331 = getelementptr inbounds float* %tmp18330, i64 1
+ %tmp18332 = getelementptr inbounds float* %tmp18331, i64 1
+ %tmp18333 = getelementptr inbounds float* %tmp18332, i64 1
+ %tmp18334 = getelementptr inbounds float* %tmp18333, i64 1
+ %tmp18335 = getelementptr inbounds float* %tmp18334, i64 1
+ %tmp18336 = getelementptr inbounds float* %tmp18335, i64 1
+ %tmp18337 = getelementptr inbounds float* %tmp18336, i64 1
+ %tmp18338 = getelementptr inbounds float* %tmp18337, i64 1
+ %tmp18339 = getelementptr inbounds float* %tmp18338, i64 1
+ %tmp18340 = getelementptr inbounds float* %tmp18339, i64 1
+ %tmp18341 = getelementptr inbounds float* %tmp18340, i64 1
+ %tmp18342 = getelementptr inbounds float* %tmp18341, i64 1
+ %tmp18343 = getelementptr inbounds float* %tmp18342, i64 1
+ %tmp18344 = getelementptr inbounds float* %tmp18343, i64 1
+ %tmp18345 = getelementptr inbounds float* %tmp18344, i64 1
+ %tmp18346 = getelementptr inbounds float* %tmp18345, i64 1
+ %tmp18347 = getelementptr inbounds float* %tmp18346, i64 1
+ %tmp18348 = getelementptr inbounds float* %tmp18347, i64 1
+ %tmp18349 = getelementptr inbounds float* %tmp18348, i64 1
+ %tmp18350 = getelementptr inbounds float* %tmp18349, i64 1
+ %tmp18351 = getelementptr inbounds float* %tmp18350, i64 1
+ %tmp18352 = getelementptr inbounds float* %tmp18351, i64 1
+ %tmp18353 = getelementptr inbounds float* %tmp18352, i64 1
+ %tmp18354 = getelementptr inbounds float* %tmp18353, i64 1
+ %tmp18355 = getelementptr inbounds float* %tmp18354, i64 1
+ %tmp18356 = getelementptr inbounds float* %tmp18355, i64 1
+ %tmp18357 = getelementptr inbounds float* %tmp18356, i64 1
+ %tmp18358 = getelementptr inbounds float* %tmp18357, i64 1
+ %tmp18359 = getelementptr inbounds float* %tmp18358, i64 1
+ %tmp18360 = getelementptr inbounds float* %tmp18359, i64 1
+ %tmp18361 = getelementptr inbounds float* %tmp18360, i64 1
+ %tmp18362 = getelementptr inbounds float* %tmp18361, i64 1
+ %tmp18363 = getelementptr inbounds float* %tmp18362, i64 1
+ %tmp18364 = getelementptr inbounds float* %tmp18363, i64 1
+ %tmp18365 = getelementptr inbounds float* %tmp18364, i64 1
+ %tmp18366 = getelementptr inbounds float* %tmp18365, i64 1
+ %tmp18367 = getelementptr inbounds float* %tmp18366, i64 1
+ %tmp18368 = getelementptr inbounds float* %tmp18367, i64 1
+ %tmp18369 = getelementptr inbounds float* %tmp18368, i64 1
+ %tmp18370 = getelementptr inbounds float* %tmp18369, i64 1
+ %tmp18371 = getelementptr inbounds float* %tmp18370, i64 1
+ %tmp18372 = getelementptr inbounds float* %tmp18371, i64 1
+ %tmp18373 = getelementptr inbounds float* %tmp18372, i64 1
+ %tmp18374 = getelementptr inbounds float* %tmp18373, i64 1
+ %tmp18375 = getelementptr inbounds float* %tmp18374, i64 1
+ %tmp18376 = getelementptr inbounds float* %tmp18375, i64 1
+ %tmp18377 = getelementptr inbounds float* %tmp18376, i64 1
+ %tmp18378 = getelementptr inbounds float* %tmp18377, i64 1
+ %tmp18379 = getelementptr inbounds float* %tmp18378, i64 1
+ %tmp18380 = getelementptr inbounds float* %tmp18379, i64 1
+ %tmp18381 = getelementptr inbounds float* %tmp18380, i64 1
+ %tmp18382 = getelementptr inbounds float* %tmp18381, i64 1
+ %tmp18383 = getelementptr inbounds float* %tmp18382, i64 1
+ %tmp18384 = getelementptr inbounds float* %tmp18383, i64 1
+ %tmp18385 = getelementptr inbounds float* %tmp18384, i64 1
+ %tmp18386 = getelementptr inbounds float* %tmp18385, i64 1
+ %tmp18387 = getelementptr inbounds float* %tmp18386, i64 1
+ %tmp18388 = getelementptr inbounds float* %tmp18387, i64 1
+ %tmp18389 = getelementptr inbounds float* %tmp18388, i64 1
+ %tmp18390 = getelementptr inbounds float* %tmp18389, i64 1
+ %tmp18391 = getelementptr inbounds float* %tmp18390, i64 1
+ %tmp18392 = getelementptr inbounds float* %tmp18391, i64 1
+ %tmp18393 = getelementptr inbounds float* %tmp18392, i64 1
+ %tmp18394 = getelementptr inbounds float* %tmp18393, i64 1
+ %tmp18395 = getelementptr inbounds float* %tmp18394, i64 1
+ %tmp18396 = getelementptr inbounds float* %tmp18395, i64 1
+ %tmp18397 = getelementptr inbounds float* %tmp18396, i64 1
+ %tmp18398 = getelementptr inbounds float* %tmp18397, i64 1
+ %tmp18399 = getelementptr inbounds float* %tmp18398, i64 1
+ %tmp18400 = getelementptr inbounds float* %tmp18399, i64 1
+ %tmp18401 = getelementptr inbounds float* %tmp18400, i64 1
+ %tmp18402 = getelementptr inbounds float* %tmp18401, i64 1
+ %tmp18403 = getelementptr inbounds float* %tmp18402, i64 1
+ %tmp18404 = getelementptr inbounds float* %tmp18403, i64 1
+ %tmp18405 = getelementptr inbounds float* %tmp18404, i64 1
+ %tmp18406 = getelementptr inbounds float* %tmp18405, i64 1
+ %tmp18407 = getelementptr inbounds float* %tmp18406, i64 1
+ %tmp18408 = getelementptr inbounds float* %tmp18407, i64 1
+ %tmp18409 = getelementptr inbounds float* %tmp18408, i64 1
+ %tmp18410 = getelementptr inbounds float* %tmp18409, i64 1
+ %tmp18411 = getelementptr inbounds float* %tmp18410, i64 1
+ %tmp18412 = getelementptr inbounds float* %tmp18411, i64 1
+ %tmp18413 = getelementptr inbounds float* %tmp18412, i64 1
+ %tmp18414 = getelementptr inbounds float* %tmp18413, i64 1
+ %tmp18415 = getelementptr inbounds float* %tmp18414, i64 1
+ %tmp18416 = getelementptr inbounds float* %tmp18415, i64 1
+ %tmp18417 = getelementptr inbounds float* %tmp18416, i64 1
+ %tmp18418 = getelementptr inbounds float* %tmp18417, i64 1
+ %tmp18419 = getelementptr inbounds float* %tmp18418, i64 1
+ %tmp18420 = getelementptr inbounds float* %tmp18419, i64 1
+ %tmp18421 = getelementptr inbounds float* %tmp18420, i64 1
+ %tmp18422 = getelementptr inbounds float* %tmp18421, i64 1
+ %tmp18423 = getelementptr inbounds float* %tmp18422, i64 1
+ %tmp18424 = getelementptr inbounds float* %tmp18423, i64 1
+ %tmp18425 = getelementptr inbounds float* %tmp18424, i64 1
+ %tmp18426 = getelementptr inbounds float* %tmp18425, i64 1
+ %tmp18427 = getelementptr inbounds float* %tmp18426, i64 1
+ %tmp18428 = getelementptr inbounds float* %tmp18427, i64 1
+ %tmp18429 = getelementptr inbounds float* %tmp18428, i64 1
+ %tmp18430 = getelementptr inbounds float* %tmp18429, i64 1
+ %tmp18431 = getelementptr inbounds float* %tmp18430, i64 1
+ %tmp18432 = getelementptr inbounds float* %tmp18431, i64 1
+ %tmp18433 = getelementptr inbounds float* %tmp18432, i64 1
+ %tmp18434 = getelementptr inbounds float* %tmp18433, i64 1
+ %tmp18435 = getelementptr inbounds float* %tmp18434, i64 1
+ %tmp18436 = getelementptr inbounds float* %tmp18435, i64 1
+ %tmp18437 = getelementptr inbounds float* %tmp18436, i64 1
+ %tmp18438 = getelementptr inbounds float* %tmp18437, i64 1
+ %tmp18439 = getelementptr inbounds float* %tmp18438, i64 1
+ %tmp18440 = getelementptr inbounds float* %tmp18439, i64 1
+ %tmp18441 = getelementptr inbounds float* %tmp18440, i64 1
+ %tmp18442 = getelementptr inbounds float* %tmp18441, i64 1
+ %tmp18443 = getelementptr inbounds float* %tmp18442, i64 1
+ %tmp18444 = getelementptr inbounds float* %tmp18443, i64 1
+ %tmp18445 = getelementptr inbounds float* %tmp18444, i64 1
+ %tmp18446 = getelementptr inbounds float* %tmp18445, i64 1
+ %tmp18447 = getelementptr inbounds float* %tmp18446, i64 1
+ %tmp18448 = getelementptr inbounds float* %tmp18447, i64 1
+ %tmp18449 = getelementptr inbounds float* %tmp18448, i64 1
+ %tmp18450 = getelementptr inbounds float* %tmp18449, i64 1
+ %tmp18451 = getelementptr inbounds float* %tmp18450, i64 1
+ %tmp18452 = getelementptr inbounds float* %tmp18451, i64 1
+ %tmp18453 = getelementptr inbounds float* %tmp18452, i64 1
+ %tmp18454 = getelementptr inbounds float* %tmp18453, i64 1
+ %tmp18455 = getelementptr inbounds float* %tmp18454, i64 1
+ %tmp18456 = getelementptr inbounds float* %tmp18455, i64 1
+ %tmp18457 = getelementptr inbounds float* %tmp18456, i64 1
+ %tmp18458 = getelementptr inbounds float* %tmp18457, i64 1
+ %tmp18459 = getelementptr inbounds float* %tmp18458, i64 1
+ %tmp18460 = getelementptr inbounds float* %tmp18459, i64 1
+ %tmp18461 = getelementptr inbounds float* %tmp18460, i64 1
+ %tmp18462 = getelementptr inbounds float* %tmp18461, i64 1
+ %tmp18463 = getelementptr inbounds float* %tmp18462, i64 1
+ %tmp18464 = getelementptr inbounds float* %tmp18463, i64 1
+ %tmp18465 = getelementptr inbounds float* %tmp18464, i64 1
+ %tmp18466 = getelementptr inbounds float* %tmp18465, i64 1
+ %tmp18467 = getelementptr inbounds float* %tmp18466, i64 1
+ %tmp18468 = getelementptr inbounds float* %tmp18467, i64 1
+ %tmp18469 = getelementptr inbounds float* %tmp18468, i64 1
+ %tmp18470 = getelementptr inbounds float* %tmp18469, i64 1
+ %tmp18471 = getelementptr inbounds float* %tmp18470, i64 1
+ %tmp18472 = getelementptr inbounds float* %tmp18471, i64 1
+ %tmp18473 = getelementptr inbounds float* %tmp18472, i64 1
+ %tmp18474 = getelementptr inbounds float* %tmp18473, i64 1
+ %tmp18475 = getelementptr inbounds float* %tmp18474, i64 1
+ %tmp18476 = getelementptr inbounds float* %tmp18475, i64 1
+ %tmp18477 = getelementptr inbounds float* %tmp18476, i64 1
+ %tmp18478 = getelementptr inbounds float* %tmp18477, i64 1
+ %tmp18479 = getelementptr inbounds float* %tmp18478, i64 1
+ %tmp18480 = getelementptr inbounds float* %tmp18479, i64 1
+ %tmp18481 = getelementptr inbounds float* %tmp18480, i64 1
+ %tmp18482 = getelementptr inbounds float* %tmp18481, i64 1
+ %tmp18483 = getelementptr inbounds float* %tmp18482, i64 1
+ %tmp18484 = getelementptr inbounds float* %tmp18483, i64 1
+ %tmp18485 = getelementptr inbounds float* %tmp18484, i64 1
+ %tmp18486 = getelementptr inbounds float* %tmp18485, i64 1
+ %tmp18487 = getelementptr inbounds float* %tmp18486, i64 1
+ %tmp18488 = getelementptr inbounds float* %tmp18487, i64 1
+ %tmp18489 = getelementptr inbounds float* %tmp18488, i64 1
+ %tmp18490 = getelementptr inbounds float* %tmp18489, i64 1
+ %tmp18491 = getelementptr inbounds float* %tmp18490, i64 1
+ %tmp18492 = getelementptr inbounds float* %tmp18491, i64 1
+ %tmp18493 = getelementptr inbounds float* %tmp18492, i64 1
+ %tmp18494 = getelementptr inbounds float* %tmp18493, i64 1
+ %tmp18495 = getelementptr inbounds float* %tmp18494, i64 1
+ %tmp18496 = getelementptr inbounds float* %tmp18495, i64 1
+ %tmp18497 = getelementptr inbounds float* %tmp18496, i64 1
+ %tmp18498 = getelementptr inbounds float* %tmp18497, i64 1
+ %tmp18499 = getelementptr inbounds float* %tmp18498, i64 1
+ %tmp18500 = getelementptr inbounds float* %tmp18499, i64 1
+ %tmp18501 = getelementptr inbounds float* %tmp18500, i64 1
+ %tmp18502 = getelementptr inbounds float* %tmp18501, i64 1
+ %tmp18503 = getelementptr inbounds float* %tmp18502, i64 1
+ %tmp18504 = getelementptr inbounds float* %tmp18503, i64 1
+ %tmp18505 = getelementptr inbounds float* %tmp18504, i64 1
+ %tmp18506 = getelementptr inbounds float* %tmp18505, i64 1
+ %tmp18507 = getelementptr inbounds float* %tmp18506, i64 1
+ %tmp18508 = getelementptr inbounds float* %tmp18507, i64 1
+ %tmp18509 = getelementptr inbounds float* %tmp18508, i64 1
+ %tmp18510 = getelementptr inbounds float* %tmp18509, i64 1
+ %tmp18511 = getelementptr inbounds float* %tmp18510, i64 1
+ %tmp18512 = getelementptr inbounds float* %tmp18511, i64 1
+ %tmp18513 = getelementptr inbounds float* %tmp18512, i64 1
+ %tmp18514 = getelementptr inbounds float* %tmp18513, i64 1
+ %tmp18515 = getelementptr inbounds float* %tmp18514, i64 1
+ %tmp18516 = getelementptr inbounds float* %tmp18515, i64 1
+ %tmp18517 = getelementptr inbounds float* %tmp18516, i64 1
+ %tmp18518 = getelementptr inbounds float* %tmp18517, i64 1
+ %tmp18519 = getelementptr inbounds float* %tmp18518, i64 1
+ %tmp18520 = getelementptr inbounds float* %tmp18519, i64 1
+ %tmp18521 = getelementptr inbounds float* %tmp18520, i64 1
+ %tmp18522 = getelementptr inbounds float* %tmp18521, i64 1
+ %tmp18523 = getelementptr inbounds float* %tmp18522, i64 1
+ %tmp18524 = getelementptr inbounds float* %tmp18523, i64 1
+ %tmp18525 = getelementptr inbounds float* %tmp18524, i64 1
+ %tmp18526 = getelementptr inbounds float* %tmp18525, i64 1
+ %tmp18527 = getelementptr inbounds float* %tmp18526, i64 1
+ %tmp18528 = getelementptr inbounds float* %tmp18527, i64 1
+ %tmp18529 = getelementptr inbounds float* %tmp18528, i64 1
+ %tmp18530 = getelementptr inbounds float* %tmp18529, i64 1
+ %tmp18531 = getelementptr inbounds float* %tmp18530, i64 1
+ %tmp18532 = getelementptr inbounds float* %tmp18531, i64 1
+ %tmp18533 = getelementptr inbounds float* %tmp18532, i64 1
+ %tmp18534 = getelementptr inbounds float* %tmp18533, i64 1
+ %tmp18535 = getelementptr inbounds float* %tmp18534, i64 1
+ %tmp18536 = getelementptr inbounds float* %tmp18535, i64 1
+ %tmp18537 = getelementptr inbounds float* %tmp18536, i64 1
+ %tmp18538 = getelementptr inbounds float* %tmp18537, i64 1
+ %tmp18539 = getelementptr inbounds float* %tmp18538, i64 1
+ %tmp18540 = getelementptr inbounds float* %tmp18539, i64 1
+ %tmp18541 = getelementptr inbounds float* %tmp18540, i64 1
+ %tmp18542 = getelementptr inbounds float* %tmp18541, i64 1
+ %tmp18543 = getelementptr inbounds float* %tmp18542, i64 1
+ %tmp18544 = getelementptr inbounds float* %tmp18543, i64 1
+ %tmp18545 = getelementptr inbounds float* %tmp18544, i64 1
+ %tmp18546 = getelementptr inbounds float* %tmp18545, i64 1
+ %tmp18547 = getelementptr inbounds float* %tmp18546, i64 1
+ %tmp18548 = getelementptr inbounds float* %tmp18547, i64 1
+ %tmp18549 = getelementptr inbounds float* %tmp18548, i64 1
+ %tmp18550 = getelementptr inbounds float* %tmp18549, i64 1
+ %tmp18551 = getelementptr inbounds float* %tmp18550, i64 1
+ %tmp18552 = getelementptr inbounds float* %tmp18551, i64 1
+ %tmp18553 = getelementptr inbounds float* %tmp18552, i64 1
+ %tmp18554 = getelementptr inbounds float* %tmp18553, i64 1
+ %tmp18555 = getelementptr inbounds float* %tmp18554, i64 1
+ %tmp18556 = getelementptr inbounds float* %tmp18555, i64 1
+ %tmp18557 = getelementptr inbounds float* %tmp18556, i64 1
+ %tmp18558 = getelementptr inbounds float* %tmp18557, i64 1
+ %tmp18559 = getelementptr inbounds float* %tmp18558, i64 1
+ %tmp18560 = getelementptr inbounds float* %tmp18559, i64 1
+ %tmp18561 = getelementptr inbounds float* %tmp18560, i64 1
+ %tmp18562 = getelementptr inbounds float* %tmp18561, i64 1
+ %tmp18563 = getelementptr inbounds float* %tmp18562, i64 1
+ %tmp18564 = getelementptr inbounds float* %tmp18563, i64 1
+ %tmp18565 = getelementptr inbounds float* %tmp18564, i64 1
+ %tmp18566 = getelementptr inbounds float* %tmp18565, i64 1
+ %tmp18567 = getelementptr inbounds float* %tmp18566, i64 1
+ %tmp18568 = getelementptr inbounds float* %tmp18567, i64 1
+ %tmp18569 = getelementptr inbounds float* %tmp18568, i64 1
+ %tmp18570 = getelementptr inbounds float* %tmp18569, i64 1
+ %tmp18571 = getelementptr inbounds float* %tmp18570, i64 1
+ %tmp18572 = getelementptr inbounds float* %tmp18571, i64 1
+ %tmp18573 = getelementptr inbounds float* %tmp18572, i64 1
+ %tmp18574 = getelementptr inbounds float* %tmp18573, i64 1
+ %tmp18575 = getelementptr inbounds float* %tmp18574, i64 1
+ %tmp18576 = getelementptr inbounds float* %tmp18575, i64 1
+ %tmp18577 = getelementptr inbounds float* %tmp18576, i64 1
+ %tmp18578 = getelementptr inbounds float* %tmp18577, i64 1
+ %tmp18579 = getelementptr inbounds float* %tmp18578, i64 1
+ %tmp18580 = getelementptr inbounds float* %tmp18579, i64 1
+ %tmp18581 = getelementptr inbounds float* %tmp18580, i64 1
+ %tmp18582 = getelementptr inbounds float* %tmp18581, i64 1
+ %tmp18583 = getelementptr inbounds float* %tmp18582, i64 1
+ %tmp18584 = getelementptr inbounds float* %tmp18583, i64 1
+ %tmp18585 = getelementptr inbounds float* %tmp18584, i64 1
+ %tmp18586 = getelementptr inbounds float* %tmp18585, i64 1
+ %tmp18587 = getelementptr inbounds float* %tmp18586, i64 1
+ %tmp18588 = getelementptr inbounds float* %tmp18587, i64 1
+ %tmp18589 = getelementptr inbounds float* %tmp18588, i64 1
+ %tmp18590 = getelementptr inbounds float* %tmp18589, i64 1
+ %tmp18591 = getelementptr inbounds float* %tmp18590, i64 1
+ %tmp18592 = getelementptr inbounds float* %tmp18591, i64 1
+ %tmp18593 = getelementptr inbounds float* %tmp18592, i64 1
+ %tmp18594 = getelementptr inbounds float* %tmp18593, i64 1
+ %tmp18595 = getelementptr inbounds float* %tmp18594, i64 1
+ %tmp18596 = getelementptr inbounds float* %tmp18595, i64 1
+ %tmp18597 = getelementptr inbounds float* %tmp18596, i64 1
+ %tmp18598 = getelementptr inbounds float* %tmp18597, i64 1
+ %tmp18599 = getelementptr inbounds float* %tmp18598, i64 1
+ %tmp18600 = getelementptr inbounds float* %tmp18599, i64 1
+ %tmp18601 = getelementptr inbounds float* %tmp18600, i64 1
+ %tmp18602 = getelementptr inbounds float* %tmp18601, i64 1
+ %tmp18603 = getelementptr inbounds float* %tmp18602, i64 1
+ %tmp18604 = getelementptr inbounds float* %tmp18603, i64 1
+ %tmp18605 = getelementptr inbounds float* %tmp18604, i64 1
+ %tmp18606 = getelementptr inbounds float* %tmp18605, i64 1
+ %tmp18607 = getelementptr inbounds float* %tmp18606, i64 1
+ %tmp18608 = getelementptr inbounds float* %tmp18607, i64 1
+ %tmp18609 = getelementptr inbounds float* %tmp18608, i64 1
+ %tmp18610 = getelementptr inbounds float* %tmp18609, i64 1
+ %tmp18611 = getelementptr inbounds float* %tmp18610, i64 1
+ %tmp18612 = getelementptr inbounds float* %tmp18611, i64 1
+ %tmp18613 = getelementptr inbounds float* %tmp18612, i64 1
+ %tmp18614 = getelementptr inbounds float* %tmp18613, i64 1
+ %tmp18615 = getelementptr inbounds float* %tmp18614, i64 1
+ %tmp18616 = getelementptr inbounds float* %tmp18615, i64 1
+ %tmp18617 = getelementptr inbounds float* %tmp18616, i64 1
+ %tmp18618 = getelementptr inbounds float* %tmp18617, i64 1
+ %tmp18619 = getelementptr inbounds float* %tmp18618, i64 1
+ %tmp18620 = getelementptr inbounds float* %tmp18619, i64 1
+ %tmp18621 = getelementptr inbounds float* %tmp18620, i64 1
+ %tmp18622 = getelementptr inbounds float* %tmp18621, i64 1
+ %tmp18623 = getelementptr inbounds float* %tmp18622, i64 1
+ %tmp18624 = getelementptr inbounds float* %tmp18623, i64 1
+ %tmp18625 = getelementptr inbounds float* %tmp18624, i64 1
+ %tmp18626 = getelementptr inbounds float* %tmp18625, i64 1
+ %tmp18627 = getelementptr inbounds float* %tmp18626, i64 1
+ %tmp18628 = getelementptr inbounds float* %tmp18627, i64 1
+ %tmp18629 = getelementptr inbounds float* %tmp18628, i64 1
+ %tmp18630 = getelementptr inbounds float* %tmp18629, i64 1
+ %tmp18631 = getelementptr inbounds float* %tmp18630, i64 1
+ %tmp18632 = getelementptr inbounds float* %tmp18631, i64 1
+ %tmp18633 = getelementptr inbounds float* %tmp18632, i64 1
+ %tmp18634 = getelementptr inbounds float* %tmp18633, i64 1
+ %tmp18635 = getelementptr inbounds float* %tmp18634, i64 1
+ %tmp18636 = getelementptr inbounds float* %tmp18635, i64 1
+ %tmp18637 = getelementptr inbounds float* %tmp18636, i64 1
+ %tmp18638 = getelementptr inbounds float* %tmp18637, i64 1
+ %tmp18639 = getelementptr inbounds float* %tmp18638, i64 1
+ %tmp18640 = getelementptr inbounds float* %tmp18639, i64 1
+ %tmp18641 = getelementptr inbounds float* %tmp18640, i64 1
+ %tmp18642 = getelementptr inbounds float* %tmp18641, i64 1
+ %tmp18643 = getelementptr inbounds float* %tmp18642, i64 1
+ %tmp18644 = getelementptr inbounds float* %tmp18643, i64 1
+ %tmp18645 = getelementptr inbounds float* %tmp18644, i64 1
+ %tmp18646 = getelementptr inbounds float* %tmp18645, i64 1
+ %tmp18647 = getelementptr inbounds float* %tmp18646, i64 1
+ %tmp18648 = getelementptr inbounds float* %tmp18647, i64 1
+ %tmp18649 = getelementptr inbounds float* %tmp18648, i64 1
+ %tmp18650 = getelementptr inbounds float* %tmp18649, i64 1
+ %tmp18651 = getelementptr inbounds float* %tmp18650, i64 1
+ %tmp18652 = getelementptr inbounds float* %tmp18651, i64 1
+ %tmp18653 = getelementptr inbounds float* %tmp18652, i64 1
+ %tmp18654 = getelementptr inbounds float* %tmp18653, i64 1
+ %tmp18655 = getelementptr inbounds float* %tmp18654, i64 1
+ %tmp18656 = getelementptr inbounds float* %tmp18655, i64 1
+ %tmp18657 = getelementptr inbounds float* %tmp18656, i64 1
+ %tmp18658 = getelementptr inbounds float* %tmp18657, i64 1
+ %tmp18659 = getelementptr inbounds float* %tmp18658, i64 1
+ %tmp18660 = getelementptr inbounds float* %tmp18659, i64 1
+ %tmp18661 = getelementptr inbounds float* %tmp18660, i64 1
+ %tmp18662 = getelementptr inbounds float* %tmp18661, i64 1
+ %tmp18663 = getelementptr inbounds float* %tmp18662, i64 1
+ %tmp18664 = getelementptr inbounds float* %tmp18663, i64 1
+ %tmp18665 = getelementptr inbounds float* %tmp18664, i64 1
+ %tmp18666 = getelementptr inbounds float* %tmp18665, i64 1
+ %tmp18667 = getelementptr inbounds float* %tmp18666, i64 1
+ %tmp18668 = getelementptr inbounds float* %tmp18667, i64 1
+ %tmp18669 = getelementptr inbounds float* %tmp18668, i64 1
+ %tmp18670 = getelementptr inbounds float* %tmp18669, i64 1
+ %tmp18671 = getelementptr inbounds float* %tmp18670, i64 1
+ %tmp18672 = getelementptr inbounds float* %tmp18671, i64 1
+ %tmp18673 = getelementptr inbounds float* %tmp18672, i64 1
+ %tmp18674 = getelementptr inbounds float* %tmp18673, i64 1
+ %tmp18675 = getelementptr inbounds float* %tmp18674, i64 1
+ %tmp18676 = getelementptr inbounds float* %tmp18675, i64 1
+ %tmp18677 = getelementptr inbounds float* %tmp18676, i64 1
+ %tmp18678 = getelementptr inbounds float* %tmp18677, i64 1
+ %tmp18679 = getelementptr inbounds float* %tmp18678, i64 1
+ %tmp18680 = getelementptr inbounds float* %tmp18679, i64 1
+ %tmp18681 = getelementptr inbounds float* %tmp18680, i64 1
+ %tmp18682 = getelementptr inbounds float* %tmp18681, i64 1
+ %tmp18683 = getelementptr inbounds float* %tmp18682, i64 1
+ %tmp18684 = getelementptr inbounds float* %tmp18683, i64 1
+ %tmp18685 = getelementptr inbounds float* %tmp18684, i64 1
+ %tmp18686 = getelementptr inbounds float* %tmp18685, i64 1
+ %tmp18687 = getelementptr inbounds float* %tmp18686, i64 1
+ %tmp18688 = getelementptr inbounds float* %tmp18687, i64 1
+ %tmp18689 = getelementptr inbounds float* %tmp18688, i64 1
+ %tmp18690 = getelementptr inbounds float* %tmp18689, i64 1
+ %tmp18691 = getelementptr inbounds float* %tmp18690, i64 1
+ %tmp18692 = getelementptr inbounds float* %tmp18691, i64 1
+ %tmp18693 = getelementptr inbounds float* %tmp18692, i64 1
+ %tmp18694 = getelementptr inbounds float* %tmp18693, i64 1
+ %tmp18695 = getelementptr inbounds float* %tmp18694, i64 1
+ %tmp18696 = getelementptr inbounds float* %tmp18695, i64 1
+ %tmp18697 = getelementptr inbounds float* %tmp18696, i64 1
+ %tmp18698 = getelementptr inbounds float* %tmp18697, i64 1
+ %tmp18699 = getelementptr inbounds float* %tmp18698, i64 1
+ %tmp18700 = getelementptr inbounds float* %tmp18699, i64 1
+ %tmp18701 = getelementptr inbounds float* %tmp18700, i64 1
+ %tmp18702 = getelementptr inbounds float* %tmp18701, i64 1
+ %tmp18703 = getelementptr inbounds float* %tmp18702, i64 1
+ %tmp18704 = getelementptr inbounds float* %tmp18703, i64 1
+ %tmp18705 = getelementptr inbounds float* %tmp18704, i64 1
+ %tmp18706 = getelementptr inbounds float* %tmp18705, i64 1
+ %tmp18707 = getelementptr inbounds float* %tmp18706, i64 1
+ %tmp18708 = getelementptr inbounds float* %tmp18707, i64 1
+ %tmp18709 = getelementptr inbounds float* %tmp18708, i64 1
+ %tmp18710 = getelementptr inbounds float* %tmp18709, i64 1
+ %tmp18711 = getelementptr inbounds float* %tmp18710, i64 1
+ %tmp18712 = getelementptr inbounds float* %tmp18711, i64 1
+ %tmp18713 = getelementptr inbounds float* %tmp18712, i64 1
+ %tmp18714 = getelementptr inbounds float* %tmp18713, i64 1
+ %tmp18715 = getelementptr inbounds float* %tmp18714, i64 1
+ %tmp18716 = getelementptr inbounds float* %tmp18715, i64 1
+ %tmp18717 = getelementptr inbounds float* %tmp18716, i64 1
+ %tmp18718 = getelementptr inbounds float* %tmp18717, i64 1
+ %tmp18719 = getelementptr inbounds float* %tmp18718, i64 1
+ %tmp18720 = getelementptr inbounds float* %tmp18719, i64 1
+ %tmp18721 = getelementptr inbounds float* %tmp18720, i64 1
+ %tmp18722 = getelementptr inbounds float* %tmp18721, i64 1
+ %tmp18723 = getelementptr inbounds float* %tmp18722, i64 1
+ %tmp18724 = getelementptr inbounds float* %tmp18723, i64 1
+ %tmp18725 = getelementptr inbounds float* %tmp18724, i64 1
+ %tmp18726 = getelementptr inbounds float* %tmp18725, i64 1
+ %tmp18727 = getelementptr inbounds float* %tmp18726, i64 1
+ %tmp18728 = getelementptr inbounds float* %tmp18727, i64 1
+ %tmp18729 = getelementptr inbounds float* %tmp18728, i64 1
+ %tmp18730 = getelementptr inbounds float* %tmp18729, i64 1
+ %tmp18731 = getelementptr inbounds float* %tmp18730, i64 1
+ %tmp18732 = getelementptr inbounds float* %tmp18731, i64 1
+ %tmp18733 = getelementptr inbounds float* %tmp18732, i64 1
+ %tmp18734 = getelementptr inbounds float* %tmp18733, i64 1
+ %tmp18735 = getelementptr inbounds float* %tmp18734, i64 1
+ %tmp18736 = getelementptr inbounds float* %tmp18735, i64 1
+ %tmp18737 = getelementptr inbounds float* %tmp18736, i64 1
+ %tmp18738 = getelementptr inbounds float* %tmp18737, i64 1
+ %tmp18739 = getelementptr inbounds float* %tmp18738, i64 1
+ %tmp18740 = getelementptr inbounds float* %tmp18739, i64 1
+ %tmp18741 = getelementptr inbounds float* %tmp18740, i64 1
+ %tmp18742 = getelementptr inbounds float* %tmp18741, i64 1
+ %tmp18743 = getelementptr inbounds float* %tmp18742, i64 1
+ %tmp18744 = getelementptr inbounds float* %tmp18743, i64 1
+ %tmp18745 = getelementptr inbounds float* %tmp18744, i64 1
+ %tmp18746 = getelementptr inbounds float* %tmp18745, i64 1
+ %tmp18747 = getelementptr inbounds float* %tmp18746, i64 1
+ %tmp18748 = getelementptr inbounds float* %tmp18747, i64 1
+ %tmp18749 = getelementptr inbounds float* %tmp18748, i64 1
+ %tmp18750 = getelementptr inbounds float* %tmp18749, i64 1
+ %tmp18751 = getelementptr inbounds float* %tmp18750, i64 1
+ %tmp18752 = getelementptr inbounds float* %tmp18751, i64 1
+ %tmp18753 = getelementptr inbounds float* %tmp18752, i64 1
+ %tmp18754 = getelementptr inbounds float* %tmp18753, i64 1
+ %tmp18755 = getelementptr inbounds float* %tmp18754, i64 1
+ %tmp18756 = getelementptr inbounds float* %tmp18755, i64 1
+ %tmp18757 = getelementptr inbounds float* %tmp18756, i64 1
+ %tmp18758 = getelementptr inbounds float* %tmp18757, i64 1
+ %tmp18759 = getelementptr inbounds float* %tmp18758, i64 1
+ %tmp18760 = getelementptr inbounds float* %tmp18759, i64 1
+ %tmp18761 = getelementptr inbounds float* %tmp18760, i64 1
+ %tmp18762 = getelementptr inbounds float* %tmp18761, i64 1
+ %tmp18763 = getelementptr inbounds float* %tmp18762, i64 1
+ %tmp18764 = getelementptr inbounds float* %tmp18763, i64 1
+ %tmp18765 = getelementptr inbounds float* %tmp18764, i64 1
+ %tmp18766 = getelementptr inbounds float* %tmp18765, i64 1
+ %tmp18767 = getelementptr inbounds float* %tmp18766, i64 1
+ %tmp18768 = getelementptr inbounds float* %tmp18767, i64 1
+ %tmp18769 = getelementptr inbounds float* %tmp18768, i64 1
+ %tmp18770 = getelementptr inbounds float* %tmp18769, i64 1
+ %tmp18771 = getelementptr inbounds float* %tmp18770, i64 1
+ %tmp18772 = getelementptr inbounds float* %tmp18771, i64 1
+ %tmp18773 = getelementptr inbounds float* %tmp18772, i64 1
+ %tmp18774 = getelementptr inbounds float* %tmp18773, i64 1
+ %tmp18775 = getelementptr inbounds float* %tmp18774, i64 1
+ %tmp18776 = getelementptr inbounds float* %tmp18775, i64 1
+ %tmp18777 = getelementptr inbounds float* %tmp18776, i64 1
+ %tmp18778 = getelementptr inbounds float* %tmp18777, i64 1
+ %tmp18779 = getelementptr inbounds float* %tmp18778, i64 1
+ %tmp18780 = getelementptr inbounds float* %tmp18779, i64 1
+ %tmp18781 = getelementptr inbounds float* %tmp18780, i64 1
+ %tmp18782 = getelementptr inbounds float* %tmp18781, i64 1
+ %tmp18783 = getelementptr inbounds float* %tmp18782, i64 1
+ %tmp18784 = getelementptr inbounds float* %tmp18783, i64 1
+ %tmp18785 = getelementptr inbounds float* %tmp18784, i64 1
+ %tmp18786 = getelementptr inbounds float* %tmp18785, i64 1
+ %tmp18787 = getelementptr inbounds float* %tmp18786, i64 1
+ %tmp18788 = getelementptr inbounds float* %tmp18787, i64 1
+ %tmp18789 = getelementptr inbounds float* %tmp18788, i64 1
+ %tmp18790 = getelementptr inbounds float* %tmp18789, i64 1
+ %tmp18791 = getelementptr inbounds float* %tmp18790, i64 1
+ %tmp18792 = getelementptr inbounds float* %tmp18791, i64 1
+ %tmp18793 = getelementptr inbounds float* %tmp18792, i64 1
+ %tmp18794 = getelementptr inbounds float* %tmp18793, i64 1
+ %tmp18795 = getelementptr inbounds float* %tmp18794, i64 1
+ %tmp18796 = getelementptr inbounds float* %tmp18795, i64 1
+ %tmp18797 = getelementptr inbounds float* %tmp18796, i64 1
+ %tmp18798 = getelementptr inbounds float* %tmp18797, i64 1
+ %tmp18799 = getelementptr inbounds float* %tmp18798, i64 1
+ %tmp18800 = getelementptr inbounds float* %tmp18799, i64 1
+ %tmp18801 = getelementptr inbounds float* %tmp18800, i64 1
+ %tmp18802 = getelementptr inbounds float* %tmp18801, i64 1
+ %tmp18803 = getelementptr inbounds float* %tmp18802, i64 1
+ %tmp18804 = getelementptr inbounds float* %tmp18803, i64 1
+ %tmp18805 = getelementptr inbounds float* %tmp18804, i64 1
+ %tmp18806 = getelementptr inbounds float* %tmp18805, i64 1
+ %tmp18807 = getelementptr inbounds float* %tmp18806, i64 1
+ %tmp18808 = getelementptr inbounds float* %tmp18807, i64 1
+ %tmp18809 = getelementptr inbounds float* %tmp18808, i64 1
+ %tmp18810 = getelementptr inbounds float* %tmp18809, i64 1
+ %tmp18811 = getelementptr inbounds float* %tmp18810, i64 1
+ %tmp18812 = getelementptr inbounds float* %tmp18811, i64 1
+ %tmp18813 = getelementptr inbounds float* %tmp18812, i64 1
+ %tmp18814 = getelementptr inbounds float* %tmp18813, i64 1
+ %tmp18815 = getelementptr inbounds float* %tmp18814, i64 1
+ %tmp18816 = getelementptr inbounds float* %tmp18815, i64 1
+ %tmp18817 = getelementptr inbounds float* %tmp18816, i64 1
+ %tmp18818 = getelementptr inbounds float* %tmp18817, i64 1
+ %tmp18819 = getelementptr inbounds float* %tmp18818, i64 1
+ %tmp18820 = getelementptr inbounds float* %tmp18819, i64 1
+ %tmp18821 = getelementptr inbounds float* %tmp18820, i64 1
+ %tmp18822 = getelementptr inbounds float* %tmp18821, i64 1
+ %tmp18823 = getelementptr inbounds float* %tmp18822, i64 1
+ %tmp18824 = getelementptr inbounds float* %tmp18823, i64 1
+ %tmp18825 = getelementptr inbounds float* %tmp18824, i64 1
+ %tmp18826 = getelementptr inbounds float* %tmp18825, i64 1
+ %tmp18827 = getelementptr inbounds float* %tmp18826, i64 1
+ %tmp18828 = getelementptr inbounds float* %tmp18827, i64 1
+ %tmp18829 = getelementptr inbounds float* %tmp18828, i64 1
+ %tmp18830 = getelementptr inbounds float* %tmp18829, i64 1
+ %tmp18831 = getelementptr inbounds float* %tmp18830, i64 1
+ %tmp18832 = getelementptr inbounds float* %tmp18831, i64 1
+ %tmp18833 = getelementptr inbounds float* %tmp18832, i64 1
+ %tmp18834 = getelementptr inbounds float* %tmp18833, i64 1
+ %tmp18835 = getelementptr inbounds float* %tmp18834, i64 1
+ %tmp18836 = getelementptr inbounds float* %tmp18835, i64 1
+ %tmp18837 = getelementptr inbounds float* %tmp18836, i64 1
+ %tmp18838 = getelementptr inbounds float* %tmp18837, i64 1
+ %tmp18839 = getelementptr inbounds float* %tmp18838, i64 1
+ %tmp18840 = getelementptr inbounds float* %tmp18839, i64 1
+ %tmp18841 = getelementptr inbounds float* %tmp18840, i64 1
+ %tmp18842 = getelementptr inbounds float* %tmp18841, i64 1
+ %tmp18843 = getelementptr inbounds float* %tmp18842, i64 1
+ %tmp18844 = getelementptr inbounds float* %tmp18843, i64 1
+ %tmp18845 = getelementptr inbounds float* %tmp18844, i64 1
+ %tmp18846 = getelementptr inbounds float* %tmp18845, i64 1
+ %tmp18847 = getelementptr inbounds float* %tmp18846, i64 1
+ %tmp18848 = getelementptr inbounds float* %tmp18847, i64 1
+ %tmp18849 = getelementptr inbounds float* %tmp18848, i64 1
+ %tmp18850 = getelementptr inbounds float* %tmp18849, i64 1
+ %tmp18851 = getelementptr inbounds float* %tmp18850, i64 1
+ %tmp18852 = getelementptr inbounds float* %tmp18851, i64 1
+ %tmp18853 = getelementptr inbounds float* %tmp18852, i64 1
+ %tmp18854 = getelementptr inbounds float* %tmp18853, i64 1
+ %tmp18855 = getelementptr inbounds float* %tmp18854, i64 1
+ %tmp18856 = getelementptr inbounds float* %tmp18855, i64 1
+ %tmp18857 = getelementptr inbounds float* %tmp18856, i64 1
+ %tmp18858 = getelementptr inbounds float* %tmp18857, i64 1
+ %tmp18859 = getelementptr inbounds float* %tmp18858, i64 1
+ %tmp18860 = getelementptr inbounds float* %tmp18859, i64 1
+ %tmp18861 = getelementptr inbounds float* %tmp18860, i64 1
+ %tmp18862 = getelementptr inbounds float* %tmp18861, i64 1
+ %tmp18863 = getelementptr inbounds float* %tmp18862, i64 1
+ %tmp18864 = getelementptr inbounds float* %tmp18863, i64 1
+ %tmp18865 = getelementptr inbounds float* %tmp18864, i64 1
+ %tmp18866 = getelementptr inbounds float* %tmp18865, i64 1
+ %tmp18867 = getelementptr inbounds float* %tmp18866, i64 1
+ %tmp18868 = getelementptr inbounds float* %tmp18867, i64 1
+ %tmp18869 = getelementptr inbounds float* %tmp18868, i64 1
+ %tmp18870 = getelementptr inbounds float* %tmp18869, i64 1
+ %tmp18871 = getelementptr inbounds float* %tmp18870, i64 1
+ %tmp18872 = getelementptr inbounds float* %tmp18871, i64 1
+ %tmp18873 = getelementptr inbounds float* %tmp18872, i64 1
+ %tmp18874 = getelementptr inbounds float* %tmp18873, i64 1
+ %tmp18875 = getelementptr inbounds float* %tmp18874, i64 1
+ %tmp18876 = getelementptr inbounds float* %tmp18875, i64 1
+ %tmp18877 = getelementptr inbounds float* %tmp18876, i64 1
+ %tmp18878 = getelementptr inbounds float* %tmp18877, i64 1
+ %tmp18879 = getelementptr inbounds float* %tmp18878, i64 1
+ %tmp18880 = getelementptr inbounds float* %tmp18879, i64 1
+ %tmp18881 = getelementptr inbounds float* %tmp18880, i64 1
+ %tmp18882 = getelementptr inbounds float* %tmp18881, i64 1
+ %tmp18883 = getelementptr inbounds float* %tmp18882, i64 1
+ %tmp18884 = getelementptr inbounds float* %tmp18883, i64 1
+ %tmp18885 = getelementptr inbounds float* %tmp18884, i64 1
+ %tmp18886 = getelementptr inbounds float* %tmp18885, i64 1
+ %tmp18887 = getelementptr inbounds float* %tmp18886, i64 1
+ %tmp18888 = getelementptr inbounds float* %tmp18887, i64 1
+ %tmp18889 = getelementptr inbounds float* %tmp18888, i64 1
+ %tmp18890 = getelementptr inbounds float* %tmp18889, i64 1
+ %tmp18891 = getelementptr inbounds float* %tmp18890, i64 1
+ %tmp18892 = getelementptr inbounds float* %tmp18891, i64 1
+ %tmp18893 = getelementptr inbounds float* %tmp18892, i64 1
+ %tmp18894 = getelementptr inbounds float* %tmp18893, i64 1
+ %tmp18895 = getelementptr inbounds float* %tmp18894, i64 1
+ %tmp18896 = getelementptr inbounds float* %tmp18895, i64 1
+ %tmp18897 = getelementptr inbounds float* %tmp18896, i64 1
+ %tmp18898 = getelementptr inbounds float* %tmp18897, i64 1
+ %tmp18899 = getelementptr inbounds float* %tmp18898, i64 1
+ %tmp18900 = getelementptr inbounds float* %tmp18899, i64 1
+ %tmp18901 = getelementptr inbounds float* %tmp18900, i64 1
+ %tmp18902 = getelementptr inbounds float* %tmp18901, i64 1
+ %tmp18903 = getelementptr inbounds float* %tmp18902, i64 1
+ %tmp18904 = getelementptr inbounds float* %tmp18903, i64 1
+ %tmp18905 = getelementptr inbounds float* %tmp18904, i64 1
+ %tmp18906 = getelementptr inbounds float* %tmp18905, i64 1
+ %tmp18907 = getelementptr inbounds float* %tmp18906, i64 1
+ %tmp18908 = getelementptr inbounds float* %tmp18907, i64 1
+ %tmp18909 = getelementptr inbounds float* %tmp18908, i64 1
+ %tmp18910 = getelementptr inbounds float* %tmp18909, i64 1
+ %tmp18911 = getelementptr inbounds float* %tmp18910, i64 1
+ %tmp18912 = getelementptr inbounds float* %tmp18911, i64 1
+ %tmp18913 = getelementptr inbounds float* %tmp18912, i64 1
+ %tmp18914 = getelementptr inbounds float* %tmp18913, i64 1
+ %tmp18915 = getelementptr inbounds float* %tmp18914, i64 1
+ %tmp18916 = getelementptr inbounds float* %tmp18915, i64 1
+ %tmp18917 = getelementptr inbounds float* %tmp18916, i64 1
+ %tmp18918 = getelementptr inbounds float* %tmp18917, i64 1
+ %tmp18919 = getelementptr inbounds float* %tmp18918, i64 1
+ %tmp18920 = getelementptr inbounds float* %tmp18919, i64 1
+ %tmp18921 = getelementptr inbounds float* %tmp18920, i64 1
+ %tmp18922 = getelementptr inbounds float* %tmp18921, i64 1
+ %tmp18923 = getelementptr inbounds float* %tmp18922, i64 1
+ %tmp18924 = getelementptr inbounds float* %tmp18923, i64 1
+ %tmp18925 = getelementptr inbounds float* %tmp18924, i64 1
+ %tmp18926 = getelementptr inbounds float* %tmp18925, i64 1
+ %tmp18927 = getelementptr inbounds float* %tmp18926, i64 1
+ %tmp18928 = getelementptr inbounds float* %tmp18927, i64 1
+ %tmp18929 = getelementptr inbounds float* %tmp18928, i64 1
+ %tmp18930 = getelementptr inbounds float* %tmp18929, i64 1
+ %tmp18931 = getelementptr inbounds float* %tmp18930, i64 1
+ %tmp18932 = getelementptr inbounds float* %tmp18931, i64 1
+ %tmp18933 = getelementptr inbounds float* %tmp18932, i64 1
+ %tmp18934 = getelementptr inbounds float* %tmp18933, i64 1
+ %tmp18935 = getelementptr inbounds float* %tmp18934, i64 1
+ %tmp18936 = getelementptr inbounds float* %tmp18935, i64 1
+ %tmp18937 = getelementptr inbounds float* %tmp18936, i64 1
+ %tmp18938 = getelementptr inbounds float* %tmp18937, i64 1
+ %tmp18939 = getelementptr inbounds float* %tmp18938, i64 1
+ %tmp18940 = getelementptr inbounds float* %tmp18939, i64 1
+ %tmp18941 = getelementptr inbounds float* %tmp18940, i64 1
+ %tmp18942 = getelementptr inbounds float* %tmp18941, i64 1
+ %tmp18943 = getelementptr inbounds float* %tmp18942, i64 1
+ %tmp18944 = getelementptr inbounds float* %tmp18943, i64 1
+ %tmp18945 = getelementptr inbounds float* %tmp18944, i64 1
+ %tmp18946 = getelementptr inbounds float* %tmp18945, i64 1
+ %tmp18947 = getelementptr inbounds float* %tmp18946, i64 1
+ %tmp18948 = getelementptr inbounds float* %tmp18947, i64 1
+ %tmp18949 = getelementptr inbounds float* %tmp18948, i64 1
+ %tmp18950 = getelementptr inbounds float* %tmp18949, i64 1
+ %tmp18951 = getelementptr inbounds float* %tmp18950, i64 1
+ %tmp18952 = getelementptr inbounds float* %tmp18951, i64 1
+ %tmp18953 = getelementptr inbounds float* %tmp18952, i64 1
+ %tmp18954 = getelementptr inbounds float* %tmp18953, i64 1
+ %tmp18955 = getelementptr inbounds float* %tmp18954, i64 1
+ %tmp18956 = getelementptr inbounds float* %tmp18955, i64 1
+ %tmp18957 = getelementptr inbounds float* %tmp18956, i64 1
+ %tmp18958 = getelementptr inbounds float* %tmp18957, i64 1
+ %tmp18959 = getelementptr inbounds float* %tmp18958, i64 1
+ %tmp18960 = getelementptr inbounds float* %tmp18959, i64 1
+ %tmp18961 = getelementptr inbounds float* %tmp18960, i64 1
+ %tmp18962 = getelementptr inbounds float* %tmp18961, i64 1
+ %tmp18963 = getelementptr inbounds float* %tmp18962, i64 1
+ %tmp18964 = getelementptr inbounds float* %tmp18963, i64 1
+ %tmp18965 = getelementptr inbounds float* %tmp18964, i64 1
+ %tmp18966 = getelementptr inbounds float* %tmp18965, i64 1
+ %tmp18967 = getelementptr inbounds float* %tmp18966, i64 1
+ %tmp18968 = getelementptr inbounds float* %tmp18967, i64 1
+ %tmp18969 = getelementptr inbounds float* %tmp18968, i64 1
+ %tmp18970 = getelementptr inbounds float* %tmp18969, i64 1
+ %tmp18971 = getelementptr inbounds float* %tmp18970, i64 1
+ %tmp18972 = getelementptr inbounds float* %tmp18971, i64 1
+ %tmp18973 = getelementptr inbounds float* %tmp18972, i64 1
+ %tmp18974 = getelementptr inbounds float* %tmp18973, i64 1
+ %tmp18975 = getelementptr inbounds float* %tmp18974, i64 1
+ %tmp18976 = getelementptr inbounds float* %tmp18975, i64 1
+ %tmp18977 = getelementptr inbounds float* %tmp18976, i64 1
+ %tmp18978 = getelementptr inbounds float* %tmp18977, i64 1
+ %tmp18979 = getelementptr inbounds float* %tmp18978, i64 1
+ %tmp18980 = getelementptr inbounds float* %tmp18979, i64 1
+ %tmp18981 = getelementptr inbounds float* %tmp18980, i64 1
+ %tmp18982 = getelementptr inbounds float* %tmp18981, i64 1
+ %tmp18983 = getelementptr inbounds float* %tmp18982, i64 1
+ %tmp18984 = getelementptr inbounds float* %tmp18983, i64 1
+ %tmp18985 = getelementptr inbounds float* %tmp18984, i64 1
+ %tmp18986 = getelementptr inbounds float* %tmp18985, i64 1
+ %tmp18987 = getelementptr inbounds float* %tmp18986, i64 1
+ %tmp18988 = getelementptr inbounds float* %tmp18987, i64 1
+ %tmp18989 = getelementptr inbounds float* %tmp18988, i64 1
+ %tmp18990 = getelementptr inbounds float* %tmp18989, i64 1
+ %tmp18991 = getelementptr inbounds float* %tmp18990, i64 1
+ %tmp18992 = getelementptr inbounds float* %tmp18991, i64 1
+ %tmp18993 = getelementptr inbounds float* %tmp18992, i64 1
+ %tmp18994 = getelementptr inbounds float* %tmp18993, i64 1
+ %tmp18995 = getelementptr inbounds float* %tmp18994, i64 1
+ %tmp18996 = getelementptr inbounds float* %tmp18995, i64 1
+ %tmp18997 = getelementptr inbounds float* %tmp18996, i64 1
+ %tmp18998 = getelementptr inbounds float* %tmp18997, i64 1
+ %tmp18999 = getelementptr inbounds float* %tmp18998, i64 1
+ %tmp19000 = getelementptr inbounds float* %tmp18999, i64 1
+ %tmp19001 = getelementptr inbounds float* %tmp19000, i64 1
+ %tmp19002 = getelementptr inbounds float* %tmp19001, i64 1
+ %tmp19003 = getelementptr inbounds float* %tmp19002, i64 1
+ %tmp19004 = getelementptr inbounds float* %tmp19003, i64 1
+ %tmp19005 = getelementptr inbounds float* %tmp19004, i64 1
+ %tmp19006 = getelementptr inbounds float* %tmp19005, i64 1
+ %tmp19007 = getelementptr inbounds float* %tmp19006, i64 1
+ %tmp19008 = getelementptr inbounds float* %tmp19007, i64 1
+ %tmp19009 = getelementptr inbounds float* %tmp19008, i64 1
+ %tmp19010 = getelementptr inbounds float* %tmp19009, i64 1
+ %tmp19011 = getelementptr inbounds float* %tmp19010, i64 1
+ %tmp19012 = getelementptr inbounds float* %tmp19011, i64 1
+ %tmp19013 = getelementptr inbounds float* %tmp19012, i64 1
+ %tmp19014 = getelementptr inbounds float* %tmp19013, i64 1
+ %tmp19015 = getelementptr inbounds float* %tmp19014, i64 1
+ %tmp19016 = getelementptr inbounds float* %tmp19015, i64 1
+ %tmp19017 = getelementptr inbounds float* %tmp19016, i64 1
+ %tmp19018 = getelementptr inbounds float* %tmp19017, i64 1
+ %tmp19019 = getelementptr inbounds float* %tmp19018, i64 1
+ %tmp19020 = getelementptr inbounds float* %tmp19019, i64 1
+ %tmp19021 = getelementptr inbounds float* %tmp19020, i64 1
+ %tmp19022 = getelementptr inbounds float* %tmp19021, i64 1
+ %tmp19023 = getelementptr inbounds float* %tmp19022, i64 1
+ %tmp19024 = getelementptr inbounds float* %tmp19023, i64 1
+ %tmp19025 = getelementptr inbounds float* %tmp19024, i64 1
+ %tmp19026 = getelementptr inbounds float* %tmp19025, i64 1
+ %tmp19027 = getelementptr inbounds float* %tmp19026, i64 1
+ %tmp19028 = getelementptr inbounds float* %tmp19027, i64 1
+ %tmp19029 = getelementptr inbounds float* %tmp19028, i64 1
+ %tmp19030 = getelementptr inbounds float* %tmp19029, i64 1
+ %tmp19031 = getelementptr inbounds float* %tmp19030, i64 1
+ %tmp19032 = getelementptr inbounds float* %tmp19031, i64 1
+ %tmp19033 = getelementptr inbounds float* %tmp19032, i64 1
+ %tmp19034 = getelementptr inbounds float* %tmp19033, i64 1
+ %tmp19035 = getelementptr inbounds float* %tmp19034, i64 1
+ %tmp19036 = getelementptr inbounds float* %tmp19035, i64 1
+ %tmp19037 = getelementptr inbounds float* %tmp19036, i64 1
+ %tmp19038 = getelementptr inbounds float* %tmp19037, i64 1
+ %tmp19039 = getelementptr inbounds float* %tmp19038, i64 1
+ %tmp19040 = getelementptr inbounds float* %tmp19039, i64 1
+ %tmp19041 = getelementptr inbounds float* %tmp19040, i64 1
+ %tmp19042 = getelementptr inbounds float* %tmp19041, i64 1
+ %tmp19043 = getelementptr inbounds float* %tmp19042, i64 1
+ %tmp19044 = getelementptr inbounds float* %tmp19043, i64 1
+ %tmp19045 = getelementptr inbounds float* %tmp19044, i64 1
+ %tmp19046 = getelementptr inbounds float* %tmp19045, i64 1
+ %tmp19047 = getelementptr inbounds float* %tmp19046, i64 1
+ %tmp19048 = getelementptr inbounds float* %tmp19047, i64 1
+ %tmp19049 = getelementptr inbounds float* %tmp19048, i64 1
+ %tmp19050 = getelementptr inbounds float* %tmp19049, i64 1
+ %tmp19051 = getelementptr inbounds float* %tmp19050, i64 1
+ %tmp19052 = getelementptr inbounds float* %tmp19051, i64 1
+ %tmp19053 = getelementptr inbounds float* %tmp19052, i64 1
+ %tmp19054 = getelementptr inbounds float* %tmp19053, i64 1
+ %tmp19055 = getelementptr inbounds float* %tmp19054, i64 1
+ %tmp19056 = getelementptr inbounds float* %tmp19055, i64 1
+ %tmp19057 = getelementptr inbounds float* %tmp19056, i64 1
+ %tmp19058 = getelementptr inbounds float* %tmp19057, i64 1
+ %tmp19059 = getelementptr inbounds float* %tmp19058, i64 1
+ %tmp19060 = getelementptr inbounds float* %tmp19059, i64 1
+ %tmp19061 = getelementptr inbounds float* %tmp19060, i64 1
+ %tmp19062 = getelementptr inbounds float* %tmp19061, i64 1
+ %tmp19063 = getelementptr inbounds float* %tmp19062, i64 1
+ %tmp19064 = getelementptr inbounds float* %tmp19063, i64 1
+ %tmp19065 = getelementptr inbounds float* %tmp19064, i64 1
+ %tmp19066 = getelementptr inbounds float* %tmp19065, i64 1
+ %tmp19067 = getelementptr inbounds float* %tmp19066, i64 1
+ %tmp19068 = getelementptr inbounds float* %tmp19067, i64 1
+ %tmp19069 = getelementptr inbounds float* %tmp19068, i64 1
+ %tmp19070 = getelementptr inbounds float* %tmp19069, i64 1
+ %tmp19071 = getelementptr inbounds float* %tmp19070, i64 1
+ %tmp19072 = getelementptr inbounds float* %tmp19071, i64 1
+ %tmp19073 = getelementptr inbounds float* %tmp19072, i64 1
+ %tmp19074 = getelementptr inbounds float* %tmp19073, i64 1
+ %tmp19075 = getelementptr inbounds float* %tmp19074, i64 1
+ %tmp19076 = getelementptr inbounds float* %tmp19075, i64 1
+ %tmp19077 = getelementptr inbounds float* %tmp19076, i64 1
+ %tmp19078 = getelementptr inbounds float* %tmp19077, i64 1
+ %tmp19079 = getelementptr inbounds float* %tmp19078, i64 1
+ %tmp19080 = getelementptr inbounds float* %tmp19079, i64 1
+ %tmp19081 = getelementptr inbounds float* %tmp19080, i64 1
+ %tmp19082 = getelementptr inbounds float* %tmp19081, i64 1
+ %tmp19083 = getelementptr inbounds float* %tmp19082, i64 1
+ %tmp19084 = getelementptr inbounds float* %tmp19083, i64 1
+ %tmp19085 = getelementptr inbounds float* %tmp19084, i64 1
+ %tmp19086 = getelementptr inbounds float* %tmp19085, i64 1
+ %tmp19087 = getelementptr inbounds float* %tmp19086, i64 1
+ %tmp19088 = getelementptr inbounds float* %tmp19087, i64 1
+ %tmp19089 = getelementptr inbounds float* %tmp19088, i64 1
+ %tmp19090 = getelementptr inbounds float* %tmp19089, i64 1
+ %tmp19091 = getelementptr inbounds float* %tmp19090, i64 1
+ %tmp19092 = getelementptr inbounds float* %tmp19091, i64 1
+ %tmp19093 = getelementptr inbounds float* %tmp19092, i64 1
+ %tmp19094 = getelementptr inbounds float* %tmp19093, i64 1
+ %tmp19095 = getelementptr inbounds float* %tmp19094, i64 1
+ %tmp19096 = getelementptr inbounds float* %tmp19095, i64 1
+ %tmp19097 = getelementptr inbounds float* %tmp19096, i64 1
+ %tmp19098 = getelementptr inbounds float* %tmp19097, i64 1
+ %tmp19099 = getelementptr inbounds float* %tmp19098, i64 1
+ %tmp19100 = getelementptr inbounds float* %tmp19099, i64 1
+ %tmp19101 = getelementptr inbounds float* %tmp19100, i64 1
+ %tmp19102 = getelementptr inbounds float* %tmp19101, i64 1
+ %tmp19103 = getelementptr inbounds float* %tmp19102, i64 1
+ %tmp19104 = getelementptr inbounds float* %tmp19103, i64 1
+ %tmp19105 = getelementptr inbounds float* %tmp19104, i64 1
+ %tmp19106 = getelementptr inbounds float* %tmp19105, i64 1
+ %tmp19107 = getelementptr inbounds float* %tmp19106, i64 1
+ %tmp19108 = getelementptr inbounds float* %tmp19107, i64 1
+ %tmp19109 = getelementptr inbounds float* %tmp19108, i64 1
+ %tmp19110 = getelementptr inbounds float* %tmp19109, i64 1
+ %tmp19111 = getelementptr inbounds float* %tmp19110, i64 1
+ %tmp19112 = getelementptr inbounds float* %tmp19111, i64 1
+ %tmp19113 = getelementptr inbounds float* %tmp19112, i64 1
+ %tmp19114 = getelementptr inbounds float* %tmp19113, i64 1
+ %tmp19115 = getelementptr inbounds float* %tmp19114, i64 1
+ %tmp19116 = getelementptr inbounds float* %tmp19115, i64 1
+ %tmp19117 = getelementptr inbounds float* %tmp19116, i64 1
+ %tmp19118 = getelementptr inbounds float* %tmp19117, i64 1
+ %tmp19119 = getelementptr inbounds float* %tmp19118, i64 1
+ %tmp19120 = getelementptr inbounds float* %tmp19119, i64 1
+ %tmp19121 = getelementptr inbounds float* %tmp19120, i64 1
+ %tmp19122 = getelementptr inbounds float* %tmp19121, i64 1
+ %tmp19123 = getelementptr inbounds float* %tmp19122, i64 1
+ %tmp19124 = getelementptr inbounds float* %tmp19123, i64 1
+ %tmp19125 = getelementptr inbounds float* %tmp19124, i64 1
+ %tmp19126 = getelementptr inbounds float* %tmp19125, i64 1
+ %tmp19127 = getelementptr inbounds float* %tmp19126, i64 1
+ %tmp19128 = getelementptr inbounds float* %tmp19127, i64 1
+ %tmp19129 = getelementptr inbounds float* %tmp19128, i64 1
+ %tmp19130 = getelementptr inbounds float* %tmp19129, i64 1
+ %tmp19131 = getelementptr inbounds float* %tmp19130, i64 1
+ %tmp19132 = getelementptr inbounds float* %tmp19131, i64 1
+ %tmp19133 = getelementptr inbounds float* %tmp19132, i64 1
+ %tmp19134 = getelementptr inbounds float* %tmp19133, i64 1
+ %tmp19135 = getelementptr inbounds float* %tmp19134, i64 1
+ %tmp19136 = getelementptr inbounds float* %tmp19135, i64 1
+ %tmp19137 = getelementptr inbounds float* %tmp19136, i64 1
+ %tmp19138 = getelementptr inbounds float* %tmp19137, i64 1
+ %tmp19139 = getelementptr inbounds float* %tmp19138, i64 1
+ %tmp19140 = getelementptr inbounds float* %tmp19139, i64 1
+ %tmp19141 = getelementptr inbounds float* %tmp19140, i64 1
+ %tmp19142 = getelementptr inbounds float* %tmp19141, i64 1
+ %tmp19143 = getelementptr inbounds float* %tmp19142, i64 1
+ %tmp19144 = getelementptr inbounds float* %tmp19143, i64 1
+ %tmp19145 = getelementptr inbounds float* %tmp19144, i64 1
+ %tmp19146 = getelementptr inbounds float* %tmp19145, i64 1
+ %tmp19147 = getelementptr inbounds float* %tmp19146, i64 1
+ %tmp19148 = getelementptr inbounds float* %tmp19147, i64 1
+ %tmp19149 = getelementptr inbounds float* %tmp19148, i64 1
+ %tmp19150 = getelementptr inbounds float* %tmp19149, i64 1
+ %tmp19151 = getelementptr inbounds float* %tmp19150, i64 1
+ %tmp19152 = getelementptr inbounds float* %tmp19151, i64 1
+ %tmp19153 = getelementptr inbounds float* %tmp19152, i64 1
+ %tmp19154 = getelementptr inbounds float* %tmp19153, i64 1
+ %tmp19155 = getelementptr inbounds float* %tmp19154, i64 1
+ %tmp19156 = getelementptr inbounds float* %tmp19155, i64 1
+ %tmp19157 = getelementptr inbounds float* %tmp19156, i64 1
+ %tmp19158 = getelementptr inbounds float* %tmp19157, i64 1
+ %tmp19159 = getelementptr inbounds float* %tmp19158, i64 1
+ %tmp19160 = getelementptr inbounds float* %tmp19159, i64 1
+ %tmp19161 = getelementptr inbounds float* %tmp19160, i64 1
+ %tmp19162 = getelementptr inbounds float* %tmp19161, i64 1
+ %tmp19163 = getelementptr inbounds float* %tmp19162, i64 1
+ %tmp19164 = getelementptr inbounds float* %tmp19163, i64 1
+ %tmp19165 = getelementptr inbounds float* %tmp19164, i64 1
+ %tmp19166 = getelementptr inbounds float* %tmp19165, i64 1
+ %tmp19167 = getelementptr inbounds float* %tmp19166, i64 1
+ %tmp19168 = getelementptr inbounds float* %tmp19167, i64 1
+ %tmp19169 = getelementptr inbounds float* %tmp19168, i64 1
+ %tmp19170 = getelementptr inbounds float* %tmp19169, i64 1
+ %tmp19171 = getelementptr inbounds float* %tmp19170, i64 1
+ %tmp19172 = getelementptr inbounds float* %tmp19171, i64 1
+ %tmp19173 = getelementptr inbounds float* %tmp19172, i64 1
+ %tmp19174 = getelementptr inbounds float* %tmp19173, i64 1
+ %tmp19175 = getelementptr inbounds float* %tmp19174, i64 1
+ %tmp19176 = getelementptr inbounds float* %tmp19175, i64 1
+ %tmp19177 = getelementptr inbounds float* %tmp19176, i64 1
+ %tmp19178 = getelementptr inbounds float* %tmp19177, i64 1
+ %tmp19179 = getelementptr inbounds float* %tmp19178, i64 1
+ %tmp19180 = getelementptr inbounds float* %tmp19179, i64 1
+ %tmp19181 = getelementptr inbounds float* %tmp19180, i64 1
+ %tmp19182 = getelementptr inbounds float* %tmp19181, i64 1
+ %tmp19183 = getelementptr inbounds float* %tmp19182, i64 1
+ %tmp19184 = getelementptr inbounds float* %tmp19183, i64 1
+ %tmp19185 = getelementptr inbounds float* %tmp19184, i64 1
+ %tmp19186 = getelementptr inbounds float* %tmp19185, i64 1
+ %tmp19187 = getelementptr inbounds float* %tmp19186, i64 1
+ %tmp19188 = getelementptr inbounds float* %tmp19187, i64 1
+ %tmp19189 = getelementptr inbounds float* %tmp19188, i64 1
+ %tmp19190 = getelementptr inbounds float* %tmp19189, i64 1
+ %tmp19191 = getelementptr inbounds float* %tmp19190, i64 1
+ %tmp19192 = getelementptr inbounds float* %tmp19191, i64 1
+ %tmp19193 = getelementptr inbounds float* %tmp19192, i64 1
+ %tmp19194 = getelementptr inbounds float* %tmp19193, i64 1
+ %tmp19195 = getelementptr inbounds float* %tmp19194, i64 1
+ %tmp19196 = getelementptr inbounds float* %tmp19195, i64 1
+ %tmp19197 = getelementptr inbounds float* %tmp19196, i64 1
+ %tmp19198 = getelementptr inbounds float* %tmp19197, i64 1
+ %tmp19199 = getelementptr inbounds float* %tmp19198, i64 1
+ %tmp19200 = getelementptr inbounds float* %tmp19199, i64 1
+ %tmp19201 = getelementptr inbounds float* %tmp19200, i64 1
+ %tmp19202 = getelementptr inbounds float* %tmp19201, i64 1
+ %tmp19203 = getelementptr inbounds float* %tmp19202, i64 1
+ %tmp19204 = getelementptr inbounds float* %tmp19203, i64 1
+ %tmp19205 = getelementptr inbounds float* %tmp19204, i64 1
+ %tmp19206 = getelementptr inbounds float* %tmp19205, i64 1
+ %tmp19207 = getelementptr inbounds float* %tmp19206, i64 1
+ %tmp19208 = getelementptr inbounds float* %tmp19207, i64 1
+ %tmp19209 = getelementptr inbounds float* %tmp19208, i64 1
+ %tmp19210 = getelementptr inbounds float* %tmp19209, i64 1
+ %tmp19211 = getelementptr inbounds float* %tmp19210, i64 1
+ %tmp19212 = getelementptr inbounds float* %tmp19211, i64 1
+ %tmp19213 = getelementptr inbounds float* %tmp19212, i64 1
+ %tmp19214 = getelementptr inbounds float* %tmp19213, i64 1
+ %tmp19215 = getelementptr inbounds float* %tmp19214, i64 1
+ %tmp19216 = getelementptr inbounds float* %tmp19215, i64 1
+ %tmp19217 = getelementptr inbounds float* %tmp19216, i64 1
+ %tmp19218 = getelementptr inbounds float* %tmp19217, i64 1
+ %tmp19219 = getelementptr inbounds float* %tmp19218, i64 1
+ %tmp19220 = getelementptr inbounds float* %tmp19219, i64 1
+ %tmp19221 = getelementptr inbounds float* %tmp19220, i64 1
+ %tmp19222 = getelementptr inbounds float* %tmp19221, i64 1
+ %tmp19223 = getelementptr inbounds float* %tmp19222, i64 1
+ %tmp19224 = getelementptr inbounds float* %tmp19223, i64 1
+ %tmp19225 = getelementptr inbounds float* %tmp19224, i64 1
+ %tmp19226 = getelementptr inbounds float* %tmp19225, i64 1
+ %tmp19227 = getelementptr inbounds float* %tmp19226, i64 1
+ %tmp19228 = getelementptr inbounds float* %tmp19227, i64 1
+ %tmp19229 = getelementptr inbounds float* %tmp19228, i64 1
+ %tmp19230 = getelementptr inbounds float* %tmp19229, i64 1
+ %tmp19231 = getelementptr inbounds float* %tmp19230, i64 1
+ %tmp19232 = getelementptr inbounds float* %tmp19231, i64 1
+ %tmp19233 = getelementptr inbounds float* %tmp19232, i64 1
+ %tmp19234 = getelementptr inbounds float* %tmp19233, i64 1
+ %tmp19235 = getelementptr inbounds float* %tmp19234, i64 1
+ %tmp19236 = getelementptr inbounds float* %tmp19235, i64 1
+ %tmp19237 = getelementptr inbounds float* %tmp19236, i64 1
+ %tmp19238 = getelementptr inbounds float* %tmp19237, i64 1
+ %tmp19239 = getelementptr inbounds float* %tmp19238, i64 1
+ %tmp19240 = getelementptr inbounds float* %tmp19239, i64 1
+ %tmp19241 = getelementptr inbounds float* %tmp19240, i64 1
+ %tmp19242 = getelementptr inbounds float* %tmp19241, i64 1
+ %tmp19243 = getelementptr inbounds float* %tmp19242, i64 1
+ %tmp19244 = getelementptr inbounds float* %tmp19243, i64 1
+ %tmp19245 = getelementptr inbounds float* %tmp19244, i64 1
+ %tmp19246 = getelementptr inbounds float* %tmp19245, i64 1
+ %tmp19247 = getelementptr inbounds float* %tmp19246, i64 1
+ %tmp19248 = getelementptr inbounds float* %tmp19247, i64 1
+ %tmp19249 = getelementptr inbounds float* %tmp19248, i64 1
+ %tmp19250 = getelementptr inbounds float* %tmp19249, i64 1
+ %tmp19251 = getelementptr inbounds float* %tmp19250, i64 1
+ %tmp19252 = getelementptr inbounds float* %tmp19251, i64 1
+ %tmp19253 = getelementptr inbounds float* %tmp19252, i64 1
+ %tmp19254 = getelementptr inbounds float* %tmp19253, i64 1
+ %tmp19255 = getelementptr inbounds float* %tmp19254, i64 1
+ %tmp19256 = getelementptr inbounds float* %tmp19255, i64 1
+ %tmp19257 = getelementptr inbounds float* %tmp19256, i64 1
+ %tmp19258 = getelementptr inbounds float* %tmp19257, i64 1
+ %tmp19259 = getelementptr inbounds float* %tmp19258, i64 1
+ %tmp19260 = getelementptr inbounds float* %tmp19259, i64 1
+ %tmp19261 = getelementptr inbounds float* %tmp19260, i64 1
+ %tmp19262 = getelementptr inbounds float* %tmp19261, i64 1
+ %tmp19263 = getelementptr inbounds float* %tmp19262, i64 1
+ %tmp19264 = getelementptr inbounds float* %tmp19263, i64 1
+ %tmp19265 = getelementptr inbounds float* %tmp19264, i64 1
+ %tmp19266 = getelementptr inbounds float* %tmp19265, i64 1
+ %tmp19267 = getelementptr inbounds float* %tmp19266, i64 1
+ %tmp19268 = getelementptr inbounds float* %tmp19267, i64 1
+ %tmp19269 = getelementptr inbounds float* %tmp19268, i64 1
+ %tmp19270 = getelementptr inbounds float* %tmp19269, i64 1
+ %tmp19271 = getelementptr inbounds float* %tmp19270, i64 1
+ %tmp19272 = getelementptr inbounds float* %tmp19271, i64 1
+ %tmp19273 = getelementptr inbounds float* %tmp19272, i64 1
+ %tmp19274 = getelementptr inbounds float* %tmp19273, i64 1
+ %tmp19275 = getelementptr inbounds float* %tmp19274, i64 1
+ %tmp19276 = getelementptr inbounds float* %tmp19275, i64 1
+ %tmp19277 = getelementptr inbounds float* %tmp19276, i64 1
+ %tmp19278 = getelementptr inbounds float* %tmp19277, i64 1
+ %tmp19279 = getelementptr inbounds float* %tmp19278, i64 1
+ %tmp19280 = getelementptr inbounds float* %tmp19279, i64 1
+ %tmp19281 = getelementptr inbounds float* %tmp19280, i64 1
+ %tmp19282 = getelementptr inbounds float* %tmp19281, i64 1
+ %tmp19283 = getelementptr inbounds float* %tmp19282, i64 1
+ %tmp19284 = getelementptr inbounds float* %tmp19283, i64 1
+ %tmp19285 = getelementptr inbounds float* %tmp19284, i64 1
+ %tmp19286 = getelementptr inbounds float* %tmp19285, i64 1
+ %tmp19287 = getelementptr inbounds float* %tmp19286, i64 1
+ %tmp19288 = getelementptr inbounds float* %tmp19287, i64 1
+ %tmp19289 = getelementptr inbounds float* %tmp19288, i64 1
+ %tmp19290 = getelementptr inbounds float* %tmp19289, i64 1
+ %tmp19291 = getelementptr inbounds float* %tmp19290, i64 1
+ %tmp19292 = getelementptr inbounds float* %tmp19291, i64 1
+ %tmp19293 = getelementptr inbounds float* %tmp19292, i64 1
+ %tmp19294 = getelementptr inbounds float* %tmp19293, i64 1
+ %tmp19295 = getelementptr inbounds float* %tmp19294, i64 1
+ %tmp19296 = getelementptr inbounds float* %tmp19295, i64 1
+ %tmp19297 = getelementptr inbounds float* %tmp19296, i64 1
+ %tmp19298 = getelementptr inbounds float* %tmp19297, i64 1
+ %tmp19299 = getelementptr inbounds float* %tmp19298, i64 1
+ %tmp19300 = getelementptr inbounds float* %tmp19299, i64 1
+ %tmp19301 = getelementptr inbounds float* %tmp19300, i64 1
+ %tmp19302 = getelementptr inbounds float* %tmp19301, i64 1
+ %tmp19303 = getelementptr inbounds float* %tmp19302, i64 1
+ %tmp19304 = getelementptr inbounds float* %tmp19303, i64 1
+ %tmp19305 = getelementptr inbounds float* %tmp19304, i64 1
+ %tmp19306 = getelementptr inbounds float* %tmp19305, i64 1
+ %tmp19307 = getelementptr inbounds float* %tmp19306, i64 1
+ %tmp19308 = getelementptr inbounds float* %tmp19307, i64 1
+ %tmp19309 = getelementptr inbounds float* %tmp19308, i64 1
+ %tmp19310 = getelementptr inbounds float* %tmp19309, i64 1
+ %tmp19311 = getelementptr inbounds float* %tmp19310, i64 1
+ %tmp19312 = getelementptr inbounds float* %tmp19311, i64 1
+ %tmp19313 = getelementptr inbounds float* %tmp19312, i64 1
+ %tmp19314 = getelementptr inbounds float* %tmp19313, i64 1
+ %tmp19315 = getelementptr inbounds float* %tmp19314, i64 1
+ %tmp19316 = getelementptr inbounds float* %tmp19315, i64 1
+ %tmp19317 = getelementptr inbounds float* %tmp19316, i64 1
+ %tmp19318 = getelementptr inbounds float* %tmp19317, i64 1
+ %tmp19319 = getelementptr inbounds float* %tmp19318, i64 1
+ %tmp19320 = getelementptr inbounds float* %tmp19319, i64 1
+ %tmp19321 = getelementptr inbounds float* %tmp19320, i64 1
+ %tmp19322 = getelementptr inbounds float* %tmp19321, i64 1
+ %tmp19323 = getelementptr inbounds float* %tmp19322, i64 1
+ %tmp19324 = getelementptr inbounds float* %tmp19323, i64 1
+ %tmp19325 = getelementptr inbounds float* %tmp19324, i64 1
+ %tmp19326 = getelementptr inbounds float* %tmp19325, i64 1
+ %tmp19327 = getelementptr inbounds float* %tmp19326, i64 1
+ %tmp19328 = getelementptr inbounds float* %tmp19327, i64 1
+ %tmp19329 = getelementptr inbounds float* %tmp19328, i64 1
+ %tmp19330 = getelementptr inbounds float* %tmp19329, i64 1
+ %tmp19331 = getelementptr inbounds float* %tmp19330, i64 1
+ %tmp19332 = getelementptr inbounds float* %tmp19331, i64 1
+ %tmp19333 = getelementptr inbounds float* %tmp19332, i64 1
+ %tmp19334 = getelementptr inbounds float* %tmp19333, i64 1
+ %tmp19335 = getelementptr inbounds float* %tmp19334, i64 1
+ %tmp19336 = getelementptr inbounds float* %tmp19335, i64 1
+ %tmp19337 = getelementptr inbounds float* %tmp19336, i64 1
+ %tmp19338 = getelementptr inbounds float* %tmp19337, i64 1
+ %tmp19339 = getelementptr inbounds float* %tmp19338, i64 1
+ %tmp19340 = getelementptr inbounds float* %tmp19339, i64 1
+ %tmp19341 = getelementptr inbounds float* %tmp19340, i64 1
+ %tmp19342 = getelementptr inbounds float* %tmp19341, i64 1
+ %tmp19343 = getelementptr inbounds float* %tmp19342, i64 1
+ %tmp19344 = getelementptr inbounds float* %tmp19343, i64 1
+ %tmp19345 = getelementptr inbounds float* %tmp19344, i64 1
+ %tmp19346 = getelementptr inbounds float* %tmp19345, i64 1
+ %tmp19347 = getelementptr inbounds float* %tmp19346, i64 1
+ %tmp19348 = getelementptr inbounds float* %tmp19347, i64 1
+ %tmp19349 = getelementptr inbounds float* %tmp19348, i64 1
+ %tmp19350 = getelementptr inbounds float* %tmp19349, i64 1
+ %tmp19351 = getelementptr inbounds float* %tmp19350, i64 1
+ %tmp19352 = getelementptr inbounds float* %tmp19351, i64 1
+ %tmp19353 = getelementptr inbounds float* %tmp19352, i64 1
+ %tmp19354 = getelementptr inbounds float* %tmp19353, i64 1
+ %tmp19355 = getelementptr inbounds float* %tmp19354, i64 1
+ %tmp19356 = getelementptr inbounds float* %tmp19355, i64 1
+ %tmp19357 = getelementptr inbounds float* %tmp19356, i64 1
+ %tmp19358 = getelementptr inbounds float* %tmp19357, i64 1
+ %tmp19359 = getelementptr inbounds float* %tmp19358, i64 1
+ %tmp19360 = getelementptr inbounds float* %tmp19359, i64 1
+ %tmp19361 = getelementptr inbounds float* %tmp19360, i64 1
+ %tmp19362 = getelementptr inbounds float* %tmp19361, i64 1
+ %tmp19363 = getelementptr inbounds float* %tmp19362, i64 1
+ %tmp19364 = getelementptr inbounds float* %tmp19363, i64 1
+ %tmp19365 = getelementptr inbounds float* %tmp19364, i64 1
+ %tmp19366 = getelementptr inbounds float* %tmp19365, i64 1
+ %tmp19367 = getelementptr inbounds float* %tmp19366, i64 1
+ %tmp19368 = getelementptr inbounds float* %tmp19367, i64 1
+ %tmp19369 = getelementptr inbounds float* %tmp19368, i64 1
+ %tmp19370 = getelementptr inbounds float* %tmp19369, i64 1
+ %tmp19371 = getelementptr inbounds float* %tmp19370, i64 1
+ %tmp19372 = getelementptr inbounds float* %tmp19371, i64 1
+ %tmp19373 = getelementptr inbounds float* %tmp19372, i64 1
+ %tmp19374 = getelementptr inbounds float* %tmp19373, i64 1
+ %tmp19375 = getelementptr inbounds float* %tmp19374, i64 1
+ %tmp19376 = getelementptr inbounds float* %tmp19375, i64 1
+ %tmp19377 = getelementptr inbounds float* %tmp19376, i64 1
+ %tmp19378 = getelementptr inbounds float* %tmp19377, i64 1
+ %tmp19379 = getelementptr inbounds float* %tmp19378, i64 1
+ %tmp19380 = getelementptr inbounds float* %tmp19379, i64 1
+ %tmp19381 = getelementptr inbounds float* %tmp19380, i64 1
+ %tmp19382 = getelementptr inbounds float* %tmp19381, i64 1
+ %tmp19383 = getelementptr inbounds float* %tmp19382, i64 1
+ %tmp19384 = getelementptr inbounds float* %tmp19383, i64 1
+ %tmp19385 = getelementptr inbounds float* %tmp19384, i64 1
+ %tmp19386 = getelementptr inbounds float* %tmp19385, i64 1
+ %tmp19387 = getelementptr inbounds float* %tmp19386, i64 1
+ %tmp19388 = getelementptr inbounds float* %tmp19387, i64 1
+ %tmp19389 = getelementptr inbounds float* %tmp19388, i64 1
+ %tmp19390 = getelementptr inbounds float* %tmp19389, i64 1
+ %tmp19391 = getelementptr inbounds float* %tmp19390, i64 1
+ %tmp19392 = getelementptr inbounds float* %tmp19391, i64 1
+ %tmp19393 = getelementptr inbounds float* %tmp19392, i64 1
+ %tmp19394 = getelementptr inbounds float* %tmp19393, i64 1
+ %tmp19395 = getelementptr inbounds float* %tmp19394, i64 1
+ %tmp19396 = getelementptr inbounds float* %tmp19395, i64 1
+ %tmp19397 = getelementptr inbounds float* %tmp19396, i64 1
+ %tmp19398 = getelementptr inbounds float* %tmp19397, i64 1
+ %tmp19399 = getelementptr inbounds float* %tmp19398, i64 1
+ %tmp19400 = getelementptr inbounds float* %tmp19399, i64 1
+ %tmp19401 = getelementptr inbounds float* %tmp19400, i64 1
+ %tmp19402 = getelementptr inbounds float* %tmp19401, i64 1
+ %tmp19403 = getelementptr inbounds float* %tmp19402, i64 1
+ %tmp19404 = getelementptr inbounds float* %tmp19403, i64 1
+ %tmp19405 = getelementptr inbounds float* %tmp19404, i64 1
+ %tmp19406 = getelementptr inbounds float* %tmp19405, i64 1
+ %tmp19407 = getelementptr inbounds float* %tmp19406, i64 1
+ %tmp19408 = getelementptr inbounds float* %tmp19407, i64 1
+ %tmp19409 = getelementptr inbounds float* %tmp19408, i64 1
+ %tmp19410 = getelementptr inbounds float* %tmp19409, i64 1
+ %tmp19411 = getelementptr inbounds float* %tmp19410, i64 1
+ %tmp19412 = getelementptr inbounds float* %tmp19411, i64 1
+ %tmp19413 = getelementptr inbounds float* %tmp19412, i64 1
+ %tmp19414 = getelementptr inbounds float* %tmp19413, i64 1
+ %tmp19415 = getelementptr inbounds float* %tmp19414, i64 1
+ %tmp19416 = getelementptr inbounds float* %tmp19415, i64 1
+ %tmp19417 = getelementptr inbounds float* %tmp19416, i64 1
+ %tmp19418 = getelementptr inbounds float* %tmp19417, i64 1
+ %tmp19419 = getelementptr inbounds float* %tmp19418, i64 1
+ %tmp19420 = getelementptr inbounds float* %tmp19419, i64 1
+ %tmp19421 = getelementptr inbounds float* %tmp19420, i64 1
+ %tmp19422 = getelementptr inbounds float* %tmp19421, i64 1
+ %tmp19423 = getelementptr inbounds float* %tmp19422, i64 1
+ %tmp19424 = getelementptr inbounds float* %tmp19423, i64 1
+ %tmp19425 = getelementptr inbounds float* %tmp19424, i64 1
+ %tmp19426 = getelementptr inbounds float* %tmp19425, i64 1
+ %tmp19427 = getelementptr inbounds float* %tmp19426, i64 1
+ %tmp19428 = getelementptr inbounds float* %tmp19427, i64 1
+ %tmp19429 = getelementptr inbounds float* %tmp19428, i64 1
+ %tmp19430 = getelementptr inbounds float* %tmp19429, i64 1
+ %tmp19431 = getelementptr inbounds float* %tmp19430, i64 1
+ %tmp19432 = getelementptr inbounds float* %tmp19431, i64 1
+ %tmp19433 = getelementptr inbounds float* %tmp19432, i64 1
+ %tmp19434 = getelementptr inbounds float* %tmp19433, i64 1
+ %tmp19435 = getelementptr inbounds float* %tmp19434, i64 1
+ %tmp19436 = getelementptr inbounds float* %tmp19435, i64 1
+ %tmp19437 = getelementptr inbounds float* %tmp19436, i64 1
+ %tmp19438 = getelementptr inbounds float* %tmp19437, i64 1
+ %tmp19439 = getelementptr inbounds float* %tmp19438, i64 1
+ %tmp19440 = getelementptr inbounds float* %tmp19439, i64 1
+ %tmp19441 = getelementptr inbounds float* %tmp19440, i64 1
+ %tmp19442 = getelementptr inbounds float* %tmp19441, i64 1
+ %tmp19443 = getelementptr inbounds float* %tmp19442, i64 1
+ %tmp19444 = getelementptr inbounds float* %tmp19443, i64 1
+ %tmp19445 = getelementptr inbounds float* %tmp19444, i64 1
+ %tmp19446 = getelementptr inbounds float* %tmp19445, i64 1
+ %tmp19447 = getelementptr inbounds float* %tmp19446, i64 1
+ %tmp19448 = getelementptr inbounds float* %tmp19447, i64 1
+ %tmp19449 = getelementptr inbounds float* %tmp19448, i64 1
+ %tmp19450 = getelementptr inbounds float* %tmp19449, i64 1
+ %tmp19451 = getelementptr inbounds float* %tmp19450, i64 1
+ %tmp19452 = getelementptr inbounds float* %tmp19451, i64 1
+ %tmp19453 = getelementptr inbounds float* %tmp19452, i64 1
+ %tmp19454 = getelementptr inbounds float* %tmp19453, i64 1
+ %tmp19455 = getelementptr inbounds float* %tmp19454, i64 1
+ %tmp19456 = getelementptr inbounds float* %tmp19455, i64 1
+ %tmp19457 = getelementptr inbounds float* %tmp19456, i64 1
+ %tmp19458 = getelementptr inbounds float* %tmp19457, i64 1
+ %tmp19459 = getelementptr inbounds float* %tmp19458, i64 1
+ %tmp19460 = getelementptr inbounds float* %tmp19459, i64 1
+ %tmp19461 = getelementptr inbounds float* %tmp19460, i64 1
+ %tmp19462 = getelementptr inbounds float* %tmp19461, i64 1
+ %tmp19463 = getelementptr inbounds float* %tmp19462, i64 1
+ %tmp19464 = getelementptr inbounds float* %tmp19463, i64 1
+ %tmp19465 = getelementptr inbounds float* %tmp19464, i64 1
+ %tmp19466 = getelementptr inbounds float* %tmp19465, i64 1
+ %tmp19467 = getelementptr inbounds float* %tmp19466, i64 1
+ %tmp19468 = getelementptr inbounds float* %tmp19467, i64 1
+ %tmp19469 = getelementptr inbounds float* %tmp19468, i64 1
+ %tmp19470 = getelementptr inbounds float* %tmp19469, i64 1
+ %tmp19471 = getelementptr inbounds float* %tmp19470, i64 1
+ %tmp19472 = getelementptr inbounds float* %tmp19471, i64 1
+ %tmp19473 = getelementptr inbounds float* %tmp19472, i64 1
+ %tmp19474 = getelementptr inbounds float* %tmp19473, i64 1
+ %tmp19475 = getelementptr inbounds float* %tmp19474, i64 1
+ %tmp19476 = getelementptr inbounds float* %tmp19475, i64 1
+ %tmp19477 = getelementptr inbounds float* %tmp19476, i64 1
+ %tmp19478 = getelementptr inbounds float* %tmp19477, i64 1
+ %tmp19479 = getelementptr inbounds float* %tmp19478, i64 1
+ %tmp19480 = getelementptr inbounds float* %tmp19479, i64 1
+ %tmp19481 = getelementptr inbounds float* %tmp19480, i64 1
+ %tmp19482 = getelementptr inbounds float* %tmp19481, i64 1
+ %tmp19483 = getelementptr inbounds float* %tmp19482, i64 1
+ %tmp19484 = getelementptr inbounds float* %tmp19483, i64 1
+ %tmp19485 = getelementptr inbounds float* %tmp19484, i64 1
+ %tmp19486 = getelementptr inbounds float* %tmp19485, i64 1
+ %tmp19487 = getelementptr inbounds float* %tmp19486, i64 1
+ %tmp19488 = getelementptr inbounds float* %tmp19487, i64 1
+ %tmp19489 = getelementptr inbounds float* %tmp19488, i64 1
+ %tmp19490 = getelementptr inbounds float* %tmp19489, i64 1
+ %tmp19491 = getelementptr inbounds float* %tmp19490, i64 1
+ %tmp19492 = getelementptr inbounds float* %tmp19491, i64 1
+ %tmp19493 = getelementptr inbounds float* %tmp19492, i64 1
+ %tmp19494 = getelementptr inbounds float* %tmp19493, i64 1
+ %tmp19495 = getelementptr inbounds float* %tmp19494, i64 1
+ %tmp19496 = getelementptr inbounds float* %tmp19495, i64 1
+ %tmp19497 = getelementptr inbounds float* %tmp19496, i64 1
+ %tmp19498 = getelementptr inbounds float* %tmp19497, i64 1
+ %tmp19499 = getelementptr inbounds float* %tmp19498, i64 1
+ %tmp19500 = getelementptr inbounds float* %tmp19499, i64 1
+ %tmp19501 = getelementptr inbounds float* %tmp19500, i64 1
+ %tmp19502 = getelementptr inbounds float* %tmp19501, i64 1
+ %tmp19503 = getelementptr inbounds float* %tmp19502, i64 1
+ %tmp19504 = getelementptr inbounds float* %tmp19503, i64 1
+ %tmp19505 = getelementptr inbounds float* %tmp19504, i64 1
+ %tmp19506 = getelementptr inbounds float* %tmp19505, i64 1
+ %tmp19507 = getelementptr inbounds float* %tmp19506, i64 1
+ %tmp19508 = getelementptr inbounds float* %tmp19507, i64 1
+ %tmp19509 = getelementptr inbounds float* %tmp19508, i64 1
+ %tmp19510 = getelementptr inbounds float* %tmp19509, i64 1
+ %tmp19511 = getelementptr inbounds float* %tmp19510, i64 1
+ %tmp19512 = getelementptr inbounds float* %tmp19511, i64 1
+ %tmp19513 = getelementptr inbounds float* %tmp19512, i64 1
+ %tmp19514 = getelementptr inbounds float* %tmp19513, i64 1
+ %tmp19515 = getelementptr inbounds float* %tmp19514, i64 1
+ %tmp19516 = getelementptr inbounds float* %tmp19515, i64 1
+ %tmp19517 = getelementptr inbounds float* %tmp19516, i64 1
+ %tmp19518 = getelementptr inbounds float* %tmp19517, i64 1
+ %tmp19519 = getelementptr inbounds float* %tmp19518, i64 1
+ %tmp19520 = getelementptr inbounds float* %tmp19519, i64 1
+ %tmp19521 = getelementptr inbounds float* %tmp19520, i64 1
+ %tmp19522 = getelementptr inbounds float* %tmp19521, i64 1
+ %tmp19523 = getelementptr inbounds float* %tmp19522, i64 1
+ %tmp19524 = getelementptr inbounds float* %tmp19523, i64 1
+ %tmp19525 = getelementptr inbounds float* %tmp19524, i64 1
+ %tmp19526 = getelementptr inbounds float* %tmp19525, i64 1
+ %tmp19527 = getelementptr inbounds float* %tmp19526, i64 1
+ %tmp19528 = getelementptr inbounds float* %tmp19527, i64 1
+ %tmp19529 = getelementptr inbounds float* %tmp19528, i64 1
+ %tmp19530 = getelementptr inbounds float* %tmp19529, i64 1
+ %tmp19531 = getelementptr inbounds float* %tmp19530, i64 1
+ %tmp19532 = getelementptr inbounds float* %tmp19531, i64 1
+ %tmp19533 = getelementptr inbounds float* %tmp19532, i64 1
+ %tmp19534 = getelementptr inbounds float* %tmp19533, i64 1
+ %tmp19535 = getelementptr inbounds float* %tmp19534, i64 1
+ %tmp19536 = getelementptr inbounds float* %tmp19535, i64 1
+ %tmp19537 = getelementptr inbounds float* %tmp19536, i64 1
+ %tmp19538 = getelementptr inbounds float* %tmp19537, i64 1
+ %tmp19539 = getelementptr inbounds float* %tmp19538, i64 1
+ %tmp19540 = getelementptr inbounds float* %tmp19539, i64 1
+ %tmp19541 = getelementptr inbounds float* %tmp19540, i64 1
+ %tmp19542 = getelementptr inbounds float* %tmp19541, i64 1
+ %tmp19543 = getelementptr inbounds float* %tmp19542, i64 1
+ %tmp19544 = getelementptr inbounds float* %tmp19543, i64 1
+ %tmp19545 = getelementptr inbounds float* %tmp19544, i64 1
+ %tmp19546 = getelementptr inbounds float* %tmp19545, i64 1
+ %tmp19547 = getelementptr inbounds float* %tmp19546, i64 1
+ %tmp19548 = getelementptr inbounds float* %tmp19547, i64 1
+ %tmp19549 = getelementptr inbounds float* %tmp19548, i64 1
+ %tmp19550 = getelementptr inbounds float* %tmp19549, i64 1
+ %tmp19551 = getelementptr inbounds float* %tmp19550, i64 1
+ %tmp19552 = getelementptr inbounds float* %tmp19551, i64 1
+ %tmp19553 = getelementptr inbounds float* %tmp19552, i64 1
+ %tmp19554 = getelementptr inbounds float* %tmp19553, i64 1
+ %tmp19555 = getelementptr inbounds float* %tmp19554, i64 1
+ %tmp19556 = getelementptr inbounds float* %tmp19555, i64 1
+ %tmp19557 = getelementptr inbounds float* %tmp19556, i64 1
+ %tmp19558 = getelementptr inbounds float* %tmp19557, i64 1
+ %tmp19559 = getelementptr inbounds float* %tmp19558, i64 1
+ %tmp19560 = getelementptr inbounds float* %tmp19559, i64 1
+ %tmp19561 = getelementptr inbounds float* %tmp19560, i64 1
+ %tmp19562 = getelementptr inbounds float* %tmp19561, i64 1
+ %tmp19563 = getelementptr inbounds float* %tmp19562, i64 1
+ %tmp19564 = getelementptr inbounds float* %tmp19563, i64 1
+ %tmp19565 = getelementptr inbounds float* %tmp19564, i64 1
+ %tmp19566 = getelementptr inbounds float* %tmp19565, i64 1
+ %tmp19567 = getelementptr inbounds float* %tmp19566, i64 1
+ %tmp19568 = getelementptr inbounds float* %tmp19567, i64 1
+ %tmp19569 = getelementptr inbounds float* %tmp19568, i64 1
+ %tmp19570 = getelementptr inbounds float* %tmp19569, i64 1
+ %tmp19571 = getelementptr inbounds float* %tmp19570, i64 1
+ %tmp19572 = getelementptr inbounds float* %tmp19571, i64 1
+ %tmp19573 = getelementptr inbounds float* %tmp19572, i64 1
+ %tmp19574 = getelementptr inbounds float* %tmp19573, i64 1
+ %tmp19575 = getelementptr inbounds float* %tmp19574, i64 1
+ %tmp19576 = getelementptr inbounds float* %tmp19575, i64 1
+ %tmp19577 = getelementptr inbounds float* %tmp19576, i64 1
+ %tmp19578 = getelementptr inbounds float* %tmp19577, i64 1
+ %tmp19579 = getelementptr inbounds float* %tmp19578, i64 1
+ %tmp19580 = getelementptr inbounds float* %tmp19579, i64 1
+ %tmp19581 = getelementptr inbounds float* %tmp19580, i64 1
+ %tmp19582 = getelementptr inbounds float* %tmp19581, i64 1
+ %tmp19583 = getelementptr inbounds float* %tmp19582, i64 1
+ %tmp19584 = getelementptr inbounds float* %tmp19583, i64 1
+ %tmp19585 = getelementptr inbounds float* %tmp19584, i64 1
+ %tmp19586 = getelementptr inbounds float* %tmp19585, i64 1
+ %tmp19587 = getelementptr inbounds float* %tmp19586, i64 1
+ %tmp19588 = getelementptr inbounds float* %tmp19587, i64 1
+ %tmp19589 = getelementptr inbounds float* %tmp19588, i64 1
+ %tmp19590 = getelementptr inbounds float* %tmp19589, i64 1
+ %tmp19591 = getelementptr inbounds float* %tmp19590, i64 1
+ %tmp19592 = getelementptr inbounds float* %tmp19591, i64 1
+ %tmp19593 = getelementptr inbounds float* %tmp19592, i64 1
+ %tmp19594 = getelementptr inbounds float* %tmp19593, i64 1
+ %tmp19595 = getelementptr inbounds float* %tmp19594, i64 1
+ %tmp19596 = getelementptr inbounds float* %tmp19595, i64 1
+ %tmp19597 = getelementptr inbounds float* %tmp19596, i64 1
+ %tmp19598 = getelementptr inbounds float* %tmp19597, i64 1
+ %tmp19599 = getelementptr inbounds float* %tmp19598, i64 1
+ %tmp19600 = getelementptr inbounds float* %tmp19599, i64 1
+ %tmp19601 = getelementptr inbounds float* %tmp19600, i64 1
+ %tmp19602 = getelementptr inbounds float* %tmp19601, i64 1
+ %tmp19603 = getelementptr inbounds float* %tmp19602, i64 1
+ %tmp19604 = getelementptr inbounds float* %tmp19603, i64 1
+ %tmp19605 = getelementptr inbounds float* %tmp19604, i64 1
+ %tmp19606 = getelementptr inbounds float* %tmp19605, i64 1
+ %tmp19607 = getelementptr inbounds float* %tmp19606, i64 1
+ %tmp19608 = getelementptr inbounds float* %tmp19607, i64 1
+ %tmp19609 = getelementptr inbounds float* %tmp19608, i64 1
+ %tmp19610 = getelementptr inbounds float* %tmp19609, i64 1
+ %tmp19611 = getelementptr inbounds float* %tmp19610, i64 1
+ %tmp19612 = getelementptr inbounds float* %tmp19611, i64 1
+ %tmp19613 = getelementptr inbounds float* %tmp19612, i64 1
+ %tmp19614 = getelementptr inbounds float* %tmp19613, i64 1
+ %tmp19615 = getelementptr inbounds float* %tmp19614, i64 1
+ %tmp19616 = getelementptr inbounds float* %tmp19615, i64 1
+ %tmp19617 = getelementptr inbounds float* %tmp19616, i64 1
+ %tmp19618 = getelementptr inbounds float* %tmp19617, i64 1
+ %tmp19619 = getelementptr inbounds float* %tmp19618, i64 1
+ %tmp19620 = getelementptr inbounds float* %tmp19619, i64 1
+ %tmp19621 = getelementptr inbounds float* %tmp19620, i64 1
+ %tmp19622 = getelementptr inbounds float* %tmp19621, i64 1
+ %tmp19623 = getelementptr inbounds float* %tmp19622, i64 1
+ %tmp19624 = getelementptr inbounds float* %tmp19623, i64 1
+ %tmp19625 = getelementptr inbounds float* %tmp19624, i64 1
+ %tmp19626 = getelementptr inbounds float* %tmp19625, i64 1
+ %tmp19627 = getelementptr inbounds float* %tmp19626, i64 1
+ %tmp19628 = getelementptr inbounds float* %tmp19627, i64 1
+ %tmp19629 = getelementptr inbounds float* %tmp19628, i64 1
+ %tmp19630 = getelementptr inbounds float* %tmp19629, i64 1
+ %tmp19631 = getelementptr inbounds float* %tmp19630, i64 1
+ %tmp19632 = getelementptr inbounds float* %tmp19631, i64 1
+ %tmp19633 = getelementptr inbounds float* %tmp19632, i64 1
+ %tmp19634 = getelementptr inbounds float* %tmp19633, i64 1
+ %tmp19635 = getelementptr inbounds float* %tmp19634, i64 1
+ %tmp19636 = getelementptr inbounds float* %tmp19635, i64 1
+ %tmp19637 = getelementptr inbounds float* %tmp19636, i64 1
+ %tmp19638 = getelementptr inbounds float* %tmp19637, i64 1
+ %tmp19639 = getelementptr inbounds float* %tmp19638, i64 1
+ %tmp19640 = getelementptr inbounds float* %tmp19639, i64 1
+ %tmp19641 = getelementptr inbounds float* %tmp19640, i64 1
+ %tmp19642 = getelementptr inbounds float* %tmp19641, i64 1
+ %tmp19643 = getelementptr inbounds float* %tmp19642, i64 1
+ %tmp19644 = getelementptr inbounds float* %tmp19643, i64 1
+ %tmp19645 = getelementptr inbounds float* %tmp19644, i64 1
+ %tmp19646 = getelementptr inbounds float* %tmp19645, i64 1
+ %tmp19647 = getelementptr inbounds float* %tmp19646, i64 1
+ %tmp19648 = getelementptr inbounds float* %tmp19647, i64 1
+ %tmp19649 = getelementptr inbounds float* %tmp19648, i64 1
+ %tmp19650 = getelementptr inbounds float* %tmp19649, i64 1
+ %tmp19651 = getelementptr inbounds float* %tmp19650, i64 1
+ %tmp19652 = getelementptr inbounds float* %tmp19651, i64 1
+ %tmp19653 = getelementptr inbounds float* %tmp19652, i64 1
+ %tmp19654 = getelementptr inbounds float* %tmp19653, i64 1
+ %tmp19655 = getelementptr inbounds float* %tmp19654, i64 1
+ %tmp19656 = getelementptr inbounds float* %tmp19655, i64 1
+ %tmp19657 = getelementptr inbounds float* %tmp19656, i64 1
+ %tmp19658 = getelementptr inbounds float* %tmp19657, i64 1
+ %tmp19659 = getelementptr inbounds float* %tmp19658, i64 1
+ %tmp19660 = getelementptr inbounds float* %tmp19659, i64 1
+ %tmp19661 = getelementptr inbounds float* %tmp19660, i64 1
+ %tmp19662 = getelementptr inbounds float* %tmp19661, i64 1
+ %tmp19663 = getelementptr inbounds float* %tmp19662, i64 1
+ %tmp19664 = getelementptr inbounds float* %tmp19663, i64 1
+ %tmp19665 = getelementptr inbounds float* %tmp19664, i64 1
+ %tmp19666 = getelementptr inbounds float* %tmp19665, i64 1
+ %tmp19667 = getelementptr inbounds float* %tmp19666, i64 1
+ %tmp19668 = getelementptr inbounds float* %tmp19667, i64 1
+ %tmp19669 = getelementptr inbounds float* %tmp19668, i64 1
+ %tmp19670 = getelementptr inbounds float* %tmp19669, i64 1
+ %tmp19671 = getelementptr inbounds float* %tmp19670, i64 1
+ %tmp19672 = getelementptr inbounds float* %tmp19671, i64 1
+ %tmp19673 = getelementptr inbounds float* %tmp19672, i64 1
+ %tmp19674 = getelementptr inbounds float* %tmp19673, i64 1
+ %tmp19675 = getelementptr inbounds float* %tmp19674, i64 1
+ %tmp19676 = getelementptr inbounds float* %tmp19675, i64 1
+ %tmp19677 = getelementptr inbounds float* %tmp19676, i64 1
+ %tmp19678 = getelementptr inbounds float* %tmp19677, i64 1
+ %tmp19679 = getelementptr inbounds float* %tmp19678, i64 1
+ %tmp19680 = getelementptr inbounds float* %tmp19679, i64 1
+ %tmp19681 = getelementptr inbounds float* %tmp19680, i64 1
+ %tmp19682 = getelementptr inbounds float* %tmp19681, i64 1
+ %tmp19683 = getelementptr inbounds float* %tmp19682, i64 1
+ %tmp19684 = getelementptr inbounds float* %tmp19683, i64 1
+ %tmp19685 = getelementptr inbounds float* %tmp19684, i64 1
+ %tmp19686 = getelementptr inbounds float* %tmp19685, i64 1
+ %tmp19687 = getelementptr inbounds float* %tmp19686, i64 1
+ %tmp19688 = getelementptr inbounds float* %tmp19687, i64 1
+ %tmp19689 = getelementptr inbounds float* %tmp19688, i64 1
+ %tmp19690 = getelementptr inbounds float* %tmp19689, i64 1
+ %tmp19691 = getelementptr inbounds float* %tmp19690, i64 1
+ %tmp19692 = getelementptr inbounds float* %tmp19691, i64 1
+ %tmp19693 = getelementptr inbounds float* %tmp19692, i64 1
+ %tmp19694 = getelementptr inbounds float* %tmp19693, i64 1
+ %tmp19695 = getelementptr inbounds float* %tmp19694, i64 1
+ %tmp19696 = getelementptr inbounds float* %tmp19695, i64 1
+ %tmp19697 = getelementptr inbounds float* %tmp19696, i64 1
+ %tmp19698 = getelementptr inbounds float* %tmp19697, i64 1
+ %tmp19699 = getelementptr inbounds float* %tmp19698, i64 1
+ %tmp19700 = getelementptr inbounds float* %tmp19699, i64 1
+ %tmp19701 = getelementptr inbounds float* %tmp19700, i64 1
+ %tmp19702 = getelementptr inbounds float* %tmp19701, i64 1
+ %tmp19703 = getelementptr inbounds float* %tmp19702, i64 1
+ %tmp19704 = getelementptr inbounds float* %tmp19703, i64 1
+ %tmp19705 = getelementptr inbounds float* %tmp19704, i64 1
+ %tmp19706 = getelementptr inbounds float* %tmp19705, i64 1
+ %tmp19707 = getelementptr inbounds float* %tmp19706, i64 1
+ %tmp19708 = getelementptr inbounds float* %tmp19707, i64 1
+ %tmp19709 = getelementptr inbounds float* %tmp19708, i64 1
+ %tmp19710 = getelementptr inbounds float* %tmp19709, i64 1
+ %tmp19711 = getelementptr inbounds float* %tmp19710, i64 1
+ %tmp19712 = getelementptr inbounds float* %tmp19711, i64 1
+ %tmp19713 = getelementptr inbounds float* %tmp19712, i64 1
+ %tmp19714 = getelementptr inbounds float* %tmp19713, i64 1
+ %tmp19715 = getelementptr inbounds float* %tmp19714, i64 1
+ %tmp19716 = getelementptr inbounds float* %tmp19715, i64 1
+ %tmp19717 = getelementptr inbounds float* %tmp19716, i64 1
+ %tmp19718 = getelementptr inbounds float* %tmp19717, i64 1
+ %tmp19719 = getelementptr inbounds float* %tmp19718, i64 1
+ %tmp19720 = getelementptr inbounds float* %tmp19719, i64 1
+ %tmp19721 = getelementptr inbounds float* %tmp19720, i64 1
+ %tmp19722 = getelementptr inbounds float* %tmp19721, i64 1
+ %tmp19723 = getelementptr inbounds float* %tmp19722, i64 1
+ %tmp19724 = getelementptr inbounds float* %tmp19723, i64 1
+ %tmp19725 = getelementptr inbounds float* %tmp19724, i64 1
+ %tmp19726 = getelementptr inbounds float* %tmp19725, i64 1
+ %tmp19727 = getelementptr inbounds float* %tmp19726, i64 1
+ %tmp19728 = getelementptr inbounds float* %tmp19727, i64 1
+ %tmp19729 = getelementptr inbounds float* %tmp19728, i64 1
+ %tmp19730 = getelementptr inbounds float* %tmp19729, i64 1
+ %tmp19731 = getelementptr inbounds float* %tmp19730, i64 1
+ %tmp19732 = getelementptr inbounds float* %tmp19731, i64 1
+ %tmp19733 = getelementptr inbounds float* %tmp19732, i64 1
+ %tmp19734 = getelementptr inbounds float* %tmp19733, i64 1
+ %tmp19735 = getelementptr inbounds float* %tmp19734, i64 1
+ %tmp19736 = getelementptr inbounds float* %tmp19735, i64 1
+ %tmp19737 = getelementptr inbounds float* %tmp19736, i64 1
+ %tmp19738 = getelementptr inbounds float* %tmp19737, i64 1
+ %tmp19739 = getelementptr inbounds float* %tmp19738, i64 1
+ %tmp19740 = getelementptr inbounds float* %tmp19739, i64 1
+ %tmp19741 = getelementptr inbounds float* %tmp19740, i64 1
+ %tmp19742 = getelementptr inbounds float* %tmp19741, i64 1
+ %tmp19743 = getelementptr inbounds float* %tmp19742, i64 1
+ %tmp19744 = getelementptr inbounds float* %tmp19743, i64 1
+ %tmp19745 = getelementptr inbounds float* %tmp19744, i64 1
+ %tmp19746 = getelementptr inbounds float* %tmp19745, i64 1
+ %tmp19747 = getelementptr inbounds float* %tmp19746, i64 1
+ %tmp19748 = getelementptr inbounds float* %tmp19747, i64 1
+ %tmp19749 = getelementptr inbounds float* %tmp19748, i64 1
+ %tmp19750 = getelementptr inbounds float* %tmp19749, i64 1
+ %tmp19751 = getelementptr inbounds float* %tmp19750, i64 1
+ %tmp19752 = getelementptr inbounds float* %tmp19751, i64 1
+ %tmp19753 = getelementptr inbounds float* %tmp19752, i64 1
+ %tmp19754 = getelementptr inbounds float* %tmp19753, i64 1
+ %tmp19755 = getelementptr inbounds float* %tmp19754, i64 1
+ %tmp19756 = getelementptr inbounds float* %tmp19755, i64 1
+ %tmp19757 = getelementptr inbounds float* %tmp19756, i64 1
+ %tmp19758 = getelementptr inbounds float* %tmp19757, i64 1
+ %tmp19759 = getelementptr inbounds float* %tmp19758, i64 1
+ %tmp19760 = getelementptr inbounds float* %tmp19759, i64 1
+ %tmp19761 = getelementptr inbounds float* %tmp19760, i64 1
+ %tmp19762 = getelementptr inbounds float* %tmp19761, i64 1
+ %tmp19763 = getelementptr inbounds float* %tmp19762, i64 1
+ %tmp19764 = getelementptr inbounds float* %tmp19763, i64 1
+ %tmp19765 = getelementptr inbounds float* %tmp19764, i64 1
+ %tmp19766 = getelementptr inbounds float* %tmp19765, i64 1
+ %tmp19767 = getelementptr inbounds float* %tmp19766, i64 1
+ %tmp19768 = getelementptr inbounds float* %tmp19767, i64 1
+ %tmp19769 = getelementptr inbounds float* %tmp19768, i64 1
+ %tmp19770 = getelementptr inbounds float* %tmp19769, i64 1
+ %tmp19771 = getelementptr inbounds float* %tmp19770, i64 1
+ %tmp19772 = getelementptr inbounds float* %tmp19771, i64 1
+ %tmp19773 = getelementptr inbounds float* %tmp19772, i64 1
+ %tmp19774 = getelementptr inbounds float* %tmp19773, i64 1
+ %tmp19775 = getelementptr inbounds float* %tmp19774, i64 1
+ %tmp19776 = getelementptr inbounds float* %tmp19775, i64 1
+ %tmp19777 = getelementptr inbounds float* %tmp19776, i64 1
+ %tmp19778 = getelementptr inbounds float* %tmp19777, i64 1
+ %tmp19779 = getelementptr inbounds float* %tmp19778, i64 1
+ %tmp19780 = getelementptr inbounds float* %tmp19779, i64 1
+ %tmp19781 = getelementptr inbounds float* %tmp19780, i64 1
+ %tmp19782 = getelementptr inbounds float* %tmp19781, i64 1
+ %tmp19783 = getelementptr inbounds float* %tmp19782, i64 1
+ %tmp19784 = getelementptr inbounds float* %tmp19783, i64 1
+ %tmp19785 = getelementptr inbounds float* %tmp19784, i64 1
+ %tmp19786 = getelementptr inbounds float* %tmp19785, i64 1
+ %tmp19787 = getelementptr inbounds float* %tmp19786, i64 1
+ %tmp19788 = getelementptr inbounds float* %tmp19787, i64 1
+ %tmp19789 = getelementptr inbounds float* %tmp19788, i64 1
+ %tmp19790 = getelementptr inbounds float* %tmp19789, i64 1
+ %tmp19791 = getelementptr inbounds float* %tmp19790, i64 1
+ %tmp19792 = getelementptr inbounds float* %tmp19791, i64 1
+ %tmp19793 = getelementptr inbounds float* %tmp19792, i64 1
+ %tmp19794 = getelementptr inbounds float* %tmp19793, i64 1
+ %tmp19795 = getelementptr inbounds float* %tmp19794, i64 1
+ %tmp19796 = getelementptr inbounds float* %tmp19795, i64 1
+ %tmp19797 = getelementptr inbounds float* %tmp19796, i64 1
+ %tmp19798 = getelementptr inbounds float* %tmp19797, i64 1
+ %tmp19799 = getelementptr inbounds float* %tmp19798, i64 1
+ %tmp19800 = getelementptr inbounds float* %tmp19799, i64 1
+ %tmp19801 = getelementptr inbounds float* %tmp19800, i64 1
+ %tmp19802 = getelementptr inbounds float* %tmp19801, i64 1
+ %tmp19803 = getelementptr inbounds float* %tmp19802, i64 1
+ %tmp19804 = getelementptr inbounds float* %tmp19803, i64 1
+ %tmp19805 = getelementptr inbounds float* %tmp19804, i64 1
+ %tmp19806 = getelementptr inbounds float* %tmp19805, i64 1
+ %tmp19807 = getelementptr inbounds float* %tmp19806, i64 1
+ %tmp19808 = getelementptr inbounds float* %tmp19807, i64 1
+ %tmp19809 = getelementptr inbounds float* %tmp19808, i64 1
+ %tmp19810 = getelementptr inbounds float* %tmp19809, i64 1
+ %tmp19811 = getelementptr inbounds float* %tmp19810, i64 1
+ %tmp19812 = getelementptr inbounds float* %tmp19811, i64 1
+ %tmp19813 = getelementptr inbounds float* %tmp19812, i64 1
+ %tmp19814 = getelementptr inbounds float* %tmp19813, i64 1
+ %tmp19815 = getelementptr inbounds float* %tmp19814, i64 1
+ %tmp19816 = getelementptr inbounds float* %tmp19815, i64 1
+ %tmp19817 = getelementptr inbounds float* %tmp19816, i64 1
+ %tmp19818 = getelementptr inbounds float* %tmp19817, i64 1
+ %tmp19819 = getelementptr inbounds float* %tmp19818, i64 1
+ %tmp19820 = getelementptr inbounds float* %tmp19819, i64 1
+ %tmp19821 = getelementptr inbounds float* %tmp19820, i64 1
+ %tmp19822 = getelementptr inbounds float* %tmp19821, i64 1
+ %tmp19823 = getelementptr inbounds float* %tmp19822, i64 1
+ %tmp19824 = getelementptr inbounds float* %tmp19823, i64 1
+ %tmp19825 = getelementptr inbounds float* %tmp19824, i64 1
+ %tmp19826 = getelementptr inbounds float* %tmp19825, i64 1
+ %tmp19827 = getelementptr inbounds float* %tmp19826, i64 1
+ %tmp19828 = getelementptr inbounds float* %tmp19827, i64 1
+ %tmp19829 = getelementptr inbounds float* %tmp19828, i64 1
+ %tmp19830 = getelementptr inbounds float* %tmp19829, i64 1
+ %tmp19831 = getelementptr inbounds float* %tmp19830, i64 1
+ %tmp19832 = getelementptr inbounds float* %tmp19831, i64 1
+ %tmp19833 = getelementptr inbounds float* %tmp19832, i64 1
+ %tmp19834 = getelementptr inbounds float* %tmp19833, i64 1
+ %tmp19835 = getelementptr inbounds float* %tmp19834, i64 1
+ %tmp19836 = getelementptr inbounds float* %tmp19835, i64 1
+ %tmp19837 = getelementptr inbounds float* %tmp19836, i64 1
+ %tmp19838 = getelementptr inbounds float* %tmp19837, i64 1
+ %tmp19839 = getelementptr inbounds float* %tmp19838, i64 1
+ %tmp19840 = getelementptr inbounds float* %tmp19839, i64 1
+ %tmp19841 = getelementptr inbounds float* %tmp19840, i64 1
+ %tmp19842 = getelementptr inbounds float* %tmp19841, i64 1
+ %tmp19843 = getelementptr inbounds float* %tmp19842, i64 1
+ %tmp19844 = getelementptr inbounds float* %tmp19843, i64 1
+ %tmp19845 = getelementptr inbounds float* %tmp19844, i64 1
+ %tmp19846 = getelementptr inbounds float* %tmp19845, i64 1
+ %tmp19847 = getelementptr inbounds float* %tmp19846, i64 1
+ %tmp19848 = getelementptr inbounds float* %tmp19847, i64 1
+ %tmp19849 = getelementptr inbounds float* %tmp19848, i64 1
+ %tmp19850 = getelementptr inbounds float* %tmp19849, i64 1
+ %tmp19851 = getelementptr inbounds float* %tmp19850, i64 1
+ %tmp19852 = getelementptr inbounds float* %tmp19851, i64 1
+ %tmp19853 = getelementptr inbounds float* %tmp19852, i64 1
+ %tmp19854 = getelementptr inbounds float* %tmp19853, i64 1
+ %tmp19855 = getelementptr inbounds float* %tmp19854, i64 1
+ %tmp19856 = getelementptr inbounds float* %tmp19855, i64 1
+ %tmp19857 = getelementptr inbounds float* %tmp19856, i64 1
+ %tmp19858 = getelementptr inbounds float* %tmp19857, i64 1
+ %tmp19859 = getelementptr inbounds float* %tmp19858, i64 1
+ %tmp19860 = getelementptr inbounds float* %tmp19859, i64 1
+ %tmp19861 = getelementptr inbounds float* %tmp19860, i64 1
+ %tmp19862 = getelementptr inbounds float* %tmp19861, i64 1
+ %tmp19863 = getelementptr inbounds float* %tmp19862, i64 1
+ %tmp19864 = getelementptr inbounds float* %tmp19863, i64 1
+ %tmp19865 = getelementptr inbounds float* %tmp19864, i64 1
+ %tmp19866 = getelementptr inbounds float* %tmp19865, i64 1
+ %tmp19867 = getelementptr inbounds float* %tmp19866, i64 1
+ %tmp19868 = getelementptr inbounds float* %tmp19867, i64 1
+ %tmp19869 = getelementptr inbounds float* %tmp19868, i64 1
+ %tmp19870 = getelementptr inbounds float* %tmp19869, i64 1
+ %tmp19871 = getelementptr inbounds float* %tmp19870, i64 1
+ %tmp19872 = getelementptr inbounds float* %tmp19871, i64 1
+ %tmp19873 = getelementptr inbounds float* %tmp19872, i64 1
+ %tmp19874 = getelementptr inbounds float* %tmp19873, i64 1
+ %tmp19875 = getelementptr inbounds float* %tmp19874, i64 1
+ %tmp19876 = getelementptr inbounds float* %tmp19875, i64 1
+ %tmp19877 = getelementptr inbounds float* %tmp19876, i64 1
+ %tmp19878 = getelementptr inbounds float* %tmp19877, i64 1
+ %tmp19879 = getelementptr inbounds float* %tmp19878, i64 1
+ %tmp19880 = getelementptr inbounds float* %tmp19879, i64 1
+ %tmp19881 = getelementptr inbounds float* %tmp19880, i64 1
+ %tmp19882 = getelementptr inbounds float* %tmp19881, i64 1
+ %tmp19883 = getelementptr inbounds float* %tmp19882, i64 1
+ %tmp19884 = getelementptr inbounds float* %tmp19883, i64 1
+ %tmp19885 = getelementptr inbounds float* %tmp19884, i64 1
+ %tmp19886 = getelementptr inbounds float* %tmp19885, i64 1
+ %tmp19887 = getelementptr inbounds float* %tmp19886, i64 1
+ %tmp19888 = getelementptr inbounds float* %tmp19887, i64 1
+ %tmp19889 = getelementptr inbounds float* %tmp19888, i64 1
+ %tmp19890 = getelementptr inbounds float* %tmp19889, i64 1
+ %tmp19891 = getelementptr inbounds float* %tmp19890, i64 1
+ %tmp19892 = getelementptr inbounds float* %tmp19891, i64 1
+ %tmp19893 = getelementptr inbounds float* %tmp19892, i64 1
+ %tmp19894 = getelementptr inbounds float* %tmp19893, i64 1
+ %tmp19895 = getelementptr inbounds float* %tmp19894, i64 1
+ %tmp19896 = getelementptr inbounds float* %tmp19895, i64 1
+ %tmp19897 = getelementptr inbounds float* %tmp19896, i64 1
+ %tmp19898 = getelementptr inbounds float* %tmp19897, i64 1
+ %tmp19899 = getelementptr inbounds float* %tmp19898, i64 1
+ %tmp19900 = getelementptr inbounds float* %tmp19899, i64 1
+ %tmp19901 = getelementptr inbounds float* %tmp19900, i64 1
+ %tmp19902 = getelementptr inbounds float* %tmp19901, i64 1
+ %tmp19903 = getelementptr inbounds float* %tmp19902, i64 1
+ %tmp19904 = getelementptr inbounds float* %tmp19903, i64 1
+ %tmp19905 = getelementptr inbounds float* %tmp19904, i64 1
+ %tmp19906 = getelementptr inbounds float* %tmp19905, i64 1
+ %tmp19907 = getelementptr inbounds float* %tmp19906, i64 1
+ %tmp19908 = getelementptr inbounds float* %tmp19907, i64 1
+ %tmp19909 = getelementptr inbounds float* %tmp19908, i64 1
+ %tmp19910 = getelementptr inbounds float* %tmp19909, i64 1
+ %tmp19911 = getelementptr inbounds float* %tmp19910, i64 1
+ %tmp19912 = getelementptr inbounds float* %tmp19911, i64 1
+ %tmp19913 = getelementptr inbounds float* %tmp19912, i64 1
+ %tmp19914 = getelementptr inbounds float* %tmp19913, i64 1
+ %tmp19915 = getelementptr inbounds float* %tmp19914, i64 1
+ %tmp19916 = getelementptr inbounds float* %tmp19915, i64 1
+ %tmp19917 = getelementptr inbounds float* %tmp19916, i64 1
+ %tmp19918 = getelementptr inbounds float* %tmp19917, i64 1
+ %tmp19919 = getelementptr inbounds float* %tmp19918, i64 1
+ %tmp19920 = getelementptr inbounds float* %tmp19919, i64 1
+ %tmp19921 = getelementptr inbounds float* %tmp19920, i64 1
+ %tmp19922 = getelementptr inbounds float* %tmp19921, i64 1
+ %tmp19923 = getelementptr inbounds float* %tmp19922, i64 1
+ %tmp19924 = getelementptr inbounds float* %tmp19923, i64 1
+ %tmp19925 = getelementptr inbounds float* %tmp19924, i64 1
+ %tmp19926 = getelementptr inbounds float* %tmp19925, i64 1
+ %tmp19927 = getelementptr inbounds float* %tmp19926, i64 1
+ %tmp19928 = getelementptr inbounds float* %tmp19927, i64 1
+ %tmp19929 = getelementptr inbounds float* %tmp19928, i64 1
+ %tmp19930 = getelementptr inbounds float* %tmp19929, i64 1
+ %tmp19931 = getelementptr inbounds float* %tmp19930, i64 1
+ %tmp19932 = getelementptr inbounds float* %tmp19931, i64 1
+ %tmp19933 = getelementptr inbounds float* %tmp19932, i64 1
+ %tmp19934 = getelementptr inbounds float* %tmp19933, i64 1
+ %tmp19935 = getelementptr inbounds float* %tmp19934, i64 1
+ %tmp19936 = getelementptr inbounds float* %tmp19935, i64 1
+ %tmp19937 = getelementptr inbounds float* %tmp19936, i64 1
+ %tmp19938 = getelementptr inbounds float* %tmp19937, i64 1
+ %tmp19939 = getelementptr inbounds float* %tmp19938, i64 1
+ %tmp19940 = getelementptr inbounds float* %tmp19939, i64 1
+ %tmp19941 = getelementptr inbounds float* %tmp19940, i64 1
+ %tmp19942 = getelementptr inbounds float* %tmp19941, i64 1
+ %tmp19943 = getelementptr inbounds float* %tmp19942, i64 1
+ %tmp19944 = getelementptr inbounds float* %tmp19943, i64 1
+ %tmp19945 = getelementptr inbounds float* %tmp19944, i64 1
+ %tmp19946 = getelementptr inbounds float* %tmp19945, i64 1
+ %tmp19947 = getelementptr inbounds float* %tmp19946, i64 1
+ %tmp19948 = getelementptr inbounds float* %tmp19947, i64 1
+ %tmp19949 = getelementptr inbounds float* %tmp19948, i64 1
+ %tmp19950 = getelementptr inbounds float* %tmp19949, i64 1
+ %tmp19951 = getelementptr inbounds float* %tmp19950, i64 1
+ %tmp19952 = getelementptr inbounds float* %tmp19951, i64 1
+ %tmp19953 = getelementptr inbounds float* %tmp19952, i64 1
+ %tmp19954 = getelementptr inbounds float* %tmp19953, i64 1
+ %tmp19955 = getelementptr inbounds float* %tmp19954, i64 1
+ %tmp19956 = getelementptr inbounds float* %tmp19955, i64 1
+ %tmp19957 = getelementptr inbounds float* %tmp19956, i64 1
+ %tmp19958 = getelementptr inbounds float* %tmp19957, i64 1
+ %tmp19959 = getelementptr inbounds float* %tmp19958, i64 1
+ %tmp19960 = getelementptr inbounds float* %tmp19959, i64 1
+ %tmp19961 = getelementptr inbounds float* %tmp19960, i64 1
+ %tmp19962 = getelementptr inbounds float* %tmp19961, i64 1
+ %tmp19963 = getelementptr inbounds float* %tmp19962, i64 1
+ %tmp19964 = getelementptr inbounds float* %tmp19963, i64 1
+ %tmp19965 = getelementptr inbounds float* %tmp19964, i64 1
+ %tmp19966 = getelementptr inbounds float* %tmp19965, i64 1
+ %tmp19967 = getelementptr inbounds float* %tmp19966, i64 1
+ %tmp19968 = getelementptr inbounds float* %tmp19967, i64 1
+ %tmp19969 = getelementptr inbounds float* %tmp19968, i64 1
+ %tmp19970 = getelementptr inbounds float* %tmp19969, i64 1
+ %tmp19971 = getelementptr inbounds float* %tmp19970, i64 1
+ %tmp19972 = getelementptr inbounds float* %tmp19971, i64 1
+ %tmp19973 = getelementptr inbounds float* %tmp19972, i64 1
+ %tmp19974 = getelementptr inbounds float* %tmp19973, i64 1
+ %tmp19975 = getelementptr inbounds float* %tmp19974, i64 1
+ %tmp19976 = getelementptr inbounds float* %tmp19975, i64 1
+ %tmp19977 = getelementptr inbounds float* %tmp19976, i64 1
+ %tmp19978 = getelementptr inbounds float* %tmp19977, i64 1
+ %tmp19979 = getelementptr inbounds float* %tmp19978, i64 1
+ %tmp19980 = getelementptr inbounds float* %tmp19979, i64 1
+ %tmp19981 = getelementptr inbounds float* %tmp19980, i64 1
+ %tmp19982 = getelementptr inbounds float* %tmp19981, i64 1
+ %tmp19983 = getelementptr inbounds float* %tmp19982, i64 1
+ %tmp19984 = getelementptr inbounds float* %tmp19983, i64 1
+ %tmp19985 = getelementptr inbounds float* %tmp19984, i64 1
+ %tmp19986 = getelementptr inbounds float* %tmp19985, i64 1
+ %tmp19987 = getelementptr inbounds float* %tmp19986, i64 1
+ %tmp19988 = getelementptr inbounds float* %tmp19987, i64 1
+ %tmp19989 = getelementptr inbounds float* %tmp19988, i64 1
+ %tmp19990 = getelementptr inbounds float* %tmp19989, i64 1
+ %tmp19991 = getelementptr inbounds float* %tmp19990, i64 1
+ %tmp19992 = getelementptr inbounds float* %tmp19991, i64 1
+ %tmp19993 = getelementptr inbounds float* %tmp19992, i64 1
+ %tmp19994 = getelementptr inbounds float* %tmp19993, i64 1
+ %tmp19995 = getelementptr inbounds float* %tmp19994, i64 1
+ %tmp19996 = getelementptr inbounds float* %tmp19995, i64 1
+ %tmp19997 = getelementptr inbounds float* %tmp19996, i64 1
+ %tmp19998 = getelementptr inbounds float* %tmp19997, i64 1
+ %tmp19999 = getelementptr inbounds float* %tmp19998, i64 1
+ %tmp20000 = getelementptr inbounds float* %tmp19999, i64 1
+ %tmp20001 = getelementptr inbounds float* %tmp20000, i64 1
+ %tmp20002 = getelementptr inbounds float* %tmp20001, i64 1
+ %tmp20003 = getelementptr inbounds float* %tmp20002, i64 1
+ %tmp20004 = getelementptr inbounds float* %tmp20003, i64 1
+ %tmp20005 = getelementptr inbounds float* %tmp20004, i64 1
+ %tmp20006 = getelementptr inbounds float* %tmp20005, i64 1
+ %tmp20007 = getelementptr inbounds float* %tmp20006, i64 1
+ %tmp20008 = getelementptr inbounds float* %tmp20007, i64 1
+ %tmp20009 = getelementptr inbounds float* %tmp20008, i64 1
+ %tmp20010 = getelementptr inbounds float* %tmp20009, i64 1
+ %tmp20011 = getelementptr inbounds float* %tmp20010, i64 1
+ %tmp20012 = getelementptr inbounds float* %tmp20011, i64 1
+ %tmp20013 = getelementptr inbounds float* %tmp20012, i64 1
+ %tmp20014 = getelementptr inbounds float* %tmp20013, i64 1
+ %tmp20015 = getelementptr inbounds float* %tmp20014, i64 1
+ %tmp20016 = getelementptr inbounds float* %tmp20015, i64 1
+ %tmp20017 = getelementptr inbounds float* %tmp20016, i64 1
+ %tmp20018 = getelementptr inbounds float* %tmp20017, i64 1
+ %tmp20019 = getelementptr inbounds float* %tmp20018, i64 1
+ %tmp20020 = getelementptr inbounds float* %tmp20019, i64 1
+ %tmp20021 = getelementptr inbounds float* %tmp20020, i64 1
+ %tmp20022 = getelementptr inbounds float* %tmp20021, i64 1
+ %tmp20023 = getelementptr inbounds float* %tmp20022, i64 1
+ %tmp20024 = getelementptr inbounds float* %tmp20023, i64 1
+ %tmp20025 = getelementptr inbounds float* %tmp20024, i64 1
+ %tmp20026 = getelementptr inbounds float* %tmp20025, i64 1
+ %tmp20027 = getelementptr inbounds float* %tmp20026, i64 1
+ %tmp20028 = getelementptr inbounds float* %tmp20027, i64 1
+ %tmp20029 = getelementptr inbounds float* %tmp20028, i64 1
+ %tmp20030 = getelementptr inbounds float* %tmp20029, i64 1
+ %tmp20031 = getelementptr inbounds float* %tmp20030, i64 1
+ %tmp20032 = getelementptr inbounds float* %tmp20031, i64 1
+ %tmp20033 = getelementptr inbounds float* %tmp20032, i64 1
+ %tmp20034 = getelementptr inbounds float* %tmp20033, i64 1
+ %tmp20035 = getelementptr inbounds float* %tmp20034, i64 1
+ %tmp20036 = getelementptr inbounds float* %tmp20035, i64 1
+ %tmp20037 = getelementptr inbounds float* %tmp20036, i64 1
+ %tmp20038 = getelementptr inbounds float* %tmp20037, i64 1
+ %tmp20039 = getelementptr inbounds float* %tmp20038, i64 1
+ %tmp20040 = getelementptr inbounds float* %tmp20039, i64 1
+ %tmp20041 = getelementptr inbounds float* %tmp20040, i64 1
+ %tmp20042 = getelementptr inbounds float* %tmp20041, i64 1
+ %tmp20043 = getelementptr inbounds float* %tmp20042, i64 1
+ %tmp20044 = getelementptr inbounds float* %tmp20043, i64 1
+ %tmp20045 = getelementptr inbounds float* %tmp20044, i64 1
+ %tmp20046 = getelementptr inbounds float* %tmp20045, i64 1
+ %tmp20047 = getelementptr inbounds float* %tmp20046, i64 1
+ %tmp20048 = getelementptr inbounds float* %tmp20047, i64 1
+ %tmp20049 = getelementptr inbounds float* %tmp20048, i64 1
+ %tmp20050 = getelementptr inbounds float* %tmp20049, i64 1
+ %tmp20051 = getelementptr inbounds float* %tmp20050, i64 1
+ %tmp20052 = getelementptr inbounds float* %tmp20051, i64 1
+ %tmp20053 = getelementptr inbounds float* %tmp20052, i64 1
+ %tmp20054 = getelementptr inbounds float* %tmp20053, i64 1
+ %tmp20055 = getelementptr inbounds float* %tmp20054, i64 1
+ %tmp20056 = getelementptr inbounds float* %tmp20055, i64 1
+ %tmp20057 = getelementptr inbounds float* %tmp20056, i64 1
+ %tmp20058 = getelementptr inbounds float* %tmp20057, i64 1
+ %tmp20059 = getelementptr inbounds float* %tmp20058, i64 1
+ %tmp20060 = getelementptr inbounds float* %tmp20059, i64 1
+ %tmp20061 = getelementptr inbounds float* %tmp20060, i64 1
+ %tmp20062 = getelementptr inbounds float* %tmp20061, i64 1
+ %tmp20063 = getelementptr inbounds float* %tmp20062, i64 1
+ %tmp20064 = getelementptr inbounds float* %tmp20063, i64 1
+ %tmp20065 = getelementptr inbounds float* %tmp20064, i64 1
+ %tmp20066 = getelementptr inbounds float* %tmp20065, i64 1
+ %tmp20067 = getelementptr inbounds float* %tmp20066, i64 1
+ %tmp20068 = getelementptr inbounds float* %tmp20067, i64 1
+ %tmp20069 = getelementptr inbounds float* %tmp20068, i64 1
+ %tmp20070 = getelementptr inbounds float* %tmp20069, i64 1
+ %tmp20071 = getelementptr inbounds float* %tmp20070, i64 1
+ %tmp20072 = getelementptr inbounds float* %tmp20071, i64 1
+ %tmp20073 = getelementptr inbounds float* %tmp20072, i64 1
+ %tmp20074 = getelementptr inbounds float* %tmp20073, i64 1
+ %tmp20075 = getelementptr inbounds float* %tmp20074, i64 1
+ %tmp20076 = getelementptr inbounds float* %tmp20075, i64 1
+ %tmp20077 = getelementptr inbounds float* %tmp20076, i64 1
+ %tmp20078 = getelementptr inbounds float* %tmp20077, i64 1
+ %tmp20079 = getelementptr inbounds float* %tmp20078, i64 1
+ %tmp20080 = getelementptr inbounds float* %tmp20079, i64 1
+ %tmp20081 = getelementptr inbounds float* %tmp20080, i64 1
+ %tmp20082 = getelementptr inbounds float* %tmp20081, i64 1
+ %tmp20083 = getelementptr inbounds float* %tmp20082, i64 1
+ %tmp20084 = getelementptr inbounds float* %tmp20083, i64 1
+ %tmp20085 = getelementptr inbounds float* %tmp20084, i64 1
+ %tmp20086 = getelementptr inbounds float* %tmp20085, i64 1
+ %tmp20087 = getelementptr inbounds float* %tmp20086, i64 1
+ %tmp20088 = getelementptr inbounds float* %tmp20087, i64 1
+ %tmp20089 = getelementptr inbounds float* %tmp20088, i64 1
+ %tmp20090 = getelementptr inbounds float* %tmp20089, i64 1
+ %tmp20091 = getelementptr inbounds float* %tmp20090, i64 1
+ %tmp20092 = getelementptr inbounds float* %tmp20091, i64 1
+ %tmp20093 = getelementptr inbounds float* %tmp20092, i64 1
+ %tmp20094 = getelementptr inbounds float* %tmp20093, i64 1
+ %tmp20095 = getelementptr inbounds float* %tmp20094, i64 1
+ %tmp20096 = getelementptr inbounds float* %tmp20095, i64 1
+ %tmp20097 = getelementptr inbounds float* %tmp20096, i64 1
+ %tmp20098 = getelementptr inbounds float* %tmp20097, i64 1
+ %tmp20099 = getelementptr inbounds float* %tmp20098, i64 1
+ %tmp20100 = getelementptr inbounds float* %tmp20099, i64 1
+ %tmp20101 = getelementptr inbounds float* %tmp20100, i64 1
+ %tmp20102 = getelementptr inbounds float* %tmp20101, i64 1
+ %tmp20103 = getelementptr inbounds float* %tmp20102, i64 1
+ %tmp20104 = getelementptr inbounds float* %tmp20103, i64 1
+ %tmp20105 = getelementptr inbounds float* %tmp20104, i64 1
+ %tmp20106 = getelementptr inbounds float* %tmp20105, i64 1
+ %tmp20107 = getelementptr inbounds float* %tmp20106, i64 1
+ %tmp20108 = getelementptr inbounds float* %tmp20107, i64 1
+ %tmp20109 = getelementptr inbounds float* %tmp20108, i64 1
+ %tmp20110 = getelementptr inbounds float* %tmp20109, i64 1
+ %tmp20111 = getelementptr inbounds float* %tmp20110, i64 1
+ %tmp20112 = getelementptr inbounds float* %tmp20111, i64 1
+ %tmp20113 = getelementptr inbounds float* %tmp20112, i64 1
+ %tmp20114 = getelementptr inbounds float* %tmp20113, i64 1
+ %tmp20115 = getelementptr inbounds float* %tmp20114, i64 1
+ %tmp20116 = getelementptr inbounds float* %tmp20115, i64 1
+ %tmp20117 = getelementptr inbounds float* %tmp20116, i64 1
+ %tmp20118 = getelementptr inbounds float* %tmp20117, i64 1
+ %tmp20119 = getelementptr inbounds float* %tmp20118, i64 1
+ %tmp20120 = getelementptr inbounds float* %tmp20119, i64 1
+ %tmp20121 = getelementptr inbounds float* %tmp20120, i64 1
+ %tmp20122 = getelementptr inbounds float* %tmp20121, i64 1
+ %tmp20123 = getelementptr inbounds float* %tmp20122, i64 1
+ %tmp20124 = getelementptr inbounds float* %tmp20123, i64 1
+ %tmp20125 = getelementptr inbounds float* %tmp20124, i64 1
+ %tmp20126 = getelementptr inbounds float* %tmp20125, i64 1
+ %tmp20127 = getelementptr inbounds float* %tmp20126, i64 1
+ %tmp20128 = getelementptr inbounds float* %tmp20127, i64 1
+ %tmp20129 = getelementptr inbounds float* %tmp20128, i64 1
+ %tmp20130 = getelementptr inbounds float* %tmp20129, i64 1
+ %tmp20131 = getelementptr inbounds float* %tmp20130, i64 1
+ %tmp20132 = getelementptr inbounds float* %tmp20131, i64 1
+ %tmp20133 = getelementptr inbounds float* %tmp20132, i64 1
+ %tmp20134 = getelementptr inbounds float* %tmp20133, i64 1
+ %tmp20135 = getelementptr inbounds float* %tmp20134, i64 1
+ %tmp20136 = getelementptr inbounds float* %tmp20135, i64 1
+ %tmp20137 = getelementptr inbounds float* %tmp20136, i64 1
+ %tmp20138 = getelementptr inbounds float* %tmp20137, i64 1
+ %tmp20139 = getelementptr inbounds float* %tmp20138, i64 1
+ %tmp20140 = getelementptr inbounds float* %tmp20139, i64 1
+ %tmp20141 = getelementptr inbounds float* %tmp20140, i64 1
+ %tmp20142 = getelementptr inbounds float* %tmp20141, i64 1
+ %tmp20143 = getelementptr inbounds float* %tmp20142, i64 1
+ %tmp20144 = getelementptr inbounds float* %tmp20143, i64 1
+ %tmp20145 = getelementptr inbounds float* %tmp20144, i64 1
+ %tmp20146 = getelementptr inbounds float* %tmp20145, i64 1
+ %tmp20147 = getelementptr inbounds float* %tmp20146, i64 1
+ %tmp20148 = getelementptr inbounds float* %tmp20147, i64 1
+ %tmp20149 = getelementptr inbounds float* %tmp20148, i64 1
+ %tmp20150 = getelementptr inbounds float* %tmp20149, i64 1
+ %tmp20151 = getelementptr inbounds float* %tmp20150, i64 1
+ %tmp20152 = getelementptr inbounds float* %tmp20151, i64 1
+ %tmp20153 = getelementptr inbounds float* %tmp20152, i64 1
+ %tmp20154 = getelementptr inbounds float* %tmp20153, i64 1
+ %tmp20155 = getelementptr inbounds float* %tmp20154, i64 1
+ %tmp20156 = getelementptr inbounds float* %tmp20155, i64 1
+ %tmp20157 = getelementptr inbounds float* %tmp20156, i64 1
+ %tmp20158 = getelementptr inbounds float* %tmp20157, i64 1
+ %tmp20159 = getelementptr inbounds float* %tmp20158, i64 1
+ %tmp20160 = getelementptr inbounds float* %tmp20159, i64 1
+ %tmp20161 = getelementptr inbounds float* %tmp20160, i64 1
+ %tmp20162 = getelementptr inbounds float* %tmp20161, i64 1
+ %tmp20163 = getelementptr inbounds float* %tmp20162, i64 1
+ %tmp20164 = getelementptr inbounds float* %tmp20163, i64 1
+ %tmp20165 = getelementptr inbounds float* %tmp20164, i64 1
+ %tmp20166 = getelementptr inbounds float* %tmp20165, i64 1
+ %tmp20167 = getelementptr inbounds float* %tmp20166, i64 1
+ %tmp20168 = getelementptr inbounds float* %tmp20167, i64 1
+ %tmp20169 = getelementptr inbounds float* %tmp20168, i64 1
+ %tmp20170 = getelementptr inbounds float* %tmp20169, i64 1
+ %tmp20171 = getelementptr inbounds float* %tmp20170, i64 1
+ %tmp20172 = getelementptr inbounds float* %tmp20171, i64 1
+ %tmp20173 = getelementptr inbounds float* %tmp20172, i64 1
+ %tmp20174 = getelementptr inbounds float* %tmp20173, i64 1
+ %tmp20175 = getelementptr inbounds float* %tmp20174, i64 1
+ %tmp20176 = getelementptr inbounds float* %tmp20175, i64 1
+ %tmp20177 = getelementptr inbounds float* %tmp20176, i64 1
+ %tmp20178 = getelementptr inbounds float* %tmp20177, i64 1
+ %tmp20179 = getelementptr inbounds float* %tmp20178, i64 1
+ %tmp20180 = getelementptr inbounds float* %tmp20179, i64 1
+ %tmp20181 = getelementptr inbounds float* %tmp20180, i64 1
+ %tmp20182 = getelementptr inbounds float* %tmp20181, i64 1
+ %tmp20183 = getelementptr inbounds float* %tmp20182, i64 1
+ %tmp20184 = getelementptr inbounds float* %tmp20183, i64 1
+ %tmp20185 = getelementptr inbounds float* %tmp20184, i64 1
+ %tmp20186 = getelementptr inbounds float* %tmp20185, i64 1
+ %tmp20187 = getelementptr inbounds float* %tmp20186, i64 1
+ %tmp20188 = getelementptr inbounds float* %tmp20187, i64 1
+ %tmp20189 = getelementptr inbounds float* %tmp20188, i64 1
+ %tmp20190 = getelementptr inbounds float* %tmp20189, i64 1
+ %tmp20191 = getelementptr inbounds float* %tmp20190, i64 1
+ %tmp20192 = getelementptr inbounds float* %tmp20191, i64 1
+ %tmp20193 = getelementptr inbounds float* %tmp20192, i64 1
+ %tmp20194 = getelementptr inbounds float* %tmp20193, i64 1
+ %tmp20195 = getelementptr inbounds float* %tmp20194, i64 1
+ %tmp20196 = getelementptr inbounds float* %tmp20195, i64 1
+ %tmp20197 = getelementptr inbounds float* %tmp20196, i64 1
+ %tmp20198 = getelementptr inbounds float* %tmp20197, i64 1
+ %tmp20199 = getelementptr inbounds float* %tmp20198, i64 1
+ %tmp20200 = getelementptr inbounds float* %tmp20199, i64 1
+ %tmp20201 = getelementptr inbounds float* %tmp20200, i64 1
+ %tmp20202 = getelementptr inbounds float* %tmp20201, i64 1
+ %tmp20203 = getelementptr inbounds float* %tmp20202, i64 1
+ %tmp20204 = getelementptr inbounds float* %tmp20203, i64 1
+ %tmp20205 = getelementptr inbounds float* %tmp20204, i64 1
+ %tmp20206 = getelementptr inbounds float* %tmp20205, i64 1
+ %tmp20207 = getelementptr inbounds float* %tmp20206, i64 1
+ %tmp20208 = getelementptr inbounds float* %tmp20207, i64 1
+ %tmp20209 = getelementptr inbounds float* %tmp20208, i64 1
+ %tmp20210 = getelementptr inbounds float* %tmp20209, i64 1
+ %tmp20211 = getelementptr inbounds float* %tmp20210, i64 1
+ %tmp20212 = getelementptr inbounds float* %tmp20211, i64 1
+ %tmp20213 = getelementptr inbounds float* %tmp20212, i64 1
+ %tmp20214 = getelementptr inbounds float* %tmp20213, i64 1
+ %tmp20215 = getelementptr inbounds float* %tmp20214, i64 1
+ %tmp20216 = getelementptr inbounds float* %tmp20215, i64 1
+ %tmp20217 = getelementptr inbounds float* %tmp20216, i64 1
+ %tmp20218 = getelementptr inbounds float* %tmp20217, i64 1
+ %tmp20219 = getelementptr inbounds float* %tmp20218, i64 1
+ %tmp20220 = getelementptr inbounds float* %tmp20219, i64 1
+ %tmp20221 = getelementptr inbounds float* %tmp20220, i64 1
+ %tmp20222 = getelementptr inbounds float* %tmp20221, i64 1
+ %tmp20223 = getelementptr inbounds float* %tmp20222, i64 1
+ %tmp20224 = getelementptr inbounds float* %tmp20223, i64 1
+ %tmp20225 = getelementptr inbounds float* %tmp20224, i64 1
+ %tmp20226 = getelementptr inbounds float* %tmp20225, i64 1
+ %tmp20227 = getelementptr inbounds float* %tmp20226, i64 1
+ %tmp20228 = getelementptr inbounds float* %tmp20227, i64 1
+ %tmp20229 = getelementptr inbounds float* %tmp20228, i64 1
+ %tmp20230 = getelementptr inbounds float* %tmp20229, i64 1
+ %tmp20231 = getelementptr inbounds float* %tmp20230, i64 1
+ %tmp20232 = getelementptr inbounds float* %tmp20231, i64 1
+ %tmp20233 = getelementptr inbounds float* %tmp20232, i64 1
+ %tmp20234 = getelementptr inbounds float* %tmp20233, i64 1
+ %tmp20235 = getelementptr inbounds float* %tmp20234, i64 1
+ %tmp20236 = getelementptr inbounds float* %tmp20235, i64 1
+ %tmp20237 = getelementptr inbounds float* %tmp20236, i64 1
+ %tmp20238 = getelementptr inbounds float* %tmp20237, i64 1
+ %tmp20239 = getelementptr inbounds float* %tmp20238, i64 1
+ %tmp20240 = getelementptr inbounds float* %tmp20239, i64 1
+ %tmp20241 = getelementptr inbounds float* %tmp20240, i64 1
+ %tmp20242 = getelementptr inbounds float* %tmp20241, i64 1
+ %tmp20243 = getelementptr inbounds float* %tmp20242, i64 1
+ %tmp20244 = getelementptr inbounds float* %tmp20243, i64 1
+ %tmp20245 = getelementptr inbounds float* %tmp20244, i64 1
+ %tmp20246 = getelementptr inbounds float* %tmp20245, i64 1
+ %tmp20247 = getelementptr inbounds float* %tmp20246, i64 1
+ %tmp20248 = getelementptr inbounds float* %tmp20247, i64 1
+ %tmp20249 = getelementptr inbounds float* %tmp20248, i64 1
+ %tmp20250 = getelementptr inbounds float* %tmp20249, i64 1
+ %tmp20251 = getelementptr inbounds float* %tmp20250, i64 1
+ %tmp20252 = getelementptr inbounds float* %tmp20251, i64 1
+ %tmp20253 = getelementptr inbounds float* %tmp20252, i64 1
+ %tmp20254 = getelementptr inbounds float* %tmp20253, i64 1
+ %tmp20255 = getelementptr inbounds float* %tmp20254, i64 1
+ %tmp20256 = getelementptr inbounds float* %tmp20255, i64 1
+ %tmp20257 = getelementptr inbounds float* %tmp20256, i64 1
+ %tmp20258 = getelementptr inbounds float* %tmp20257, i64 1
+ %tmp20259 = getelementptr inbounds float* %tmp20258, i64 1
+ %tmp20260 = getelementptr inbounds float* %tmp20259, i64 1
+ %tmp20261 = getelementptr inbounds float* %tmp20260, i64 1
+ %tmp20262 = getelementptr inbounds float* %tmp20261, i64 1
+ %tmp20263 = getelementptr inbounds float* %tmp20262, i64 1
+ %tmp20264 = getelementptr inbounds float* %tmp20263, i64 1
+ %tmp20265 = getelementptr inbounds float* %tmp20264, i64 1
+ %tmp20266 = getelementptr inbounds float* %tmp20265, i64 1
+ %tmp20267 = getelementptr inbounds float* %tmp20266, i64 1
+ %tmp20268 = getelementptr inbounds float* %tmp20267, i64 1
+ %tmp20269 = getelementptr inbounds float* %tmp20268, i64 1
+ %tmp20270 = getelementptr inbounds float* %tmp20269, i64 1
+ %tmp20271 = getelementptr inbounds float* %tmp20270, i64 1
+ %tmp20272 = getelementptr inbounds float* %tmp20271, i64 1
+ %tmp20273 = getelementptr inbounds float* %tmp20272, i64 1
+ %tmp20274 = getelementptr inbounds float* %tmp20273, i64 1
+ %tmp20275 = getelementptr inbounds float* %tmp20274, i64 1
+ %tmp20276 = getelementptr inbounds float* %tmp20275, i64 1
+ %tmp20277 = getelementptr inbounds float* %tmp20276, i64 1
+ %tmp20278 = getelementptr inbounds float* %tmp20277, i64 1
+ %tmp20279 = getelementptr inbounds float* %tmp20278, i64 1
+ %tmp20280 = getelementptr inbounds float* %tmp20279, i64 1
+ %tmp20281 = getelementptr inbounds float* %tmp20280, i64 1
+ %tmp20282 = getelementptr inbounds float* %tmp20281, i64 1
+ %tmp20283 = getelementptr inbounds float* %tmp20282, i64 1
+ %tmp20284 = getelementptr inbounds float* %tmp20283, i64 1
+ %tmp20285 = getelementptr inbounds float* %tmp20284, i64 1
+ %tmp20286 = getelementptr inbounds float* %tmp20285, i64 1
+ %tmp20287 = getelementptr inbounds float* %tmp20286, i64 1
+ %tmp20288 = getelementptr inbounds float* %tmp20287, i64 1
+ %tmp20289 = getelementptr inbounds float* %tmp20288, i64 1
+ %tmp20290 = getelementptr inbounds float* %tmp20289, i64 1
+ %tmp20291 = getelementptr inbounds float* %tmp20290, i64 1
+ %tmp20292 = getelementptr inbounds float* %tmp20291, i64 1
+ %tmp20293 = getelementptr inbounds float* %tmp20292, i64 1
+ %tmp20294 = getelementptr inbounds float* %tmp20293, i64 1
+ %tmp20295 = getelementptr inbounds float* %tmp20294, i64 1
+ %tmp20296 = getelementptr inbounds float* %tmp20295, i64 1
+ %tmp20297 = getelementptr inbounds float* %tmp20296, i64 1
+ %tmp20298 = getelementptr inbounds float* %tmp20297, i64 1
+ %tmp20299 = getelementptr inbounds float* %tmp20298, i64 1
+ %tmp20300 = getelementptr inbounds float* %tmp20299, i64 1
+ %tmp20301 = getelementptr inbounds float* %tmp20300, i64 1
+ %tmp20302 = getelementptr inbounds float* %tmp20301, i64 1
+ %tmp20303 = getelementptr inbounds float* %tmp20302, i64 1
+ %tmp20304 = getelementptr inbounds float* %tmp20303, i64 1
+ %tmp20305 = getelementptr inbounds float* %tmp20304, i64 1
+ %tmp20306 = getelementptr inbounds float* %tmp20305, i64 1
+ %tmp20307 = getelementptr inbounds float* %tmp20306, i64 1
+ %tmp20308 = getelementptr inbounds float* %tmp20307, i64 1
+ %tmp20309 = getelementptr inbounds float* %tmp20308, i64 1
+ %tmp20310 = getelementptr inbounds float* %tmp20309, i64 1
+ %tmp20311 = getelementptr inbounds float* %tmp20310, i64 1
+ %tmp20312 = getelementptr inbounds float* %tmp20311, i64 1
+ %tmp20313 = getelementptr inbounds float* %tmp20312, i64 1
+ %tmp20314 = getelementptr inbounds float* %tmp20313, i64 1
+ %tmp20315 = getelementptr inbounds float* %tmp20314, i64 1
+ %tmp20316 = getelementptr inbounds float* %tmp20315, i64 1
+ %tmp20317 = getelementptr inbounds float* %tmp20316, i64 1
+ %tmp20318 = getelementptr inbounds float* %tmp20317, i64 1
+ %tmp20319 = getelementptr inbounds float* %tmp20318, i64 1
+ %tmp20320 = getelementptr inbounds float* %tmp20319, i64 1
+ %tmp20321 = getelementptr inbounds float* %tmp20320, i64 1
+ %tmp20322 = getelementptr inbounds float* %tmp20321, i64 1
+ %tmp20323 = getelementptr inbounds float* %tmp20322, i64 1
+ %tmp20324 = getelementptr inbounds float* %tmp20323, i64 1
+ %tmp20325 = getelementptr inbounds float* %tmp20324, i64 1
+ %tmp20326 = getelementptr inbounds float* %tmp20325, i64 1
+ %tmp20327 = getelementptr inbounds float* %tmp20326, i64 1
+ %tmp20328 = getelementptr inbounds float* %tmp20327, i64 1
+ %tmp20329 = getelementptr inbounds float* %tmp20328, i64 1
+ %tmp20330 = getelementptr inbounds float* %tmp20329, i64 1
+ %tmp20331 = getelementptr inbounds float* %tmp20330, i64 1
+ %tmp20332 = getelementptr inbounds float* %tmp20331, i64 1
+ %tmp20333 = getelementptr inbounds float* %tmp20332, i64 1
+ %tmp20334 = getelementptr inbounds float* %tmp20333, i64 1
+ %tmp20335 = getelementptr inbounds float* %tmp20334, i64 1
+ %tmp20336 = getelementptr inbounds float* %tmp20335, i64 1
+ %tmp20337 = getelementptr inbounds float* %tmp20336, i64 1
+ %tmp20338 = getelementptr inbounds float* %tmp20337, i64 1
+ %tmp20339 = getelementptr inbounds float* %tmp20338, i64 1
+ %tmp20340 = getelementptr inbounds float* %tmp20339, i64 1
+ %tmp20341 = getelementptr inbounds float* %tmp20340, i64 1
+ %tmp20342 = getelementptr inbounds float* %tmp20341, i64 1
+ %tmp20343 = getelementptr inbounds float* %tmp20342, i64 1
+ %tmp20344 = getelementptr inbounds float* %tmp20343, i64 1
+ %tmp20345 = getelementptr inbounds float* %tmp20344, i64 1
+ %tmp20346 = getelementptr inbounds float* %tmp20345, i64 1
+ %tmp20347 = getelementptr inbounds float* %tmp20346, i64 1
+ %tmp20348 = getelementptr inbounds float* %tmp20347, i64 1
+ %tmp20349 = getelementptr inbounds float* %tmp20348, i64 1
+ %tmp20350 = getelementptr inbounds float* %tmp20349, i64 1
+ %tmp20351 = getelementptr inbounds float* %tmp20350, i64 1
+ %tmp20352 = getelementptr inbounds float* %tmp20351, i64 1
+ %tmp20353 = getelementptr inbounds float* %tmp20352, i64 1
+ %tmp20354 = getelementptr inbounds float* %tmp20353, i64 1
+ %tmp20355 = getelementptr inbounds float* %tmp20354, i64 1
+ %tmp20356 = getelementptr inbounds float* %tmp20355, i64 1
+ %tmp20357 = getelementptr inbounds float* %tmp20356, i64 1
+ %tmp20358 = getelementptr inbounds float* %tmp20357, i64 1
+ %tmp20359 = getelementptr inbounds float* %tmp20358, i64 1
+ %tmp20360 = getelementptr inbounds float* %tmp20359, i64 1
+ %tmp20361 = getelementptr inbounds float* %tmp20360, i64 1
+ %tmp20362 = getelementptr inbounds float* %tmp20361, i64 1
+ %tmp20363 = getelementptr inbounds float* %tmp20362, i64 1
+ %tmp20364 = getelementptr inbounds float* %tmp20363, i64 1
+ %tmp20365 = getelementptr inbounds float* %tmp20364, i64 1
+ %tmp20366 = getelementptr inbounds float* %tmp20365, i64 1
+ %tmp20367 = getelementptr inbounds float* %tmp20366, i64 1
+ %tmp20368 = getelementptr inbounds float* %tmp20367, i64 1
+ %tmp20369 = getelementptr inbounds float* %tmp20368, i64 1
+ %tmp20370 = getelementptr inbounds float* %tmp20369, i64 1
+ %tmp20371 = getelementptr inbounds float* %tmp20370, i64 1
+ %tmp20372 = getelementptr inbounds float* %tmp20371, i64 1
+ %tmp20373 = getelementptr inbounds float* %tmp20372, i64 1
+ %tmp20374 = getelementptr inbounds float* %tmp20373, i64 1
+ %tmp20375 = getelementptr inbounds float* %tmp20374, i64 1
+ %tmp20376 = getelementptr inbounds float* %tmp20375, i64 1
+ %tmp20377 = getelementptr inbounds float* %tmp20376, i64 1
+ %tmp20378 = getelementptr inbounds float* %tmp20377, i64 1
+ %tmp20379 = getelementptr inbounds float* %tmp20378, i64 1
+ %tmp20380 = getelementptr inbounds float* %tmp20379, i64 1
+ %tmp20381 = getelementptr inbounds float* %tmp20380, i64 1
+ %tmp20382 = getelementptr inbounds float* %tmp20381, i64 1
+ %tmp20383 = getelementptr inbounds float* %tmp20382, i64 1
+ %tmp20384 = getelementptr inbounds float* %tmp20383, i64 1
+ %tmp20385 = getelementptr inbounds float* %tmp20384, i64 1
+ %tmp20386 = getelementptr inbounds float* %tmp20385, i64 1
+ %tmp20387 = getelementptr inbounds float* %tmp20386, i64 1
+ %tmp20388 = getelementptr inbounds float* %tmp20387, i64 1
+ %tmp20389 = getelementptr inbounds float* %tmp20388, i64 1
+ %tmp20390 = getelementptr inbounds float* %tmp20389, i64 1
+ %tmp20391 = getelementptr inbounds float* %tmp20390, i64 1
+ %tmp20392 = getelementptr inbounds float* %tmp20391, i64 1
+ %tmp20393 = getelementptr inbounds float* %tmp20392, i64 1
+ %tmp20394 = getelementptr inbounds float* %tmp20393, i64 1
+ %tmp20395 = getelementptr inbounds float* %tmp20394, i64 1
+ %tmp20396 = getelementptr inbounds float* %tmp20395, i64 1
+ %tmp20397 = getelementptr inbounds float* %tmp20396, i64 1
+ %tmp20398 = getelementptr inbounds float* %tmp20397, i64 1
+ %tmp20399 = getelementptr inbounds float* %tmp20398, i64 1
+ %tmp20400 = getelementptr inbounds float* %tmp20399, i64 1
+ %tmp20401 = getelementptr inbounds float* %tmp20400, i64 1
+ %tmp20402 = getelementptr inbounds float* %tmp20401, i64 1
+ %tmp20403 = getelementptr inbounds float* %tmp20402, i64 1
+ %tmp20404 = getelementptr inbounds float* %tmp20403, i64 1
+ %tmp20405 = getelementptr inbounds float* %tmp20404, i64 1
+ %tmp20406 = getelementptr inbounds float* %tmp20405, i64 1
+ %tmp20407 = getelementptr inbounds float* %tmp20406, i64 1
+ %tmp20408 = getelementptr inbounds float* %tmp20407, i64 1
+ %tmp20409 = getelementptr inbounds float* %tmp20408, i64 1
+ %tmp20410 = getelementptr inbounds float* %tmp20409, i64 1
+ %tmp20411 = getelementptr inbounds float* %tmp20410, i64 1
+ %tmp20412 = getelementptr inbounds float* %tmp20411, i64 1
+ %tmp20413 = getelementptr inbounds float* %tmp20412, i64 1
+ %tmp20414 = getelementptr inbounds float* %tmp20413, i64 1
+ %tmp20415 = getelementptr inbounds float* %tmp20414, i64 1
+ %tmp20416 = getelementptr inbounds float* %tmp20415, i64 1
+ %tmp20417 = getelementptr inbounds float* %tmp20416, i64 1
+ %tmp20418 = getelementptr inbounds float* %tmp20417, i64 1
+ %tmp20419 = getelementptr inbounds float* %tmp20418, i64 1
+ %tmp20420 = getelementptr inbounds float* %tmp20419, i64 1
+ %tmp20421 = getelementptr inbounds float* %tmp20420, i64 1
+ %tmp20422 = getelementptr inbounds float* %tmp20421, i64 1
+ %tmp20423 = getelementptr inbounds float* %tmp20422, i64 1
+ %tmp20424 = getelementptr inbounds float* %tmp20423, i64 1
+ %tmp20425 = getelementptr inbounds float* %tmp20424, i64 1
+ %tmp20426 = getelementptr inbounds float* %tmp20425, i64 1
+ %tmp20427 = getelementptr inbounds float* %tmp20426, i64 1
+ %tmp20428 = getelementptr inbounds float* %tmp20427, i64 1
+ %tmp20429 = getelementptr inbounds float* %tmp20428, i64 1
+ %tmp20430 = getelementptr inbounds float* %tmp20429, i64 1
+ %tmp20431 = getelementptr inbounds float* %tmp20430, i64 1
+ %tmp20432 = getelementptr inbounds float* %tmp20431, i64 1
+ %tmp20433 = getelementptr inbounds float* %tmp20432, i64 1
+ %tmp20434 = getelementptr inbounds float* %tmp20433, i64 1
+ %tmp20435 = getelementptr inbounds float* %tmp20434, i64 1
+ %tmp20436 = getelementptr inbounds float* %tmp20435, i64 1
+ %tmp20437 = getelementptr inbounds float* %tmp20436, i64 1
+ %tmp20438 = getelementptr inbounds float* %tmp20437, i64 1
+ %tmp20439 = getelementptr inbounds float* %tmp20438, i64 1
+ %tmp20440 = getelementptr inbounds float* %tmp20439, i64 1
+ %tmp20441 = getelementptr inbounds float* %tmp20440, i64 1
+ %tmp20442 = getelementptr inbounds float* %tmp20441, i64 1
+ %tmp20443 = getelementptr inbounds float* %tmp20442, i64 1
+ %tmp20444 = getelementptr inbounds float* %tmp20443, i64 1
+ %tmp20445 = getelementptr inbounds float* %tmp20444, i64 1
+ %tmp20446 = getelementptr inbounds float* %tmp20445, i64 1
+ %tmp20447 = getelementptr inbounds float* %tmp20446, i64 1
+ %tmp20448 = getelementptr inbounds float* %tmp20447, i64 1
+ %tmp20449 = getelementptr inbounds float* %tmp20448, i64 1
+ %tmp20450 = getelementptr inbounds float* %tmp20449, i64 1
+ %tmp20451 = getelementptr inbounds float* %tmp20450, i64 1
+ %tmp20452 = getelementptr inbounds float* %tmp20451, i64 1
+ %tmp20453 = getelementptr inbounds float* %tmp20452, i64 1
+ %tmp20454 = getelementptr inbounds float* %tmp20453, i64 1
+ %tmp20455 = getelementptr inbounds float* %tmp20454, i64 1
+ %tmp20456 = getelementptr inbounds float* %tmp20455, i64 1
+ %tmp20457 = getelementptr inbounds float* %tmp20456, i64 1
+ %tmp20458 = getelementptr inbounds float* %tmp20457, i64 1
+ %tmp20459 = getelementptr inbounds float* %tmp20458, i64 1
+ %tmp20460 = getelementptr inbounds float* %tmp20459, i64 1
+ %tmp20461 = getelementptr inbounds float* %tmp20460, i64 1
+ %tmp20462 = getelementptr inbounds float* %tmp20461, i64 1
+ %tmp20463 = getelementptr inbounds float* %tmp20462, i64 1
+ %tmp20464 = getelementptr inbounds float* %tmp20463, i64 1
+ %tmp20465 = getelementptr inbounds float* %tmp20464, i64 1
+ %tmp20466 = getelementptr inbounds float* %tmp20465, i64 1
+ %tmp20467 = getelementptr inbounds float* %tmp20466, i64 1
+ %tmp20468 = getelementptr inbounds float* %tmp20467, i64 1
+ %tmp20469 = getelementptr inbounds float* %tmp20468, i64 1
+ %tmp20470 = getelementptr inbounds float* %tmp20469, i64 1
+ %tmp20471 = getelementptr inbounds float* %tmp20470, i64 1
+ %tmp20472 = getelementptr inbounds float* %tmp20471, i64 1
+ %tmp20473 = getelementptr inbounds float* %tmp20472, i64 1
+ %tmp20474 = getelementptr inbounds float* %tmp20473, i64 1
+ %tmp20475 = getelementptr inbounds float* %tmp20474, i64 1
+ %tmp20476 = getelementptr inbounds float* %tmp20475, i64 1
+ %tmp20477 = getelementptr inbounds float* %tmp20476, i64 1
+ %tmp20478 = getelementptr inbounds float* %tmp20477, i64 1
+ %tmp20479 = getelementptr inbounds float* %tmp20478, i64 1
+ %tmp20480 = getelementptr inbounds float* %tmp20479, i64 1
+ %tmp20481 = getelementptr inbounds float* %tmp20480, i64 1
+ %tmp20482 = getelementptr inbounds float* %tmp20481, i64 1
+ %tmp20483 = getelementptr inbounds float* %tmp20482, i64 1
+ %tmp20484 = getelementptr inbounds float* %tmp20483, i64 1
+ %tmp20485 = getelementptr inbounds float* %tmp20484, i64 1
+ %tmp20486 = getelementptr inbounds float* %tmp20485, i64 1
+ %tmp20487 = getelementptr inbounds float* %tmp20486, i64 1
+ %tmp20488 = getelementptr inbounds float* %tmp20487, i64 1
+ %tmp20489 = getelementptr inbounds float* %tmp20488, i64 1
+ %tmp20490 = getelementptr inbounds float* %tmp20489, i64 1
+ %tmp20491 = getelementptr inbounds float* %tmp20490, i64 1
+ %tmp20492 = getelementptr inbounds float* %tmp20491, i64 1
+ %tmp20493 = getelementptr inbounds float* %tmp20492, i64 1
+ %tmp20494 = getelementptr inbounds float* %tmp20493, i64 1
+ %tmp20495 = getelementptr inbounds float* %tmp20494, i64 1
+ %tmp20496 = getelementptr inbounds float* %tmp20495, i64 1
+ %tmp20497 = getelementptr inbounds float* %tmp20496, i64 1
+ %tmp20498 = getelementptr inbounds float* %tmp20497, i64 1
+ %tmp20499 = getelementptr inbounds float* %tmp20498, i64 1
+ %tmp20500 = getelementptr inbounds float* %tmp20499, i64 1
+ %tmp20501 = getelementptr inbounds float* %tmp20500, i64 1
+ %tmp20502 = getelementptr inbounds float* %tmp20501, i64 1
+ %tmp20503 = getelementptr inbounds float* %tmp20502, i64 1
+ %tmp20504 = getelementptr inbounds float* %tmp20503, i64 1
+ %tmp20505 = getelementptr inbounds float* %tmp20504, i64 1
+ %tmp20506 = getelementptr inbounds float* %tmp20505, i64 1
+ %tmp20507 = getelementptr inbounds float* %tmp20506, i64 1
+ %tmp20508 = getelementptr inbounds float* %tmp20507, i64 1
+ %tmp20509 = getelementptr inbounds float* %tmp20508, i64 1
+ %tmp20510 = getelementptr inbounds float* %tmp20509, i64 1
+ %tmp20511 = getelementptr inbounds float* %tmp20510, i64 1
+ %tmp20512 = getelementptr inbounds float* %tmp20511, i64 1
+ %tmp20513 = getelementptr inbounds float* %tmp20512, i64 1
+ %tmp20514 = getelementptr inbounds float* %tmp20513, i64 1
+ %tmp20515 = getelementptr inbounds float* %tmp20514, i64 1
+ %tmp20516 = getelementptr inbounds float* %tmp20515, i64 1
+ %tmp20517 = getelementptr inbounds float* %tmp20516, i64 1
+ %tmp20518 = getelementptr inbounds float* %tmp20517, i64 1
+ %tmp20519 = getelementptr inbounds float* %tmp20518, i64 1
+ %tmp20520 = getelementptr inbounds float* %tmp20519, i64 1
+ %tmp20521 = getelementptr inbounds float* %tmp20520, i64 1
+ %tmp20522 = getelementptr inbounds float* %tmp20521, i64 1
+ %tmp20523 = getelementptr inbounds float* %tmp20522, i64 1
+ %tmp20524 = getelementptr inbounds float* %tmp20523, i64 1
+ %tmp20525 = getelementptr inbounds float* %tmp20524, i64 1
+ %tmp20526 = getelementptr inbounds float* %tmp20525, i64 1
+ %tmp20527 = getelementptr inbounds float* %tmp20526, i64 1
+ %tmp20528 = getelementptr inbounds float* %tmp20527, i64 1
+ %tmp20529 = getelementptr inbounds float* %tmp20528, i64 1
+ %tmp20530 = getelementptr inbounds float* %tmp20529, i64 1
+ %tmp20531 = getelementptr inbounds float* %tmp20530, i64 1
+ %tmp20532 = getelementptr inbounds float* %tmp20531, i64 1
+ %tmp20533 = getelementptr inbounds float* %tmp20532, i64 1
+ %tmp20534 = getelementptr inbounds float* %tmp20533, i64 1
+ %tmp20535 = getelementptr inbounds float* %tmp20534, i64 1
+ %tmp20536 = getelementptr inbounds float* %tmp20535, i64 1
+ %tmp20537 = getelementptr inbounds float* %tmp20536, i64 1
+ %tmp20538 = getelementptr inbounds float* %tmp20537, i64 1
+ %tmp20539 = getelementptr inbounds float* %tmp20538, i64 1
+ %tmp20540 = getelementptr inbounds float* %tmp20539, i64 1
+ %tmp20541 = getelementptr inbounds float* %tmp20540, i64 1
+ %tmp20542 = getelementptr inbounds float* %tmp20541, i64 1
+ %tmp20543 = getelementptr inbounds float* %tmp20542, i64 1
+ %tmp20544 = getelementptr inbounds float* %tmp20543, i64 1
+ %tmp20545 = getelementptr inbounds float* %tmp20544, i64 1
+ %tmp20546 = getelementptr inbounds float* %tmp20545, i64 1
+ %tmp20547 = getelementptr inbounds float* %tmp20546, i64 1
+ %tmp20548 = getelementptr inbounds float* %tmp20547, i64 1
+ %tmp20549 = getelementptr inbounds float* %tmp20548, i64 1
+ %tmp20550 = getelementptr inbounds float* %tmp20549, i64 1
+ %tmp20551 = getelementptr inbounds float* %tmp20550, i64 1
+ %tmp20552 = getelementptr inbounds float* %tmp20551, i64 1
+ %tmp20553 = getelementptr inbounds float* %tmp20552, i64 1
+ %tmp20554 = getelementptr inbounds float* %tmp20553, i64 1
+ %tmp20555 = getelementptr inbounds float* %tmp20554, i64 1
+ %tmp20556 = getelementptr inbounds float* %tmp20555, i64 1
+ %tmp20557 = getelementptr inbounds float* %tmp20556, i64 1
+ %tmp20558 = getelementptr inbounds float* %tmp20557, i64 1
+ %tmp20559 = getelementptr inbounds float* %tmp20558, i64 1
+ %tmp20560 = getelementptr inbounds float* %tmp20559, i64 1
+ %tmp20561 = getelementptr inbounds float* %tmp20560, i64 1
+ %tmp20562 = getelementptr inbounds float* %tmp20561, i64 1
+ %tmp20563 = getelementptr inbounds float* %tmp20562, i64 1
+ %tmp20564 = getelementptr inbounds float* %tmp20563, i64 1
+ %tmp20565 = getelementptr inbounds float* %tmp20564, i64 1
+ %tmp20566 = getelementptr inbounds float* %tmp20565, i64 1
+ %tmp20567 = getelementptr inbounds float* %tmp20566, i64 1
+ %tmp20568 = getelementptr inbounds float* %tmp20567, i64 1
+ %tmp20569 = getelementptr inbounds float* %tmp20568, i64 1
+ %tmp20570 = getelementptr inbounds float* %tmp20569, i64 1
+ %tmp20571 = getelementptr inbounds float* %tmp20570, i64 1
+ %tmp20572 = getelementptr inbounds float* %tmp20571, i64 1
+ %tmp20573 = getelementptr inbounds float* %tmp20572, i64 1
+ %tmp20574 = getelementptr inbounds float* %tmp20573, i64 1
+ %tmp20575 = getelementptr inbounds float* %tmp20574, i64 1
+ %tmp20576 = getelementptr inbounds float* %tmp20575, i64 1
+ %tmp20577 = getelementptr inbounds float* %tmp20576, i64 1
+ %tmp20578 = getelementptr inbounds float* %tmp20577, i64 1
+ %tmp20579 = getelementptr inbounds float* %tmp20578, i64 1
+ %tmp20580 = getelementptr inbounds float* %tmp20579, i64 1
+ %tmp20581 = getelementptr inbounds float* %tmp20580, i64 1
+ %tmp20582 = getelementptr inbounds float* %tmp20581, i64 1
+ %tmp20583 = getelementptr inbounds float* %tmp20582, i64 1
+ %tmp20584 = getelementptr inbounds float* %tmp20583, i64 1
+ %tmp20585 = getelementptr inbounds float* %tmp20584, i64 1
+ %tmp20586 = getelementptr inbounds float* %tmp20585, i64 1
+ %tmp20587 = getelementptr inbounds float* %tmp20586, i64 1
+ %tmp20588 = getelementptr inbounds float* %tmp20587, i64 1
+ %tmp20589 = getelementptr inbounds float* %tmp20588, i64 1
+ %tmp20590 = getelementptr inbounds float* %tmp20589, i64 1
+ %tmp20591 = getelementptr inbounds float* %tmp20590, i64 1
+ %tmp20592 = getelementptr inbounds float* %tmp20591, i64 1
+ %tmp20593 = getelementptr inbounds float* %tmp20592, i64 1
+ %tmp20594 = getelementptr inbounds float* %tmp20593, i64 1
+ %tmp20595 = getelementptr inbounds float* %tmp20594, i64 1
+ %tmp20596 = getelementptr inbounds float* %tmp20595, i64 1
+ %tmp20597 = getelementptr inbounds float* %tmp20596, i64 1
+ %tmp20598 = getelementptr inbounds float* %tmp20597, i64 1
+ %tmp20599 = getelementptr inbounds float* %tmp20598, i64 1
+ %tmp20600 = getelementptr inbounds float* %tmp20599, i64 1
+ %tmp20601 = getelementptr inbounds float* %tmp20600, i64 1
+ %tmp20602 = getelementptr inbounds float* %tmp20601, i64 1
+ %tmp20603 = getelementptr inbounds float* %tmp20602, i64 1
+ %tmp20604 = getelementptr inbounds float* %tmp20603, i64 1
+ %tmp20605 = getelementptr inbounds float* %tmp20604, i64 1
+ %tmp20606 = getelementptr inbounds float* %tmp20605, i64 1
+ %tmp20607 = getelementptr inbounds float* %tmp20606, i64 1
+ %tmp20608 = getelementptr inbounds float* %tmp20607, i64 1
+ %tmp20609 = getelementptr inbounds float* %tmp20608, i64 1
+ %tmp20610 = getelementptr inbounds float* %tmp20609, i64 1
+ %tmp20611 = getelementptr inbounds float* %tmp20610, i64 1
+ %tmp20612 = getelementptr inbounds float* %tmp20611, i64 1
+ %tmp20613 = getelementptr inbounds float* %tmp20612, i64 1
+ %tmp20614 = getelementptr inbounds float* %tmp20613, i64 1
+ %tmp20615 = getelementptr inbounds float* %tmp20614, i64 1
+ %tmp20616 = getelementptr inbounds float* %tmp20615, i64 1
+ %tmp20617 = getelementptr inbounds float* %tmp20616, i64 1
+ %tmp20618 = getelementptr inbounds float* %tmp20617, i64 1
+ %tmp20619 = getelementptr inbounds float* %tmp20618, i64 1
+ %tmp20620 = getelementptr inbounds float* %tmp20619, i64 1
+ %tmp20621 = getelementptr inbounds float* %tmp20620, i64 1
+ %tmp20622 = getelementptr inbounds float* %tmp20621, i64 1
+ %tmp20623 = getelementptr inbounds float* %tmp20622, i64 1
+ %tmp20624 = getelementptr inbounds float* %tmp20623, i64 1
+ %tmp20625 = getelementptr inbounds float* %tmp20624, i64 1
+ %tmp20626 = getelementptr inbounds float* %tmp20625, i64 1
+ %tmp20627 = getelementptr inbounds float* %tmp20626, i64 1
+ %tmp20628 = getelementptr inbounds float* %tmp20627, i64 1
+ %tmp20629 = getelementptr inbounds float* %tmp20628, i64 1
+ %tmp20630 = getelementptr inbounds float* %tmp20629, i64 1
+ %tmp20631 = getelementptr inbounds float* %tmp20630, i64 1
+ %tmp20632 = getelementptr inbounds float* %tmp20631, i64 1
+ %tmp20633 = getelementptr inbounds float* %tmp20632, i64 1
+ %tmp20634 = getelementptr inbounds float* %tmp20633, i64 1
+ %tmp20635 = getelementptr inbounds float* %tmp20634, i64 1
+ %tmp20636 = getelementptr inbounds float* %tmp20635, i64 1
+ %tmp20637 = getelementptr inbounds float* %tmp20636, i64 1
+ %tmp20638 = getelementptr inbounds float* %tmp20637, i64 1
+ %tmp20639 = getelementptr inbounds float* %tmp20638, i64 1
+ %tmp20640 = getelementptr inbounds float* %tmp20639, i64 1
+ %tmp20641 = getelementptr inbounds float* %tmp20640, i64 1
+ %tmp20642 = getelementptr inbounds float* %tmp20641, i64 1
+ %tmp20643 = getelementptr inbounds float* %tmp20642, i64 1
+ %tmp20644 = getelementptr inbounds float* %tmp20643, i64 1
+ %tmp20645 = getelementptr inbounds float* %tmp20644, i64 1
+ %tmp20646 = getelementptr inbounds float* %tmp20645, i64 1
+ %tmp20647 = getelementptr inbounds float* %tmp20646, i64 1
+ %tmp20648 = getelementptr inbounds float* %tmp20647, i64 1
+ %tmp20649 = getelementptr inbounds float* %tmp20648, i64 1
+ %tmp20650 = getelementptr inbounds float* %tmp20649, i64 1
+ %tmp20651 = getelementptr inbounds float* %tmp20650, i64 1
+ %tmp20652 = getelementptr inbounds float* %tmp20651, i64 1
+ %tmp20653 = getelementptr inbounds float* %tmp20652, i64 1
+ %tmp20654 = getelementptr inbounds float* %tmp20653, i64 1
+ %tmp20655 = getelementptr inbounds float* %tmp20654, i64 1
+ %tmp20656 = getelementptr inbounds float* %tmp20655, i64 1
+ %tmp20657 = getelementptr inbounds float* %tmp20656, i64 1
+ %tmp20658 = getelementptr inbounds float* %tmp20657, i64 1
+ %tmp20659 = getelementptr inbounds float* %tmp20658, i64 1
+ %tmp20660 = getelementptr inbounds float* %tmp20659, i64 1
+ %tmp20661 = getelementptr inbounds float* %tmp20660, i64 1
+ %tmp20662 = getelementptr inbounds float* %tmp20661, i64 1
+ %tmp20663 = getelementptr inbounds float* %tmp20662, i64 1
+ %tmp20664 = getelementptr inbounds float* %tmp20663, i64 1
+ %tmp20665 = getelementptr inbounds float* %tmp20664, i64 1
+ %tmp20666 = getelementptr inbounds float* %tmp20665, i64 1
+ %tmp20667 = getelementptr inbounds float* %tmp20666, i64 1
+ %tmp20668 = getelementptr inbounds float* %tmp20667, i64 1
+ %tmp20669 = getelementptr inbounds float* %tmp20668, i64 1
+ %tmp20670 = getelementptr inbounds float* %tmp20669, i64 1
+ %tmp20671 = getelementptr inbounds float* %tmp20670, i64 1
+ %tmp20672 = getelementptr inbounds float* %tmp20671, i64 1
+ %tmp20673 = getelementptr inbounds float* %tmp20672, i64 1
+ %tmp20674 = getelementptr inbounds float* %tmp20673, i64 1
+ %tmp20675 = getelementptr inbounds float* %tmp20674, i64 1
+ %tmp20676 = getelementptr inbounds float* %tmp20675, i64 1
+ %tmp20677 = getelementptr inbounds float* %tmp20676, i64 1
+ %tmp20678 = getelementptr inbounds float* %tmp20677, i64 1
+ %tmp20679 = getelementptr inbounds float* %tmp20678, i64 1
+ %tmp20680 = getelementptr inbounds float* %tmp20679, i64 1
+ %tmp20681 = getelementptr inbounds float* %tmp20680, i64 1
+ %tmp20682 = getelementptr inbounds float* %tmp20681, i64 1
+ %tmp20683 = getelementptr inbounds float* %tmp20682, i64 1
+ %tmp20684 = getelementptr inbounds float* %tmp20683, i64 1
+ %tmp20685 = getelementptr inbounds float* %tmp20684, i64 1
+ %tmp20686 = getelementptr inbounds float* %tmp20685, i64 1
+ %tmp20687 = getelementptr inbounds float* %tmp20686, i64 1
+ %tmp20688 = getelementptr inbounds float* %tmp20687, i64 1
+ %tmp20689 = getelementptr inbounds float* %tmp20688, i64 1
+ %tmp20690 = getelementptr inbounds float* %tmp20689, i64 1
+ %tmp20691 = getelementptr inbounds float* %tmp20690, i64 1
+ %tmp20692 = getelementptr inbounds float* %tmp20691, i64 1
+ %tmp20693 = getelementptr inbounds float* %tmp20692, i64 1
+ %tmp20694 = getelementptr inbounds float* %tmp20693, i64 1
+ %tmp20695 = getelementptr inbounds float* %tmp20694, i64 1
+ %tmp20696 = getelementptr inbounds float* %tmp20695, i64 1
+ %tmp20697 = getelementptr inbounds float* %tmp20696, i64 1
+ %tmp20698 = getelementptr inbounds float* %tmp20697, i64 1
+ %tmp20699 = getelementptr inbounds float* %tmp20698, i64 1
+ %tmp20700 = getelementptr inbounds float* %tmp20699, i64 1
+ %tmp20701 = getelementptr inbounds float* %tmp20700, i64 1
+ %tmp20702 = getelementptr inbounds float* %tmp20701, i64 1
+ %tmp20703 = getelementptr inbounds float* %tmp20702, i64 1
+ %tmp20704 = getelementptr inbounds float* %tmp20703, i64 1
+ %tmp20705 = getelementptr inbounds float* %tmp20704, i64 1
+ %tmp20706 = getelementptr inbounds float* %tmp20705, i64 1
+ %tmp20707 = getelementptr inbounds float* %tmp20706, i64 1
+ %tmp20708 = getelementptr inbounds float* %tmp20707, i64 1
+ %tmp20709 = getelementptr inbounds float* %tmp20708, i64 1
+ %tmp20710 = getelementptr inbounds float* %tmp20709, i64 1
+ %tmp20711 = getelementptr inbounds float* %tmp20710, i64 1
+ %tmp20712 = getelementptr inbounds float* %tmp20711, i64 1
+ %tmp20713 = getelementptr inbounds float* %tmp20712, i64 1
+ %tmp20714 = getelementptr inbounds float* %tmp20713, i64 1
+ %tmp20715 = getelementptr inbounds float* %tmp20714, i64 1
+ %tmp20716 = getelementptr inbounds float* %tmp20715, i64 1
+ %tmp20717 = getelementptr inbounds float* %tmp20716, i64 1
+ %tmp20718 = getelementptr inbounds float* %tmp20717, i64 1
+ %tmp20719 = getelementptr inbounds float* %tmp20718, i64 1
+ %tmp20720 = getelementptr inbounds float* %tmp20719, i64 1
+ %tmp20721 = getelementptr inbounds float* %tmp20720, i64 1
+ %tmp20722 = getelementptr inbounds float* %tmp20721, i64 1
+ %tmp20723 = getelementptr inbounds float* %tmp20722, i64 1
+ %tmp20724 = getelementptr inbounds float* %tmp20723, i64 1
+ %tmp20725 = getelementptr inbounds float* %tmp20724, i64 1
+ %tmp20726 = getelementptr inbounds float* %tmp20725, i64 1
+ %tmp20727 = getelementptr inbounds float* %tmp20726, i64 1
+ %tmp20728 = getelementptr inbounds float* %tmp20727, i64 1
+ %tmp20729 = getelementptr inbounds float* %tmp20728, i64 1
+ %tmp20730 = getelementptr inbounds float* %tmp20729, i64 1
+ %tmp20731 = getelementptr inbounds float* %tmp20730, i64 1
+ %tmp20732 = getelementptr inbounds float* %tmp20731, i64 1
+ %tmp20733 = getelementptr inbounds float* %tmp20732, i64 1
+ %tmp20734 = getelementptr inbounds float* %tmp20733, i64 1
+ %tmp20735 = getelementptr inbounds float* %tmp20734, i64 1
+ %tmp20736 = getelementptr inbounds float* %tmp20735, i64 1
+ %tmp20737 = getelementptr inbounds float* %tmp20736, i64 1
+ %tmp20738 = getelementptr inbounds float* %tmp20737, i64 1
+ %tmp20739 = getelementptr inbounds float* %tmp20738, i64 1
+ %tmp20740 = getelementptr inbounds float* %tmp20739, i64 1
+ %tmp20741 = getelementptr inbounds float* %tmp20740, i64 1
+ %tmp20742 = getelementptr inbounds float* %tmp20741, i64 1
+ %tmp20743 = getelementptr inbounds float* %tmp20742, i64 1
+ %tmp20744 = getelementptr inbounds float* %tmp20743, i64 1
+ %tmp20745 = getelementptr inbounds float* %tmp20744, i64 1
+ %tmp20746 = getelementptr inbounds float* %tmp20745, i64 1
+ %tmp20747 = getelementptr inbounds float* %tmp20746, i64 1
+ %tmp20748 = getelementptr inbounds float* %tmp20747, i64 1
+ %tmp20749 = getelementptr inbounds float* %tmp20748, i64 1
+ %tmp20750 = getelementptr inbounds float* %tmp20749, i64 1
+ %tmp20751 = getelementptr inbounds float* %tmp20750, i64 1
+ %tmp20752 = getelementptr inbounds float* %tmp20751, i64 1
+ %tmp20753 = getelementptr inbounds float* %tmp20752, i64 1
+ %tmp20754 = getelementptr inbounds float* %tmp20753, i64 1
+ %tmp20755 = getelementptr inbounds float* %tmp20754, i64 1
+ %tmp20756 = getelementptr inbounds float* %tmp20755, i64 1
+ %tmp20757 = getelementptr inbounds float* %tmp20756, i64 1
+ %tmp20758 = getelementptr inbounds float* %tmp20757, i64 1
+ %tmp20759 = getelementptr inbounds float* %tmp20758, i64 1
+ %tmp20760 = getelementptr inbounds float* %tmp20759, i64 1
+ %tmp20761 = getelementptr inbounds float* %tmp20760, i64 1
+ %tmp20762 = getelementptr inbounds float* %tmp20761, i64 1
+ %tmp20763 = getelementptr inbounds float* %tmp20762, i64 1
+ %tmp20764 = getelementptr inbounds float* %tmp20763, i64 1
+ %tmp20765 = getelementptr inbounds float* %tmp20764, i64 1
+ %tmp20766 = getelementptr inbounds float* %tmp20765, i64 1
+ %tmp20767 = getelementptr inbounds float* %tmp20766, i64 1
+ %tmp20768 = getelementptr inbounds float* %tmp20767, i64 1
+ %tmp20769 = getelementptr inbounds float* %tmp20768, i64 1
+ %tmp20770 = getelementptr inbounds float* %tmp20769, i64 1
+ %tmp20771 = getelementptr inbounds float* %tmp20770, i64 1
+ %tmp20772 = getelementptr inbounds float* %tmp20771, i64 1
+ %tmp20773 = getelementptr inbounds float* %tmp20772, i64 1
+ %tmp20774 = getelementptr inbounds float* %tmp20773, i64 1
+ %tmp20775 = getelementptr inbounds float* %tmp20774, i64 1
+ %tmp20776 = getelementptr inbounds float* %tmp20775, i64 1
+ %tmp20777 = getelementptr inbounds float* %tmp20776, i64 1
+ %tmp20778 = getelementptr inbounds float* %tmp20777, i64 1
+ %tmp20779 = getelementptr inbounds float* %tmp20778, i64 1
+ %tmp20780 = getelementptr inbounds float* %tmp20779, i64 1
+ %tmp20781 = getelementptr inbounds float* %tmp20780, i64 1
+ %tmp20782 = getelementptr inbounds float* %tmp20781, i64 1
+ %tmp20783 = getelementptr inbounds float* %tmp20782, i64 1
+ %tmp20784 = getelementptr inbounds float* %tmp20783, i64 1
+ %tmp20785 = getelementptr inbounds float* %tmp20784, i64 1
+ %tmp20786 = getelementptr inbounds float* %tmp20785, i64 1
+ %tmp20787 = getelementptr inbounds float* %tmp20786, i64 1
+ %tmp20788 = getelementptr inbounds float* %tmp20787, i64 1
+ %tmp20789 = getelementptr inbounds float* %tmp20788, i64 1
+ %tmp20790 = getelementptr inbounds float* %tmp20789, i64 1
+ %tmp20791 = getelementptr inbounds float* %tmp20790, i64 1
+ %tmp20792 = getelementptr inbounds float* %tmp20791, i64 1
+ %tmp20793 = getelementptr inbounds float* %tmp20792, i64 1
+ %tmp20794 = getelementptr inbounds float* %tmp20793, i64 1
+ %tmp20795 = getelementptr inbounds float* %tmp20794, i64 1
+ %tmp20796 = getelementptr inbounds float* %tmp20795, i64 1
+ %tmp20797 = getelementptr inbounds float* %tmp20796, i64 1
+ %tmp20798 = getelementptr inbounds float* %tmp20797, i64 1
+ %tmp20799 = getelementptr inbounds float* %tmp20798, i64 1
+ %tmp20800 = getelementptr inbounds float* %tmp20799, i64 1
+ %tmp20801 = getelementptr inbounds float* %tmp20800, i64 1
+ %tmp20802 = getelementptr inbounds float* %tmp20801, i64 1
+ %tmp20803 = getelementptr inbounds float* %tmp20802, i64 1
+ %tmp20804 = getelementptr inbounds float* %tmp20803, i64 1
+ %tmp20805 = getelementptr inbounds float* %tmp20804, i64 1
+ %tmp20806 = getelementptr inbounds float* %tmp20805, i64 1
+ %tmp20807 = getelementptr inbounds float* %tmp20806, i64 1
+ %tmp20808 = getelementptr inbounds float* %tmp20807, i64 1
+ %tmp20809 = getelementptr inbounds float* %tmp20808, i64 1
+ %tmp20810 = getelementptr inbounds float* %tmp20809, i64 1
+ %tmp20811 = getelementptr inbounds float* %tmp20810, i64 1
+ %tmp20812 = getelementptr inbounds float* %tmp20811, i64 1
+ %tmp20813 = getelementptr inbounds float* %tmp20812, i64 1
+ %tmp20814 = getelementptr inbounds float* %tmp20813, i64 1
+ %tmp20815 = getelementptr inbounds float* %tmp20814, i64 1
+ %tmp20816 = getelementptr inbounds float* %tmp20815, i64 1
+ %tmp20817 = getelementptr inbounds float* %tmp20816, i64 1
+ %tmp20818 = getelementptr inbounds float* %tmp20817, i64 1
+ %tmp20819 = getelementptr inbounds float* %tmp20818, i64 1
+ %tmp20820 = getelementptr inbounds float* %tmp20819, i64 1
+ %tmp20821 = getelementptr inbounds float* %tmp20820, i64 1
+ %tmp20822 = getelementptr inbounds float* %tmp20821, i64 1
+ %tmp20823 = getelementptr inbounds float* %tmp20822, i64 1
+ %tmp20824 = getelementptr inbounds float* %tmp20823, i64 1
+ %tmp20825 = getelementptr inbounds float* %tmp20824, i64 1
+ %tmp20826 = getelementptr inbounds float* %tmp20825, i64 1
+ %tmp20827 = getelementptr inbounds float* %tmp20826, i64 1
+ %tmp20828 = getelementptr inbounds float* %tmp20827, i64 1
+ %tmp20829 = getelementptr inbounds float* %tmp20828, i64 1
+ %tmp20830 = getelementptr inbounds float* %tmp20829, i64 1
+ %tmp20831 = getelementptr inbounds float* %tmp20830, i64 1
+ %tmp20832 = getelementptr inbounds float* %tmp20831, i64 1
+ %tmp20833 = getelementptr inbounds float* %tmp20832, i64 1
+ %tmp20834 = getelementptr inbounds float* %tmp20833, i64 1
+ %tmp20835 = getelementptr inbounds float* %tmp20834, i64 1
+ %tmp20836 = getelementptr inbounds float* %tmp20835, i64 1
+ %tmp20837 = getelementptr inbounds float* %tmp20836, i64 1
+ %tmp20838 = getelementptr inbounds float* %tmp20837, i64 1
+ %tmp20839 = getelementptr inbounds float* %tmp20838, i64 1
+ %tmp20840 = getelementptr inbounds float* %tmp20839, i64 1
+ %tmp20841 = getelementptr inbounds float* %tmp20840, i64 1
+ %tmp20842 = getelementptr inbounds float* %tmp20841, i64 1
+ %tmp20843 = getelementptr inbounds float* %tmp20842, i64 1
+ %tmp20844 = getelementptr inbounds float* %tmp20843, i64 1
+ %tmp20845 = getelementptr inbounds float* %tmp20844, i64 1
+ %tmp20846 = getelementptr inbounds float* %tmp20845, i64 1
+ %tmp20847 = getelementptr inbounds float* %tmp20846, i64 1
+ %tmp20848 = getelementptr inbounds float* %tmp20847, i64 1
+ %tmp20849 = getelementptr inbounds float* %tmp20848, i64 1
+ %tmp20850 = getelementptr inbounds float* %tmp20849, i64 1
+ %tmp20851 = getelementptr inbounds float* %tmp20850, i64 1
+ %tmp20852 = getelementptr inbounds float* %tmp20851, i64 1
+ %tmp20853 = getelementptr inbounds float* %tmp20852, i64 1
+ %tmp20854 = getelementptr inbounds float* %tmp20853, i64 1
+ %tmp20855 = getelementptr inbounds float* %tmp20854, i64 1
+ %tmp20856 = getelementptr inbounds float* %tmp20855, i64 1
+ %tmp20857 = getelementptr inbounds float* %tmp20856, i64 1
+ %tmp20858 = getelementptr inbounds float* %tmp20857, i64 1
+ %tmp20859 = getelementptr inbounds float* %tmp20858, i64 1
+ %tmp20860 = getelementptr inbounds float* %tmp20859, i64 1
+ %tmp20861 = getelementptr inbounds float* %tmp20860, i64 1
+ %tmp20862 = getelementptr inbounds float* %tmp20861, i64 1
+ %tmp20863 = getelementptr inbounds float* %tmp20862, i64 1
+ %tmp20864 = getelementptr inbounds float* %tmp20863, i64 1
+ %tmp20865 = getelementptr inbounds float* %tmp20864, i64 1
+ %tmp20866 = getelementptr inbounds float* %tmp20865, i64 1
+ %tmp20867 = getelementptr inbounds float* %tmp20866, i64 1
+ %tmp20868 = getelementptr inbounds float* %tmp20867, i64 1
+ %tmp20869 = getelementptr inbounds float* %tmp20868, i64 1
+ %tmp20870 = getelementptr inbounds float* %tmp20869, i64 1
+ %tmp20871 = getelementptr inbounds float* %tmp20870, i64 1
+ %tmp20872 = getelementptr inbounds float* %tmp20871, i64 1
+ %tmp20873 = getelementptr inbounds float* %tmp20872, i64 1
+ %tmp20874 = getelementptr inbounds float* %tmp20873, i64 1
+ %tmp20875 = getelementptr inbounds float* %tmp20874, i64 1
+ %tmp20876 = getelementptr inbounds float* %tmp20875, i64 1
+ %tmp20877 = getelementptr inbounds float* %tmp20876, i64 1
+ %tmp20878 = getelementptr inbounds float* %tmp20877, i64 1
+ %tmp20879 = getelementptr inbounds float* %tmp20878, i64 1
+ %tmp20880 = getelementptr inbounds float* %tmp20879, i64 1
+ %tmp20881 = getelementptr inbounds float* %tmp20880, i64 1
+ %tmp20882 = getelementptr inbounds float* %tmp20881, i64 1
+ %tmp20883 = getelementptr inbounds float* %tmp20882, i64 1
+ %tmp20884 = getelementptr inbounds float* %tmp20883, i64 1
+ %tmp20885 = getelementptr inbounds float* %tmp20884, i64 1
+ %tmp20886 = getelementptr inbounds float* %tmp20885, i64 1
+ %tmp20887 = getelementptr inbounds float* %tmp20886, i64 1
+ %tmp20888 = getelementptr inbounds float* %tmp20887, i64 1
+ %tmp20889 = getelementptr inbounds float* %tmp20888, i64 1
+ %tmp20890 = getelementptr inbounds float* %tmp20889, i64 1
+ %tmp20891 = getelementptr inbounds float* %tmp20890, i64 1
+ %tmp20892 = getelementptr inbounds float* %tmp20891, i64 1
+ %tmp20893 = getelementptr inbounds float* %tmp20892, i64 1
+ %tmp20894 = getelementptr inbounds float* %tmp20893, i64 1
+ %tmp20895 = getelementptr inbounds float* %tmp20894, i64 1
+ %tmp20896 = getelementptr inbounds float* %tmp20895, i64 1
+ %tmp20897 = getelementptr inbounds float* %tmp20896, i64 1
+ %tmp20898 = getelementptr inbounds float* %tmp20897, i64 1
+ %tmp20899 = getelementptr inbounds float* %tmp20898, i64 1
+ %tmp20900 = getelementptr inbounds float* %tmp20899, i64 1
+ %tmp20901 = getelementptr inbounds float* %tmp20900, i64 1
+ %tmp20902 = getelementptr inbounds float* %tmp20901, i64 1
+ %tmp20903 = getelementptr inbounds float* %tmp20902, i64 1
+ %tmp20904 = getelementptr inbounds float* %tmp20903, i64 1
+ %tmp20905 = getelementptr inbounds float* %tmp20904, i64 1
+ %tmp20906 = getelementptr inbounds float* %tmp20905, i64 1
+ %tmp20907 = getelementptr inbounds float* %tmp20906, i64 1
+ %tmp20908 = getelementptr inbounds float* %tmp20907, i64 1
+ %tmp20909 = getelementptr inbounds float* %tmp20908, i64 1
+ %tmp20910 = getelementptr inbounds float* %tmp20909, i64 1
+ %tmp20911 = getelementptr inbounds float* %tmp20910, i64 1
+ %tmp20912 = getelementptr inbounds float* %tmp20911, i64 1
+ %tmp20913 = getelementptr inbounds float* %tmp20912, i64 1
+ %tmp20914 = getelementptr inbounds float* %tmp20913, i64 1
+ %tmp20915 = getelementptr inbounds float* %tmp20914, i64 1
+ %tmp20916 = getelementptr inbounds float* %tmp20915, i64 1
+ %tmp20917 = getelementptr inbounds float* %tmp20916, i64 1
+ %tmp20918 = getelementptr inbounds float* %tmp20917, i64 1
+ %tmp20919 = getelementptr inbounds float* %tmp20918, i64 1
+ %tmp20920 = getelementptr inbounds float* %tmp20919, i64 1
+ %tmp20921 = getelementptr inbounds float* %tmp20920, i64 1
+ %tmp20922 = getelementptr inbounds float* %tmp20921, i64 1
+ %tmp20923 = getelementptr inbounds float* %tmp20922, i64 1
+ %tmp20924 = getelementptr inbounds float* %tmp20923, i64 1
+ %tmp20925 = getelementptr inbounds float* %tmp20924, i64 1
+ %tmp20926 = getelementptr inbounds float* %tmp20925, i64 1
+ %tmp20927 = getelementptr inbounds float* %tmp20926, i64 1
+ %tmp20928 = getelementptr inbounds float* %tmp20927, i64 1
+ %tmp20929 = getelementptr inbounds float* %tmp20928, i64 1
+ %tmp20930 = getelementptr inbounds float* %tmp20929, i64 1
+ %tmp20931 = getelementptr inbounds float* %tmp20930, i64 1
+ %tmp20932 = getelementptr inbounds float* %tmp20931, i64 1
+ %tmp20933 = getelementptr inbounds float* %tmp20932, i64 1
+ %tmp20934 = getelementptr inbounds float* %tmp20933, i64 1
+ %tmp20935 = getelementptr inbounds float* %tmp20934, i64 1
+ %tmp20936 = getelementptr inbounds float* %tmp20935, i64 1
+ %tmp20937 = getelementptr inbounds float* %tmp20936, i64 1
+ %tmp20938 = getelementptr inbounds float* %tmp20937, i64 1
+ %tmp20939 = getelementptr inbounds float* %tmp20938, i64 1
+ %tmp20940 = getelementptr inbounds float* %tmp20939, i64 1
+ %tmp20941 = getelementptr inbounds float* %tmp20940, i64 1
+ %tmp20942 = getelementptr inbounds float* %tmp20941, i64 1
+ %tmp20943 = getelementptr inbounds float* %tmp20942, i64 1
+ %tmp20944 = getelementptr inbounds float* %tmp20943, i64 1
+ %tmp20945 = getelementptr inbounds float* %tmp20944, i64 1
+ %tmp20946 = getelementptr inbounds float* %tmp20945, i64 1
+ %tmp20947 = getelementptr inbounds float* %tmp20946, i64 1
+ %tmp20948 = getelementptr inbounds float* %tmp20947, i64 1
+ %tmp20949 = getelementptr inbounds float* %tmp20948, i64 1
+ %tmp20950 = getelementptr inbounds float* %tmp20949, i64 1
+ %tmp20951 = getelementptr inbounds float* %tmp20950, i64 1
+ %tmp20952 = getelementptr inbounds float* %tmp20951, i64 1
+ %tmp20953 = getelementptr inbounds float* %tmp20952, i64 1
+ %tmp20954 = getelementptr inbounds float* %tmp20953, i64 1
+ %tmp20955 = getelementptr inbounds float* %tmp20954, i64 1
+ %tmp20956 = getelementptr inbounds float* %tmp20955, i64 1
+ %tmp20957 = getelementptr inbounds float* %tmp20956, i64 1
+ %tmp20958 = getelementptr inbounds float* %tmp20957, i64 1
+ %tmp20959 = getelementptr inbounds float* %tmp20958, i64 1
+ %tmp20960 = getelementptr inbounds float* %tmp20959, i64 1
+ %tmp20961 = getelementptr inbounds float* %tmp20960, i64 1
+ %tmp20962 = getelementptr inbounds float* %tmp20961, i64 1
+ %tmp20963 = getelementptr inbounds float* %tmp20962, i64 1
+ %tmp20964 = getelementptr inbounds float* %tmp20963, i64 1
+ %tmp20965 = getelementptr inbounds float* %tmp20964, i64 1
+ %tmp20966 = getelementptr inbounds float* %tmp20965, i64 1
+ %tmp20967 = getelementptr inbounds float* %tmp20966, i64 1
+ %tmp20968 = getelementptr inbounds float* %tmp20967, i64 1
+ %tmp20969 = getelementptr inbounds float* %tmp20968, i64 1
+ %tmp20970 = getelementptr inbounds float* %tmp20969, i64 1
+ %tmp20971 = getelementptr inbounds float* %tmp20970, i64 1
+ %tmp20972 = getelementptr inbounds float* %tmp20971, i64 1
+ %tmp20973 = getelementptr inbounds float* %tmp20972, i64 1
+ %tmp20974 = getelementptr inbounds float* %tmp20973, i64 1
+ %tmp20975 = getelementptr inbounds float* %tmp20974, i64 1
+ %tmp20976 = getelementptr inbounds float* %tmp20975, i64 1
+ %tmp20977 = getelementptr inbounds float* %tmp20976, i64 1
+ %tmp20978 = getelementptr inbounds float* %tmp20977, i64 1
+ %tmp20979 = getelementptr inbounds float* %tmp20978, i64 1
+ %tmp20980 = getelementptr inbounds float* %tmp20979, i64 1
+ %tmp20981 = getelementptr inbounds float* %tmp20980, i64 1
+ %tmp20982 = getelementptr inbounds float* %tmp20981, i64 1
+ %tmp20983 = getelementptr inbounds float* %tmp20982, i64 1
+ %tmp20984 = getelementptr inbounds float* %tmp20983, i64 1
+ %tmp20985 = getelementptr inbounds float* %tmp20984, i64 1
+ %tmp20986 = getelementptr inbounds float* %tmp20985, i64 1
+ %tmp20987 = getelementptr inbounds float* %tmp20986, i64 1
+ %tmp20988 = getelementptr inbounds float* %tmp20987, i64 1
+ %tmp20989 = getelementptr inbounds float* %tmp20988, i64 1
+ %tmp20990 = getelementptr inbounds float* %tmp20989, i64 1
+ %tmp20991 = getelementptr inbounds float* %tmp20990, i64 1
+ %tmp20992 = getelementptr inbounds float* %tmp20991, i64 1
+ %tmp20993 = getelementptr inbounds float* %tmp20992, i64 1
+ %tmp20994 = getelementptr inbounds float* %tmp20993, i64 1
+ %tmp20995 = getelementptr inbounds float* %tmp20994, i64 1
+ %tmp20996 = getelementptr inbounds float* %tmp20995, i64 1
+ %tmp20997 = getelementptr inbounds float* %tmp20996, i64 1
+ %tmp20998 = getelementptr inbounds float* %tmp20997, i64 1
+ %tmp20999 = getelementptr inbounds float* %tmp20998, i64 1
+ %tmp21000 = getelementptr inbounds float* %tmp20999, i64 1
+ %tmp21001 = getelementptr inbounds float* %tmp21000, i64 1
+ %tmp21002 = getelementptr inbounds float* %tmp21001, i64 1
+ %tmp21003 = getelementptr inbounds float* %tmp21002, i64 1
+ %tmp21004 = getelementptr inbounds float* %tmp21003, i64 1
+ %tmp21005 = getelementptr inbounds float* %tmp21004, i64 1
+ %tmp21006 = getelementptr inbounds float* %tmp21005, i64 1
+ %tmp21007 = getelementptr inbounds float* %tmp21006, i64 1
+ %tmp21008 = getelementptr inbounds float* %tmp21007, i64 1
+ %tmp21009 = getelementptr inbounds float* %tmp21008, i64 1
+ %tmp21010 = getelementptr inbounds float* %tmp21009, i64 1
+ %tmp21011 = getelementptr inbounds float* %tmp21010, i64 1
+ %tmp21012 = getelementptr inbounds float* %tmp21011, i64 1
+ %tmp21013 = getelementptr inbounds float* %tmp21012, i64 1
+ %tmp21014 = getelementptr inbounds float* %tmp21013, i64 1
+ %tmp21015 = getelementptr inbounds float* %tmp21014, i64 1
+ %tmp21016 = getelementptr inbounds float* %tmp21015, i64 1
+ %tmp21017 = getelementptr inbounds float* %tmp21016, i64 1
+ %tmp21018 = getelementptr inbounds float* %tmp21017, i64 1
+ %tmp21019 = getelementptr inbounds float* %tmp21018, i64 1
+ %tmp21020 = getelementptr inbounds float* %tmp21019, i64 1
+ %tmp21021 = getelementptr inbounds float* %tmp21020, i64 1
+ %tmp21022 = getelementptr inbounds float* %tmp21021, i64 1
+ %tmp21023 = getelementptr inbounds float* %tmp21022, i64 1
+ %tmp21024 = getelementptr inbounds float* %tmp21023, i64 1
+ %tmp21025 = getelementptr inbounds float* %tmp21024, i64 1
+ %tmp21026 = getelementptr inbounds float* %tmp21025, i64 1
+ %tmp21027 = getelementptr inbounds float* %tmp21026, i64 1
+ %tmp21028 = getelementptr inbounds float* %tmp21027, i64 1
+ %tmp21029 = getelementptr inbounds float* %tmp21028, i64 1
+ %tmp21030 = getelementptr inbounds float* %tmp21029, i64 1
+ %tmp21031 = getelementptr inbounds float* %tmp21030, i64 1
+ %tmp21032 = getelementptr inbounds float* %tmp21031, i64 1
+ %tmp21033 = getelementptr inbounds float* %tmp21032, i64 1
+ %tmp21034 = getelementptr inbounds float* %tmp21033, i64 1
+ %tmp21035 = getelementptr inbounds float* %tmp21034, i64 1
+ %tmp21036 = getelementptr inbounds float* %tmp21035, i64 1
+ %tmp21037 = getelementptr inbounds float* %tmp21036, i64 1
+ %tmp21038 = getelementptr inbounds float* %tmp21037, i64 1
+ %tmp21039 = getelementptr inbounds float* %tmp21038, i64 1
+ %tmp21040 = getelementptr inbounds float* %tmp21039, i64 1
+ %tmp21041 = getelementptr inbounds float* %tmp21040, i64 1
+ %tmp21042 = getelementptr inbounds float* %tmp21041, i64 1
+ %tmp21043 = getelementptr inbounds float* %tmp21042, i64 1
+ %tmp21044 = getelementptr inbounds float* %tmp21043, i64 1
+ %tmp21045 = getelementptr inbounds float* %tmp21044, i64 1
+ %tmp21046 = getelementptr inbounds float* %tmp21045, i64 1
+ %tmp21047 = getelementptr inbounds float* %tmp21046, i64 1
+ %tmp21048 = getelementptr inbounds float* %tmp21047, i64 1
+ %tmp21049 = getelementptr inbounds float* %tmp21048, i64 1
+ %tmp21050 = getelementptr inbounds float* %tmp21049, i64 1
+ %tmp21051 = getelementptr inbounds float* %tmp21050, i64 1
+ %tmp21052 = getelementptr inbounds float* %tmp21051, i64 1
+ %tmp21053 = getelementptr inbounds float* %tmp21052, i64 1
+ %tmp21054 = getelementptr inbounds float* %tmp21053, i64 1
+ %tmp21055 = getelementptr inbounds float* %tmp21054, i64 1
+ %tmp21056 = getelementptr inbounds float* %tmp21055, i64 1
+ %tmp21057 = getelementptr inbounds float* %tmp21056, i64 1
+ %tmp21058 = getelementptr inbounds float* %tmp21057, i64 1
+ %tmp21059 = getelementptr inbounds float* %tmp21058, i64 1
+ %tmp21060 = getelementptr inbounds float* %tmp21059, i64 1
+ %tmp21061 = getelementptr inbounds float* %tmp21060, i64 1
+ %tmp21062 = getelementptr inbounds float* %tmp21061, i64 1
+ %tmp21063 = getelementptr inbounds float* %tmp21062, i64 1
+ %tmp21064 = getelementptr inbounds float* %tmp21063, i64 1
+ %tmp21065 = getelementptr inbounds float* %tmp21064, i64 1
+ %tmp21066 = getelementptr inbounds float* %tmp21065, i64 1
+ %tmp21067 = getelementptr inbounds float* %tmp21066, i64 1
+ %tmp21068 = getelementptr inbounds float* %tmp21067, i64 1
+ %tmp21069 = getelementptr inbounds float* %tmp21068, i64 1
+ %tmp21070 = getelementptr inbounds float* %tmp21069, i64 1
+ %tmp21071 = getelementptr inbounds float* %tmp21070, i64 1
+ %tmp21072 = getelementptr inbounds float* %tmp21071, i64 1
+ %tmp21073 = getelementptr inbounds float* %tmp21072, i64 1
+ %tmp21074 = getelementptr inbounds float* %tmp21073, i64 1
+ %tmp21075 = getelementptr inbounds float* %tmp21074, i64 1
+ %tmp21076 = getelementptr inbounds float* %tmp21075, i64 1
+ %tmp21077 = getelementptr inbounds float* %tmp21076, i64 1
+ %tmp21078 = getelementptr inbounds float* %tmp21077, i64 1
+ %tmp21079 = getelementptr inbounds float* %tmp21078, i64 1
+ %tmp21080 = getelementptr inbounds float* %tmp21079, i64 1
+ %tmp21081 = getelementptr inbounds float* %tmp21080, i64 1
+ %tmp21082 = getelementptr inbounds float* %tmp21081, i64 1
+ %tmp21083 = getelementptr inbounds float* %tmp21082, i64 1
+ %tmp21084 = getelementptr inbounds float* %tmp21083, i64 1
+ %tmp21085 = getelementptr inbounds float* %tmp21084, i64 1
+ %tmp21086 = getelementptr inbounds float* %tmp21085, i64 1
+ %tmp21087 = getelementptr inbounds float* %tmp21086, i64 1
+ %tmp21088 = getelementptr inbounds float* %tmp21087, i64 1
+ %tmp21089 = getelementptr inbounds float* %tmp21088, i64 1
+ %tmp21090 = getelementptr inbounds float* %tmp21089, i64 1
+ %tmp21091 = getelementptr inbounds float* %tmp21090, i64 1
+ %tmp21092 = getelementptr inbounds float* %tmp21091, i64 1
+ %tmp21093 = getelementptr inbounds float* %tmp21092, i64 1
+ %tmp21094 = getelementptr inbounds float* %tmp21093, i64 1
+ %tmp21095 = getelementptr inbounds float* %tmp21094, i64 1
+ %tmp21096 = getelementptr inbounds float* %tmp21095, i64 1
+ %tmp21097 = getelementptr inbounds float* %tmp21096, i64 1
+ %tmp21098 = getelementptr inbounds float* %tmp21097, i64 1
+ %tmp21099 = getelementptr inbounds float* %tmp21098, i64 1
+ %tmp21100 = getelementptr inbounds float* %tmp21099, i64 1
+ %tmp21101 = getelementptr inbounds float* %tmp21100, i64 1
+ %tmp21102 = getelementptr inbounds float* %tmp21101, i64 1
+ %tmp21103 = getelementptr inbounds float* %tmp21102, i64 1
+ %tmp21104 = getelementptr inbounds float* %tmp21103, i64 1
+ %tmp21105 = getelementptr inbounds float* %tmp21104, i64 1
+ %tmp21106 = getelementptr inbounds float* %tmp21105, i64 1
+ %tmp21107 = getelementptr inbounds float* %tmp21106, i64 1
+ %tmp21108 = getelementptr inbounds float* %tmp21107, i64 1
+ %tmp21109 = getelementptr inbounds float* %tmp21108, i64 1
+ %tmp21110 = getelementptr inbounds float* %tmp21109, i64 1
+ %tmp21111 = getelementptr inbounds float* %tmp21110, i64 1
+ %tmp21112 = getelementptr inbounds float* %tmp21111, i64 1
+ %tmp21113 = getelementptr inbounds float* %tmp21112, i64 1
+ %tmp21114 = getelementptr inbounds float* %tmp21113, i64 1
+ %tmp21115 = getelementptr inbounds float* %tmp21114, i64 1
+ %tmp21116 = getelementptr inbounds float* %tmp21115, i64 1
+ %tmp21117 = getelementptr inbounds float* %tmp21116, i64 1
+ %tmp21118 = getelementptr inbounds float* %tmp21117, i64 1
+ %tmp21119 = getelementptr inbounds float* %tmp21118, i64 1
+ %tmp21120 = getelementptr inbounds float* %tmp21119, i64 1
+ %tmp21121 = getelementptr inbounds float* %tmp21120, i64 1
+ %tmp21122 = getelementptr inbounds float* %tmp21121, i64 1
+ %tmp21123 = getelementptr inbounds float* %tmp21122, i64 1
+ %tmp21124 = getelementptr inbounds float* %tmp21123, i64 1
+ %tmp21125 = getelementptr inbounds float* %tmp21124, i64 1
+ %tmp21126 = getelementptr inbounds float* %tmp21125, i64 1
+ %tmp21127 = getelementptr inbounds float* %tmp21126, i64 1
+ %tmp21128 = getelementptr inbounds float* %tmp21127, i64 1
+ %tmp21129 = getelementptr inbounds float* %tmp21128, i64 1
+ %tmp21130 = getelementptr inbounds float* %tmp21129, i64 1
+ %tmp21131 = getelementptr inbounds float* %tmp21130, i64 1
+ %tmp21132 = getelementptr inbounds float* %tmp21131, i64 1
+ %tmp21133 = getelementptr inbounds float* %tmp21132, i64 1
+ %tmp21134 = getelementptr inbounds float* %tmp21133, i64 1
+ %tmp21135 = getelementptr inbounds float* %tmp21134, i64 1
+ %tmp21136 = getelementptr inbounds float* %tmp21135, i64 1
+ %tmp21137 = getelementptr inbounds float* %tmp21136, i64 1
+ %tmp21138 = getelementptr inbounds float* %tmp21137, i64 1
+ %tmp21139 = getelementptr inbounds float* %tmp21138, i64 1
+ %tmp21140 = getelementptr inbounds float* %tmp21139, i64 1
+ %tmp21141 = getelementptr inbounds float* %tmp21140, i64 1
+ %tmp21142 = getelementptr inbounds float* %tmp21141, i64 1
+ %tmp21143 = getelementptr inbounds float* %tmp21142, i64 1
+ %tmp21144 = getelementptr inbounds float* %tmp21143, i64 1
+ %tmp21145 = getelementptr inbounds float* %tmp21144, i64 1
+ %tmp21146 = getelementptr inbounds float* %tmp21145, i64 1
+ %tmp21147 = getelementptr inbounds float* %tmp21146, i64 1
+ %tmp21148 = getelementptr inbounds float* %tmp21147, i64 1
+ %tmp21149 = getelementptr inbounds float* %tmp21148, i64 1
+ %tmp21150 = getelementptr inbounds float* %tmp21149, i64 1
+ %tmp21151 = getelementptr inbounds float* %tmp21150, i64 1
+ %tmp21152 = getelementptr inbounds float* %tmp21151, i64 1
+ %tmp21153 = getelementptr inbounds float* %tmp21152, i64 1
+ %tmp21154 = getelementptr inbounds float* %tmp21153, i64 1
+ %tmp21155 = getelementptr inbounds float* %tmp21154, i64 1
+ %tmp21156 = getelementptr inbounds float* %tmp21155, i64 1
+ %tmp21157 = getelementptr inbounds float* %tmp21156, i64 1
+ %tmp21158 = getelementptr inbounds float* %tmp21157, i64 1
+ %tmp21159 = getelementptr inbounds float* %tmp21158, i64 1
+ %tmp21160 = getelementptr inbounds float* %tmp21159, i64 1
+ %tmp21161 = getelementptr inbounds float* %tmp21160, i64 1
+ %tmp21162 = getelementptr inbounds float* %tmp21161, i64 1
+ %tmp21163 = getelementptr inbounds float* %tmp21162, i64 1
+ %tmp21164 = getelementptr inbounds float* %tmp21163, i64 1
+ %tmp21165 = getelementptr inbounds float* %tmp21164, i64 1
+ %tmp21166 = getelementptr inbounds float* %tmp21165, i64 1
+ %tmp21167 = getelementptr inbounds float* %tmp21166, i64 1
+ %tmp21168 = getelementptr inbounds float* %tmp21167, i64 1
+ %tmp21169 = getelementptr inbounds float* %tmp21168, i64 1
+ %tmp21170 = getelementptr inbounds float* %tmp21169, i64 1
+ %tmp21171 = getelementptr inbounds float* %tmp21170, i64 1
+ %tmp21172 = getelementptr inbounds float* %tmp21171, i64 1
+ %tmp21173 = getelementptr inbounds float* %tmp21172, i64 1
+ %tmp21174 = getelementptr inbounds float* %tmp21173, i64 1
+ %tmp21175 = getelementptr inbounds float* %tmp21174, i64 1
+ %tmp21176 = getelementptr inbounds float* %tmp21175, i64 1
+ %tmp21177 = getelementptr inbounds float* %tmp21176, i64 1
+ %tmp21178 = getelementptr inbounds float* %tmp21177, i64 1
+ %tmp21179 = getelementptr inbounds float* %tmp21178, i64 1
+ %tmp21180 = getelementptr inbounds float* %tmp21179, i64 1
+ %tmp21181 = getelementptr inbounds float* %tmp21180, i64 1
+ %tmp21182 = getelementptr inbounds float* %tmp21181, i64 1
+ %tmp21183 = getelementptr inbounds float* %tmp21182, i64 1
+ %tmp21184 = getelementptr inbounds float* %tmp21183, i64 1
+ %tmp21185 = getelementptr inbounds float* %tmp21184, i64 1
+ %tmp21186 = getelementptr inbounds float* %tmp21185, i64 1
+ %tmp21187 = getelementptr inbounds float* %tmp21186, i64 1
+ %tmp21188 = getelementptr inbounds float* %tmp21187, i64 1
+ %tmp21189 = getelementptr inbounds float* %tmp21188, i64 1
+ %tmp21190 = getelementptr inbounds float* %tmp21189, i64 1
+ %tmp21191 = getelementptr inbounds float* %tmp21190, i64 1
+ %tmp21192 = getelementptr inbounds float* %tmp21191, i64 1
+ %tmp21193 = getelementptr inbounds float* %tmp21192, i64 1
+ %tmp21194 = getelementptr inbounds float* %tmp21193, i64 1
+ %tmp21195 = getelementptr inbounds float* %tmp21194, i64 1
+ %tmp21196 = getelementptr inbounds float* %tmp21195, i64 1
+ %tmp21197 = getelementptr inbounds float* %tmp21196, i64 1
+ %tmp21198 = getelementptr inbounds float* %tmp21197, i64 1
+ %tmp21199 = getelementptr inbounds float* %tmp21198, i64 1
+ %tmp21200 = getelementptr inbounds float* %tmp21199, i64 1
+ %tmp21201 = getelementptr inbounds float* %tmp21200, i64 1
+ %tmp21202 = getelementptr inbounds float* %tmp21201, i64 1
+ %tmp21203 = getelementptr inbounds float* %tmp21202, i64 1
+ %tmp21204 = getelementptr inbounds float* %tmp21203, i64 1
+ %tmp21205 = getelementptr inbounds float* %tmp21204, i64 1
+ %tmp21206 = getelementptr inbounds float* %tmp21205, i64 1
+ %tmp21207 = getelementptr inbounds float* %tmp21206, i64 1
+ %tmp21208 = getelementptr inbounds float* %tmp21207, i64 1
+ %tmp21209 = getelementptr inbounds float* %tmp21208, i64 1
+ %tmp21210 = getelementptr inbounds float* %tmp21209, i64 1
+ %tmp21211 = getelementptr inbounds float* %tmp21210, i64 1
+ %tmp21212 = getelementptr inbounds float* %tmp21211, i64 1
+ %tmp21213 = getelementptr inbounds float* %tmp21212, i64 1
+ %tmp21214 = getelementptr inbounds float* %tmp21213, i64 1
+ %tmp21215 = getelementptr inbounds float* %tmp21214, i64 1
+ %tmp21216 = getelementptr inbounds float* %tmp21215, i64 1
+ %tmp21217 = getelementptr inbounds float* %tmp21216, i64 1
+ %tmp21218 = getelementptr inbounds float* %tmp21217, i64 1
+ %tmp21219 = getelementptr inbounds float* %tmp21218, i64 1
+ %tmp21220 = getelementptr inbounds float* %tmp21219, i64 1
+ %tmp21221 = getelementptr inbounds float* %tmp21220, i64 1
+ %tmp21222 = getelementptr inbounds float* %tmp21221, i64 1
+ %tmp21223 = getelementptr inbounds float* %tmp21222, i64 1
+ %tmp21224 = getelementptr inbounds float* %tmp21223, i64 1
+ %tmp21225 = getelementptr inbounds float* %tmp21224, i64 1
+ %tmp21226 = getelementptr inbounds float* %tmp21225, i64 1
+ %tmp21227 = getelementptr inbounds float* %tmp21226, i64 1
+ %tmp21228 = getelementptr inbounds float* %tmp21227, i64 1
+ %tmp21229 = getelementptr inbounds float* %tmp21228, i64 1
+ %tmp21230 = getelementptr inbounds float* %tmp21229, i64 1
+ %tmp21231 = getelementptr inbounds float* %tmp21230, i64 1
+ %tmp21232 = getelementptr inbounds float* %tmp21231, i64 1
+ %tmp21233 = getelementptr inbounds float* %tmp21232, i64 1
+ %tmp21234 = getelementptr inbounds float* %tmp21233, i64 1
+ %tmp21235 = getelementptr inbounds float* %tmp21234, i64 1
+ %tmp21236 = getelementptr inbounds float* %tmp21235, i64 1
+ %tmp21237 = getelementptr inbounds float* %tmp21236, i64 1
+ %tmp21238 = getelementptr inbounds float* %tmp21237, i64 1
+ %tmp21239 = getelementptr inbounds float* %tmp21238, i64 1
+ %tmp21240 = getelementptr inbounds float* %tmp21239, i64 1
+ %tmp21241 = getelementptr inbounds float* %tmp21240, i64 1
+ %tmp21242 = getelementptr inbounds float* %tmp21241, i64 1
+ %tmp21243 = getelementptr inbounds float* %tmp21242, i64 1
+ %tmp21244 = getelementptr inbounds float* %tmp21243, i64 1
+ %tmp21245 = getelementptr inbounds float* %tmp21244, i64 1
+ %tmp21246 = getelementptr inbounds float* %tmp21245, i64 1
+ %tmp21247 = getelementptr inbounds float* %tmp21246, i64 1
+ %tmp21248 = getelementptr inbounds float* %tmp21247, i64 1
+ %tmp21249 = getelementptr inbounds float* %tmp21248, i64 1
+ %tmp21250 = getelementptr inbounds float* %tmp21249, i64 1
+ %tmp21251 = getelementptr inbounds float* %tmp21250, i64 1
+ %tmp21252 = getelementptr inbounds float* %tmp21251, i64 1
+ %tmp21253 = getelementptr inbounds float* %tmp21252, i64 1
+ %tmp21254 = getelementptr inbounds float* %tmp21253, i64 1
+ %tmp21255 = getelementptr inbounds float* %tmp21254, i64 1
+ %tmp21256 = getelementptr inbounds float* %tmp21255, i64 1
+ %tmp21257 = getelementptr inbounds float* %tmp21256, i64 1
+ %tmp21258 = getelementptr inbounds float* %tmp21257, i64 1
+ %tmp21259 = getelementptr inbounds float* %tmp21258, i64 1
+ %tmp21260 = getelementptr inbounds float* %tmp21259, i64 1
+ %tmp21261 = getelementptr inbounds float* %tmp21260, i64 1
+ %tmp21262 = getelementptr inbounds float* %tmp21261, i64 1
+ %tmp21263 = getelementptr inbounds float* %tmp21262, i64 1
+ %tmp21264 = getelementptr inbounds float* %tmp21263, i64 1
+ %tmp21265 = getelementptr inbounds float* %tmp21264, i64 1
+ %tmp21266 = getelementptr inbounds float* %tmp21265, i64 1
+ %tmp21267 = getelementptr inbounds float* %tmp21266, i64 1
+ %tmp21268 = getelementptr inbounds float* %tmp21267, i64 1
+ %tmp21269 = getelementptr inbounds float* %tmp21268, i64 1
+ %tmp21270 = getelementptr inbounds float* %tmp21269, i64 1
+ %tmp21271 = getelementptr inbounds float* %tmp21270, i64 1
+ %tmp21272 = getelementptr inbounds float* %tmp21271, i64 1
+ %tmp21273 = getelementptr inbounds float* %tmp21272, i64 1
+ %tmp21274 = getelementptr inbounds float* %tmp21273, i64 1
+ %tmp21275 = getelementptr inbounds float* %tmp21274, i64 1
+ %tmp21276 = getelementptr inbounds float* %tmp21275, i64 1
+ %tmp21277 = getelementptr inbounds float* %tmp21276, i64 1
+ %tmp21278 = getelementptr inbounds float* %tmp21277, i64 1
+ %tmp21279 = getelementptr inbounds float* %tmp21278, i64 1
+ %tmp21280 = getelementptr inbounds float* %tmp21279, i64 1
+ %tmp21281 = getelementptr inbounds float* %tmp21280, i64 1
+ %tmp21282 = getelementptr inbounds float* %tmp21281, i64 1
+ %tmp21283 = getelementptr inbounds float* %tmp21282, i64 1
+ %tmp21284 = getelementptr inbounds float* %tmp21283, i64 1
+ %tmp21285 = getelementptr inbounds float* %tmp21284, i64 1
+ %tmp21286 = getelementptr inbounds float* %tmp21285, i64 1
+ %tmp21287 = getelementptr inbounds float* %tmp21286, i64 1
+ %tmp21288 = getelementptr inbounds float* %tmp21287, i64 1
+ %tmp21289 = getelementptr inbounds float* %tmp21288, i64 1
+ %tmp21290 = getelementptr inbounds float* %tmp21289, i64 1
+ %tmp21291 = getelementptr inbounds float* %tmp21290, i64 1
+ %tmp21292 = getelementptr inbounds float* %tmp21291, i64 1
+ %tmp21293 = getelementptr inbounds float* %tmp21292, i64 1
+ %tmp21294 = getelementptr inbounds float* %tmp21293, i64 1
+ %tmp21295 = getelementptr inbounds float* %tmp21294, i64 1
+ %tmp21296 = getelementptr inbounds float* %tmp21295, i64 1
+ %tmp21297 = getelementptr inbounds float* %tmp21296, i64 1
+ %tmp21298 = getelementptr inbounds float* %tmp21297, i64 1
+ %tmp21299 = getelementptr inbounds float* %tmp21298, i64 1
+ %tmp21300 = getelementptr inbounds float* %tmp21299, i64 1
+ %tmp21301 = getelementptr inbounds float* %tmp21300, i64 1
+ %tmp21302 = getelementptr inbounds float* %tmp21301, i64 1
+ %tmp21303 = getelementptr inbounds float* %tmp21302, i64 1
+ %tmp21304 = getelementptr inbounds float* %tmp21303, i64 1
+ %tmp21305 = getelementptr inbounds float* %tmp21304, i64 1
+ %tmp21306 = getelementptr inbounds float* %tmp21305, i64 1
+ %tmp21307 = getelementptr inbounds float* %tmp21306, i64 1
+ %tmp21308 = getelementptr inbounds float* %tmp21307, i64 1
+ %tmp21309 = getelementptr inbounds float* %tmp21308, i64 1
+ %tmp21310 = getelementptr inbounds float* %tmp21309, i64 1
+ %tmp21311 = getelementptr inbounds float* %tmp21310, i64 1
+ %tmp21312 = getelementptr inbounds float* %tmp21311, i64 1
+ %tmp21313 = getelementptr inbounds float* %tmp21312, i64 1
+ %tmp21314 = getelementptr inbounds float* %tmp21313, i64 1
+ %tmp21315 = getelementptr inbounds float* %tmp21314, i64 1
+ %tmp21316 = getelementptr inbounds float* %tmp21315, i64 1
+ %tmp21317 = getelementptr inbounds float* %tmp21316, i64 1
+ %tmp21318 = getelementptr inbounds float* %tmp21317, i64 1
+ %tmp21319 = getelementptr inbounds float* %tmp21318, i64 1
+ %tmp21320 = getelementptr inbounds float* %tmp21319, i64 1
+ %tmp21321 = getelementptr inbounds float* %tmp21320, i64 1
+ %tmp21322 = getelementptr inbounds float* %tmp21321, i64 1
+ %tmp21323 = getelementptr inbounds float* %tmp21322, i64 1
+ %tmp21324 = getelementptr inbounds float* %tmp21323, i64 1
+ %tmp21325 = getelementptr inbounds float* %tmp21324, i64 1
+ %tmp21326 = getelementptr inbounds float* %tmp21325, i64 1
+ %tmp21327 = getelementptr inbounds float* %tmp21326, i64 1
+ %tmp21328 = getelementptr inbounds float* %tmp21327, i64 1
+ %tmp21329 = getelementptr inbounds float* %tmp21328, i64 1
+ %tmp21330 = getelementptr inbounds float* %tmp21329, i64 1
+ %tmp21331 = getelementptr inbounds float* %tmp21330, i64 1
+ %tmp21332 = getelementptr inbounds float* %tmp21331, i64 1
+ %tmp21333 = getelementptr inbounds float* %tmp21332, i64 1
+ %tmp21334 = getelementptr inbounds float* %tmp21333, i64 1
+ %tmp21335 = getelementptr inbounds float* %tmp21334, i64 1
+ %tmp21336 = getelementptr inbounds float* %tmp21335, i64 1
+ %tmp21337 = getelementptr inbounds float* %tmp21336, i64 1
+ %tmp21338 = getelementptr inbounds float* %tmp21337, i64 1
+ %tmp21339 = getelementptr inbounds float* %tmp21338, i64 1
+ %tmp21340 = getelementptr inbounds float* %tmp21339, i64 1
+ %tmp21341 = getelementptr inbounds float* %tmp21340, i64 1
+ %tmp21342 = getelementptr inbounds float* %tmp21341, i64 1
+ %tmp21343 = getelementptr inbounds float* %tmp21342, i64 1
+ %tmp21344 = getelementptr inbounds float* %tmp21343, i64 1
+ %tmp21345 = getelementptr inbounds float* %tmp21344, i64 1
+ %tmp21346 = getelementptr inbounds float* %tmp21345, i64 1
+ %tmp21347 = getelementptr inbounds float* %tmp21346, i64 1
+ %tmp21348 = getelementptr inbounds float* %tmp21347, i64 1
+ %tmp21349 = getelementptr inbounds float* %tmp21348, i64 1
+ %tmp21350 = getelementptr inbounds float* %tmp21349, i64 1
+ %tmp21351 = getelementptr inbounds float* %tmp21350, i64 1
+ %tmp21352 = getelementptr inbounds float* %tmp21351, i64 1
+ %tmp21353 = getelementptr inbounds float* %tmp21352, i64 1
+ %tmp21354 = getelementptr inbounds float* %tmp21353, i64 1
+ %tmp21355 = getelementptr inbounds float* %tmp21354, i64 1
+ %tmp21356 = getelementptr inbounds float* %tmp21355, i64 1
+ %tmp21357 = getelementptr inbounds float* %tmp21356, i64 1
+ %tmp21358 = getelementptr inbounds float* %tmp21357, i64 1
+ %tmp21359 = getelementptr inbounds float* %tmp21358, i64 1
+ %tmp21360 = getelementptr inbounds float* %tmp21359, i64 1
+ %tmp21361 = getelementptr inbounds float* %tmp21360, i64 1
+ %tmp21362 = getelementptr inbounds float* %tmp21361, i64 1
+ %tmp21363 = getelementptr inbounds float* %tmp21362, i64 1
+ %tmp21364 = getelementptr inbounds float* %tmp21363, i64 1
+ %tmp21365 = getelementptr inbounds float* %tmp21364, i64 1
+ %tmp21366 = getelementptr inbounds float* %tmp21365, i64 1
+ %tmp21367 = getelementptr inbounds float* %tmp21366, i64 1
+ %tmp21368 = getelementptr inbounds float* %tmp21367, i64 1
+ %tmp21369 = getelementptr inbounds float* %tmp21368, i64 1
+ %tmp21370 = getelementptr inbounds float* %tmp21369, i64 1
+ %tmp21371 = getelementptr inbounds float* %tmp21370, i64 1
+ %tmp21372 = getelementptr inbounds float* %tmp21371, i64 1
+ %tmp21373 = getelementptr inbounds float* %tmp21372, i64 1
+ %tmp21374 = getelementptr inbounds float* %tmp21373, i64 1
+ %tmp21375 = getelementptr inbounds float* %tmp21374, i64 1
+ %tmp21376 = getelementptr inbounds float* %tmp21375, i64 1
+ %tmp21377 = getelementptr inbounds float* %tmp21376, i64 1
+ %tmp21378 = getelementptr inbounds float* %tmp21377, i64 1
+ %tmp21379 = getelementptr inbounds float* %tmp21378, i64 1
+ %tmp21380 = getelementptr inbounds float* %tmp21379, i64 1
+ %tmp21381 = getelementptr inbounds float* %tmp21380, i64 1
+ %tmp21382 = getelementptr inbounds float* %tmp21381, i64 1
+ %tmp21383 = getelementptr inbounds float* %tmp21382, i64 1
+ %tmp21384 = getelementptr inbounds float* %tmp21383, i64 1
+ %tmp21385 = getelementptr inbounds float* %tmp21384, i64 1
+ %tmp21386 = getelementptr inbounds float* %tmp21385, i64 1
+ %tmp21387 = getelementptr inbounds float* %tmp21386, i64 1
+ %tmp21388 = getelementptr inbounds float* %tmp21387, i64 1
+ %tmp21389 = getelementptr inbounds float* %tmp21388, i64 1
+ %tmp21390 = getelementptr inbounds float* %tmp21389, i64 1
+ %tmp21391 = getelementptr inbounds float* %tmp21390, i64 1
+ %tmp21392 = getelementptr inbounds float* %tmp21391, i64 1
+ %tmp21393 = getelementptr inbounds float* %tmp21392, i64 1
+ %tmp21394 = getelementptr inbounds float* %tmp21393, i64 1
+ %tmp21395 = getelementptr inbounds float* %tmp21394, i64 1
+ %tmp21396 = getelementptr inbounds float* %tmp21395, i64 1
+ %tmp21397 = getelementptr inbounds float* %tmp21396, i64 1
+ %tmp21398 = getelementptr inbounds float* %tmp21397, i64 1
+ %tmp21399 = getelementptr inbounds float* %tmp21398, i64 1
+ %tmp21400 = getelementptr inbounds float* %tmp21399, i64 1
+ %tmp21401 = getelementptr inbounds float* %tmp21400, i64 1
+ %tmp21402 = getelementptr inbounds float* %tmp21401, i64 1
+ %tmp21403 = getelementptr inbounds float* %tmp21402, i64 1
+ %tmp21404 = getelementptr inbounds float* %tmp21403, i64 1
+ %tmp21405 = getelementptr inbounds float* %tmp21404, i64 1
+ %tmp21406 = getelementptr inbounds float* %tmp21405, i64 1
+ %tmp21407 = getelementptr inbounds float* %tmp21406, i64 1
+ %tmp21408 = getelementptr inbounds float* %tmp21407, i64 1
+ %tmp21409 = getelementptr inbounds float* %tmp21408, i64 1
+ %tmp21410 = getelementptr inbounds float* %tmp21409, i64 1
+ %tmp21411 = getelementptr inbounds float* %tmp21410, i64 1
+ %tmp21412 = getelementptr inbounds float* %tmp21411, i64 1
+ %tmp21413 = getelementptr inbounds float* %tmp21412, i64 1
+ %tmp21414 = getelementptr inbounds float* %tmp21413, i64 1
+ %tmp21415 = getelementptr inbounds float* %tmp21414, i64 1
+ %tmp21416 = getelementptr inbounds float* %tmp21415, i64 1
+ %tmp21417 = getelementptr inbounds float* %tmp21416, i64 1
+ %tmp21418 = getelementptr inbounds float* %tmp21417, i64 1
+ %tmp21419 = getelementptr inbounds float* %tmp21418, i64 1
+ %tmp21420 = getelementptr inbounds float* %tmp21419, i64 1
+ %tmp21421 = getelementptr inbounds float* %tmp21420, i64 1
+ %tmp21422 = getelementptr inbounds float* %tmp21421, i64 1
+ %tmp21423 = getelementptr inbounds float* %tmp21422, i64 1
+ %tmp21424 = getelementptr inbounds float* %tmp21423, i64 1
+ %tmp21425 = getelementptr inbounds float* %tmp21424, i64 1
+ %tmp21426 = getelementptr inbounds float* %tmp21425, i64 1
+ %tmp21427 = getelementptr inbounds float* %tmp21426, i64 1
+ %tmp21428 = getelementptr inbounds float* %tmp21427, i64 1
+ %tmp21429 = getelementptr inbounds float* %tmp21428, i64 1
+ %tmp21430 = getelementptr inbounds float* %tmp21429, i64 1
+ %tmp21431 = getelementptr inbounds float* %tmp21430, i64 1
+ %tmp21432 = getelementptr inbounds float* %tmp21431, i64 1
+ %tmp21433 = getelementptr inbounds float* %tmp21432, i64 1
+ %tmp21434 = getelementptr inbounds float* %tmp21433, i64 1
+ %tmp21435 = getelementptr inbounds float* %tmp21434, i64 1
+ %tmp21436 = getelementptr inbounds float* %tmp21435, i64 1
+ %tmp21437 = getelementptr inbounds float* %tmp21436, i64 1
+ %tmp21438 = getelementptr inbounds float* %tmp21437, i64 1
+ %tmp21439 = getelementptr inbounds float* %tmp21438, i64 1
+ %tmp21440 = getelementptr inbounds float* %tmp21439, i64 1
+ %tmp21441 = getelementptr inbounds float* %tmp21440, i64 1
+ %tmp21442 = getelementptr inbounds float* %tmp21441, i64 1
+ %tmp21443 = getelementptr inbounds float* %tmp21442, i64 1
+ %tmp21444 = getelementptr inbounds float* %tmp21443, i64 1
+ %tmp21445 = getelementptr inbounds float* %tmp21444, i64 1
+ %tmp21446 = getelementptr inbounds float* %tmp21445, i64 1
+ %tmp21447 = getelementptr inbounds float* %tmp21446, i64 1
+ %tmp21448 = getelementptr inbounds float* %tmp21447, i64 1
+ %tmp21449 = getelementptr inbounds float* %tmp21448, i64 1
+ %tmp21450 = getelementptr inbounds float* %tmp21449, i64 1
+ %tmp21451 = getelementptr inbounds float* %tmp21450, i64 1
+ %tmp21452 = getelementptr inbounds float* %tmp21451, i64 1
+ %tmp21453 = getelementptr inbounds float* %tmp21452, i64 1
+ %tmp21454 = getelementptr inbounds float* %tmp21453, i64 1
+ %tmp21455 = getelementptr inbounds float* %tmp21454, i64 1
+ %tmp21456 = getelementptr inbounds float* %tmp21455, i64 1
+ %tmp21457 = getelementptr inbounds float* %tmp21456, i64 1
+ %tmp21458 = getelementptr inbounds float* %tmp21457, i64 1
+ %tmp21459 = getelementptr inbounds float* %tmp21458, i64 1
+ %tmp21460 = getelementptr inbounds float* %tmp21459, i64 1
+ %tmp21461 = getelementptr inbounds float* %tmp21460, i64 1
+ %tmp21462 = getelementptr inbounds float* %tmp21461, i64 1
+ %tmp21463 = getelementptr inbounds float* %tmp21462, i64 1
+ %tmp21464 = getelementptr inbounds float* %tmp21463, i64 1
+ %tmp21465 = getelementptr inbounds float* %tmp21464, i64 1
+ %tmp21466 = getelementptr inbounds float* %tmp21465, i64 1
+ %tmp21467 = getelementptr inbounds float* %tmp21466, i64 1
+ %tmp21468 = getelementptr inbounds float* %tmp21467, i64 1
+ %tmp21469 = getelementptr inbounds float* %tmp21468, i64 1
+ %tmp21470 = getelementptr inbounds float* %tmp21469, i64 1
+ %tmp21471 = getelementptr inbounds float* %tmp21470, i64 1
+ %tmp21472 = getelementptr inbounds float* %tmp21471, i64 1
+ %tmp21473 = getelementptr inbounds float* %tmp21472, i64 1
+ %tmp21474 = getelementptr inbounds float* %tmp21473, i64 1
+ %tmp21475 = getelementptr inbounds float* %tmp21474, i64 1
+ %tmp21476 = getelementptr inbounds float* %tmp21475, i64 1
+ %tmp21477 = getelementptr inbounds float* %tmp21476, i64 1
+ %tmp21478 = getelementptr inbounds float* %tmp21477, i64 1
+ %tmp21479 = getelementptr inbounds float* %tmp21478, i64 1
+ %tmp21480 = getelementptr inbounds float* %tmp21479, i64 1
+ %tmp21481 = getelementptr inbounds float* %tmp21480, i64 1
+ %tmp21482 = getelementptr inbounds float* %tmp21481, i64 1
+ %tmp21483 = getelementptr inbounds float* %tmp21482, i64 1
+ %tmp21484 = getelementptr inbounds float* %tmp21483, i64 1
+ %tmp21485 = getelementptr inbounds float* %tmp21484, i64 1
+ %tmp21486 = getelementptr inbounds float* %tmp21485, i64 1
+ %tmp21487 = getelementptr inbounds float* %tmp21486, i64 1
+ %tmp21488 = getelementptr inbounds float* %tmp21487, i64 1
+ %tmp21489 = getelementptr inbounds float* %tmp21488, i64 1
+ %tmp21490 = getelementptr inbounds float* %tmp21489, i64 1
+ %tmp21491 = getelementptr inbounds float* %tmp21490, i64 1
+ %tmp21492 = getelementptr inbounds float* %tmp21491, i64 1
+ %tmp21493 = getelementptr inbounds float* %tmp21492, i64 1
+ %tmp21494 = getelementptr inbounds float* %tmp21493, i64 1
+ %tmp21495 = getelementptr inbounds float* %tmp21494, i64 1
+ %tmp21496 = getelementptr inbounds float* %tmp21495, i64 1
+ %tmp21497 = getelementptr inbounds float* %tmp21496, i64 1
+ %tmp21498 = getelementptr inbounds float* %tmp21497, i64 1
+ %tmp21499 = getelementptr inbounds float* %tmp21498, i64 1
+ %tmp21500 = getelementptr inbounds float* %tmp21499, i64 1
+ %tmp21501 = getelementptr inbounds float* %tmp21500, i64 1
+ %tmp21502 = getelementptr inbounds float* %tmp21501, i64 1
+ %tmp21503 = getelementptr inbounds float* %tmp21502, i64 1
+ %tmp21504 = getelementptr inbounds float* %tmp21503, i64 1
+ %tmp21505 = getelementptr inbounds float* %tmp21504, i64 1
+ %tmp21506 = getelementptr inbounds float* %tmp21505, i64 1
+ %tmp21507 = getelementptr inbounds float* %tmp21506, i64 1
+ %tmp21508 = getelementptr inbounds float* %tmp21507, i64 1
+ %tmp21509 = getelementptr inbounds float* %tmp21508, i64 1
+ %tmp21510 = getelementptr inbounds float* %tmp21509, i64 1
+ %tmp21511 = getelementptr inbounds float* %tmp21510, i64 1
+ %tmp21512 = getelementptr inbounds float* %tmp21511, i64 1
+ %tmp21513 = getelementptr inbounds float* %tmp21512, i64 1
+ %tmp21514 = getelementptr inbounds float* %tmp21513, i64 1
+ %tmp21515 = getelementptr inbounds float* %tmp21514, i64 1
+ %tmp21516 = getelementptr inbounds float* %tmp21515, i64 1
+ %tmp21517 = getelementptr inbounds float* %tmp21516, i64 1
+ %tmp21518 = getelementptr inbounds float* %tmp21517, i64 1
+ %tmp21519 = getelementptr inbounds float* %tmp21518, i64 1
+ %tmp21520 = getelementptr inbounds float* %tmp21519, i64 1
+ %tmp21521 = getelementptr inbounds float* %tmp21520, i64 1
+ %tmp21522 = getelementptr inbounds float* %tmp21521, i64 1
+ %tmp21523 = getelementptr inbounds float* %tmp21522, i64 1
+ %tmp21524 = getelementptr inbounds float* %tmp21523, i64 1
+ %tmp21525 = getelementptr inbounds float* %tmp21524, i64 1
+ %tmp21526 = getelementptr inbounds float* %tmp21525, i64 1
+ %tmp21527 = getelementptr inbounds float* %tmp21526, i64 1
+ %tmp21528 = getelementptr inbounds float* %tmp21527, i64 1
+ %tmp21529 = getelementptr inbounds float* %tmp21528, i64 1
+ %tmp21530 = getelementptr inbounds float* %tmp21529, i64 1
+ %tmp21531 = getelementptr inbounds float* %tmp21530, i64 1
+ %tmp21532 = getelementptr inbounds float* %tmp21531, i64 1
+ %tmp21533 = getelementptr inbounds float* %tmp21532, i64 1
+ %tmp21534 = getelementptr inbounds float* %tmp21533, i64 1
+ %tmp21535 = getelementptr inbounds float* %tmp21534, i64 1
+ %tmp21536 = getelementptr inbounds float* %tmp21535, i64 1
+ %tmp21537 = getelementptr inbounds float* %tmp21536, i64 1
+ %tmp21538 = getelementptr inbounds float* %tmp21537, i64 1
+ %tmp21539 = getelementptr inbounds float* %tmp21538, i64 1
+ %tmp21540 = getelementptr inbounds float* %tmp21539, i64 1
+ %tmp21541 = getelementptr inbounds float* %tmp21540, i64 1
+ %tmp21542 = getelementptr inbounds float* %tmp21541, i64 1
+ %tmp21543 = getelementptr inbounds float* %tmp21542, i64 1
+ %tmp21544 = getelementptr inbounds float* %tmp21543, i64 1
+ %tmp21545 = getelementptr inbounds float* %tmp21544, i64 1
+ %tmp21546 = getelementptr inbounds float* %tmp21545, i64 1
+ %tmp21547 = getelementptr inbounds float* %tmp21546, i64 1
+ %tmp21548 = getelementptr inbounds float* %tmp21547, i64 1
+ %tmp21549 = getelementptr inbounds float* %tmp21548, i64 1
+ %tmp21550 = getelementptr inbounds float* %tmp21549, i64 1
+ %tmp21551 = getelementptr inbounds float* %tmp21550, i64 1
+ %tmp21552 = getelementptr inbounds float* %tmp21551, i64 1
+ %tmp21553 = getelementptr inbounds float* %tmp21552, i64 1
+ %tmp21554 = getelementptr inbounds float* %tmp21553, i64 1
+ %tmp21555 = getelementptr inbounds float* %tmp21554, i64 1
+ %tmp21556 = getelementptr inbounds float* %tmp21555, i64 1
+ %tmp21557 = getelementptr inbounds float* %tmp21556, i64 1
+ %tmp21558 = getelementptr inbounds float* %tmp21557, i64 1
+ %tmp21559 = getelementptr inbounds float* %tmp21558, i64 1
+ %tmp21560 = getelementptr inbounds float* %tmp21559, i64 1
+ %tmp21561 = getelementptr inbounds float* %tmp21560, i64 1
+ %tmp21562 = getelementptr inbounds float* %tmp21561, i64 1
+ %tmp21563 = getelementptr inbounds float* %tmp21562, i64 1
+ %tmp21564 = getelementptr inbounds float* %tmp21563, i64 1
+ %tmp21565 = getelementptr inbounds float* %tmp21564, i64 1
+ %tmp21566 = getelementptr inbounds float* %tmp21565, i64 1
+ %tmp21567 = getelementptr inbounds float* %tmp21566, i64 1
+ %tmp21568 = getelementptr inbounds float* %tmp21567, i64 1
+ %tmp21569 = getelementptr inbounds float* %tmp21568, i64 1
+ %tmp21570 = getelementptr inbounds float* %tmp21569, i64 1
+ %tmp21571 = getelementptr inbounds float* %tmp21570, i64 1
+ %tmp21572 = getelementptr inbounds float* %tmp21571, i64 1
+ %tmp21573 = getelementptr inbounds float* %tmp21572, i64 1
+ %tmp21574 = getelementptr inbounds float* %tmp21573, i64 1
+ %tmp21575 = getelementptr inbounds float* %tmp21574, i64 1
+ %tmp21576 = getelementptr inbounds float* %tmp21575, i64 1
+ %tmp21577 = getelementptr inbounds float* %tmp21576, i64 1
+ %tmp21578 = getelementptr inbounds float* %tmp21577, i64 1
+ %tmp21579 = getelementptr inbounds float* %tmp21578, i64 1
+ %tmp21580 = getelementptr inbounds float* %tmp21579, i64 1
+ %tmp21581 = getelementptr inbounds float* %tmp21580, i64 1
+ %tmp21582 = getelementptr inbounds float* %tmp21581, i64 1
+ %tmp21583 = getelementptr inbounds float* %tmp21582, i64 1
+ %tmp21584 = getelementptr inbounds float* %tmp21583, i64 1
+ %tmp21585 = getelementptr inbounds float* %tmp21584, i64 1
+ %tmp21586 = getelementptr inbounds float* %tmp21585, i64 1
+ %tmp21587 = getelementptr inbounds float* %tmp21586, i64 1
+ %tmp21588 = getelementptr inbounds float* %tmp21587, i64 1
+ %tmp21589 = getelementptr inbounds float* %tmp21588, i64 1
+ %tmp21590 = getelementptr inbounds float* %tmp21589, i64 1
+ %tmp21591 = getelementptr inbounds float* %tmp21590, i64 1
+ %tmp21592 = getelementptr inbounds float* %tmp21591, i64 1
+ %tmp21593 = getelementptr inbounds float* %tmp21592, i64 1
+ %tmp21594 = getelementptr inbounds float* %tmp21593, i64 1
+ %tmp21595 = getelementptr inbounds float* %tmp21594, i64 1
+ %tmp21596 = getelementptr inbounds float* %tmp21595, i64 1
+ %tmp21597 = getelementptr inbounds float* %tmp21596, i64 1
+ %tmp21598 = getelementptr inbounds float* %tmp21597, i64 1
+ %tmp21599 = getelementptr inbounds float* %tmp21598, i64 1
+ %tmp21600 = getelementptr inbounds float* %tmp21599, i64 1
+ %tmp21601 = getelementptr inbounds float* %tmp21600, i64 1
+ %tmp21602 = getelementptr inbounds float* %tmp21601, i64 1
+ %tmp21603 = getelementptr inbounds float* %tmp21602, i64 1
+ %tmp21604 = getelementptr inbounds float* %tmp21603, i64 1
+ %tmp21605 = getelementptr inbounds float* %tmp21604, i64 1
+ %tmp21606 = getelementptr inbounds float* %tmp21605, i64 1
+ %tmp21607 = getelementptr inbounds float* %tmp21606, i64 1
+ %tmp21608 = getelementptr inbounds float* %tmp21607, i64 1
+ %tmp21609 = getelementptr inbounds float* %tmp21608, i64 1
+ %tmp21610 = getelementptr inbounds float* %tmp21609, i64 1
+ %tmp21611 = getelementptr inbounds float* %tmp21610, i64 1
+ %tmp21612 = getelementptr inbounds float* %tmp21611, i64 1
+ %tmp21613 = getelementptr inbounds float* %tmp21612, i64 1
+ %tmp21614 = getelementptr inbounds float* %tmp21613, i64 1
+ %tmp21615 = getelementptr inbounds float* %tmp21614, i64 1
+ %tmp21616 = getelementptr inbounds float* %tmp21615, i64 1
+ %tmp21617 = getelementptr inbounds float* %tmp21616, i64 1
+ %tmp21618 = getelementptr inbounds float* %tmp21617, i64 1
+ %tmp21619 = getelementptr inbounds float* %tmp21618, i64 1
+ %tmp21620 = getelementptr inbounds float* %tmp21619, i64 1
+ %tmp21621 = getelementptr inbounds float* %tmp21620, i64 1
+ %tmp21622 = getelementptr inbounds float* %tmp21621, i64 1
+ %tmp21623 = getelementptr inbounds float* %tmp21622, i64 1
+ %tmp21624 = getelementptr inbounds float* %tmp21623, i64 1
+ %tmp21625 = getelementptr inbounds float* %tmp21624, i64 1
+ %tmp21626 = getelementptr inbounds float* %tmp21625, i64 1
+ %tmp21627 = getelementptr inbounds float* %tmp21626, i64 1
+ %tmp21628 = getelementptr inbounds float* %tmp21627, i64 1
+ %tmp21629 = getelementptr inbounds float* %tmp21628, i64 1
+ %tmp21630 = getelementptr inbounds float* %tmp21629, i64 1
+ %tmp21631 = getelementptr inbounds float* %tmp21630, i64 1
+ %tmp21632 = getelementptr inbounds float* %tmp21631, i64 1
+ %tmp21633 = getelementptr inbounds float* %tmp21632, i64 1
+ %tmp21634 = getelementptr inbounds float* %tmp21633, i64 1
+ %tmp21635 = getelementptr inbounds float* %tmp21634, i64 1
+ %tmp21636 = getelementptr inbounds float* %tmp21635, i64 1
+ %tmp21637 = getelementptr inbounds float* %tmp21636, i64 1
+ %tmp21638 = getelementptr inbounds float* %tmp21637, i64 1
+ %tmp21639 = getelementptr inbounds float* %tmp21638, i64 1
+ %tmp21640 = getelementptr inbounds float* %tmp21639, i64 1
+ %tmp21641 = getelementptr inbounds float* %tmp21640, i64 1
+ %tmp21642 = getelementptr inbounds float* %tmp21641, i64 1
+ %tmp21643 = getelementptr inbounds float* %tmp21642, i64 1
+ %tmp21644 = getelementptr inbounds float* %tmp21643, i64 1
+ %tmp21645 = getelementptr inbounds float* %tmp21644, i64 1
+ %tmp21646 = getelementptr inbounds float* %tmp21645, i64 1
+ %tmp21647 = getelementptr inbounds float* %tmp21646, i64 1
+ %tmp21648 = getelementptr inbounds float* %tmp21647, i64 1
+ %tmp21649 = getelementptr inbounds float* %tmp21648, i64 1
+ %tmp21650 = getelementptr inbounds float* %tmp21649, i64 1
+ %tmp21651 = getelementptr inbounds float* %tmp21650, i64 1
+ %tmp21652 = getelementptr inbounds float* %tmp21651, i64 1
+ %tmp21653 = getelementptr inbounds float* %tmp21652, i64 1
+ %tmp21654 = getelementptr inbounds float* %tmp21653, i64 1
+ %tmp21655 = getelementptr inbounds float* %tmp21654, i64 1
+ %tmp21656 = getelementptr inbounds float* %tmp21655, i64 1
+ %tmp21657 = getelementptr inbounds float* %tmp21656, i64 1
+ %tmp21658 = getelementptr inbounds float* %tmp21657, i64 1
+ %tmp21659 = getelementptr inbounds float* %tmp21658, i64 1
+ %tmp21660 = getelementptr inbounds float* %tmp21659, i64 1
+ %tmp21661 = getelementptr inbounds float* %tmp21660, i64 1
+ %tmp21662 = getelementptr inbounds float* %tmp21661, i64 1
+ %tmp21663 = getelementptr inbounds float* %tmp21662, i64 1
+ %tmp21664 = getelementptr inbounds float* %tmp21663, i64 1
+ %tmp21665 = getelementptr inbounds float* %tmp21664, i64 1
+ %tmp21666 = getelementptr inbounds float* %tmp21665, i64 1
+ %tmp21667 = getelementptr inbounds float* %tmp21666, i64 1
+ %tmp21668 = getelementptr inbounds float* %tmp21667, i64 1
+ %tmp21669 = getelementptr inbounds float* %tmp21668, i64 1
+ %tmp21670 = getelementptr inbounds float* %tmp21669, i64 1
+ %tmp21671 = getelementptr inbounds float* %tmp21670, i64 1
+ %tmp21672 = getelementptr inbounds float* %tmp21671, i64 1
+ %tmp21673 = getelementptr inbounds float* %tmp21672, i64 1
+ %tmp21674 = getelementptr inbounds float* %tmp21673, i64 1
+ %tmp21675 = getelementptr inbounds float* %tmp21674, i64 1
+ %tmp21676 = getelementptr inbounds float* %tmp21675, i64 1
+ %tmp21677 = getelementptr inbounds float* %tmp21676, i64 1
+ %tmp21678 = getelementptr inbounds float* %tmp21677, i64 1
+ %tmp21679 = getelementptr inbounds float* %tmp21678, i64 1
+ %tmp21680 = getelementptr inbounds float* %tmp21679, i64 1
+ %tmp21681 = getelementptr inbounds float* %tmp21680, i64 1
+ %tmp21682 = getelementptr inbounds float* %tmp21681, i64 1
+ %tmp21683 = getelementptr inbounds float* %tmp21682, i64 1
+ %tmp21684 = getelementptr inbounds float* %tmp21683, i64 1
+ %tmp21685 = getelementptr inbounds float* %tmp21684, i64 1
+ %tmp21686 = getelementptr inbounds float* %tmp21685, i64 1
+ %tmp21687 = getelementptr inbounds float* %tmp21686, i64 1
+ %tmp21688 = getelementptr inbounds float* %tmp21687, i64 1
+ %tmp21689 = getelementptr inbounds float* %tmp21688, i64 1
+ %tmp21690 = getelementptr inbounds float* %tmp21689, i64 1
+ %tmp21691 = getelementptr inbounds float* %tmp21690, i64 1
+ %tmp21692 = getelementptr inbounds float* %tmp21691, i64 1
+ %tmp21693 = getelementptr inbounds float* %tmp21692, i64 1
+ %tmp21694 = getelementptr inbounds float* %tmp21693, i64 1
+ %tmp21695 = getelementptr inbounds float* %tmp21694, i64 1
+ %tmp21696 = getelementptr inbounds float* %tmp21695, i64 1
+ %tmp21697 = getelementptr inbounds float* %tmp21696, i64 1
+ %tmp21698 = getelementptr inbounds float* %tmp21697, i64 1
+ %tmp21699 = getelementptr inbounds float* %tmp21698, i64 1
+ %tmp21700 = getelementptr inbounds float* %tmp21699, i64 1
+ %tmp21701 = getelementptr inbounds float* %tmp21700, i64 1
+ %tmp21702 = getelementptr inbounds float* %tmp21701, i64 1
+ %tmp21703 = getelementptr inbounds float* %tmp21702, i64 1
+ %tmp21704 = getelementptr inbounds float* %tmp21703, i64 1
+ %tmp21705 = getelementptr inbounds float* %tmp21704, i64 1
+ %tmp21706 = getelementptr inbounds float* %tmp21705, i64 1
+ %tmp21707 = getelementptr inbounds float* %tmp21706, i64 1
+ %tmp21708 = getelementptr inbounds float* %tmp21707, i64 1
+ %tmp21709 = getelementptr inbounds float* %tmp21708, i64 1
+ %tmp21710 = getelementptr inbounds float* %tmp21709, i64 1
+ %tmp21711 = getelementptr inbounds float* %tmp21710, i64 1
+ %tmp21712 = getelementptr inbounds float* %tmp21711, i64 1
+ %tmp21713 = getelementptr inbounds float* %tmp21712, i64 1
+ %tmp21714 = getelementptr inbounds float* %tmp21713, i64 1
+ %tmp21715 = getelementptr inbounds float* %tmp21714, i64 1
+ %tmp21716 = getelementptr inbounds float* %tmp21715, i64 1
+ %tmp21717 = getelementptr inbounds float* %tmp21716, i64 1
+ %tmp21718 = getelementptr inbounds float* %tmp21717, i64 1
+ %tmp21719 = getelementptr inbounds float* %tmp21718, i64 1
+ %tmp21720 = getelementptr inbounds float* %tmp21719, i64 1
+ %tmp21721 = getelementptr inbounds float* %tmp21720, i64 1
+ %tmp21722 = getelementptr inbounds float* %tmp21721, i64 1
+ %tmp21723 = getelementptr inbounds float* %tmp21722, i64 1
+ %tmp21724 = getelementptr inbounds float* %tmp21723, i64 1
+ %tmp21725 = getelementptr inbounds float* %tmp21724, i64 1
+ %tmp21726 = getelementptr inbounds float* %tmp21725, i64 1
+ %tmp21727 = getelementptr inbounds float* %tmp21726, i64 1
+ %tmp21728 = getelementptr inbounds float* %tmp21727, i64 1
+ %tmp21729 = getelementptr inbounds float* %tmp21728, i64 1
+ %tmp21730 = getelementptr inbounds float* %tmp21729, i64 1
+ %tmp21731 = getelementptr inbounds float* %tmp21730, i64 1
+ %tmp21732 = getelementptr inbounds float* %tmp21731, i64 1
+ %tmp21733 = getelementptr inbounds float* %tmp21732, i64 1
+ %tmp21734 = getelementptr inbounds float* %tmp21733, i64 1
+ %tmp21735 = getelementptr inbounds float* %tmp21734, i64 1
+ %tmp21736 = getelementptr inbounds float* %tmp21735, i64 1
+ %tmp21737 = getelementptr inbounds float* %tmp21736, i64 1
+ %tmp21738 = getelementptr inbounds float* %tmp21737, i64 1
+ %tmp21739 = getelementptr inbounds float* %tmp21738, i64 1
+ %tmp21740 = getelementptr inbounds float* %tmp21739, i64 1
+ %tmp21741 = getelementptr inbounds float* %tmp21740, i64 1
+ %tmp21742 = getelementptr inbounds float* %tmp21741, i64 1
+ %tmp21743 = getelementptr inbounds float* %tmp21742, i64 1
+ %tmp21744 = getelementptr inbounds float* %tmp21743, i64 1
+ %tmp21745 = getelementptr inbounds float* %tmp21744, i64 1
+ %tmp21746 = getelementptr inbounds float* %tmp21745, i64 1
+ %tmp21747 = getelementptr inbounds float* %tmp21746, i64 1
+ %tmp21748 = getelementptr inbounds float* %tmp21747, i64 1
+ %tmp21749 = getelementptr inbounds float* %tmp21748, i64 1
+ %tmp21750 = getelementptr inbounds float* %tmp21749, i64 1
+ %tmp21751 = getelementptr inbounds float* %tmp21750, i64 1
+ %tmp21752 = getelementptr inbounds float* %tmp21751, i64 1
+ %tmp21753 = getelementptr inbounds float* %tmp21752, i64 1
+ %tmp21754 = getelementptr inbounds float* %tmp21753, i64 1
+ %tmp21755 = getelementptr inbounds float* %tmp21754, i64 1
+ %tmp21756 = getelementptr inbounds float* %tmp21755, i64 1
+ %tmp21757 = getelementptr inbounds float* %tmp21756, i64 1
+ %tmp21758 = getelementptr inbounds float* %tmp21757, i64 1
+ %tmp21759 = getelementptr inbounds float* %tmp21758, i64 1
+ %tmp21760 = getelementptr inbounds float* %tmp21759, i64 1
+ %tmp21761 = getelementptr inbounds float* %tmp21760, i64 1
+ %tmp21762 = getelementptr inbounds float* %tmp21761, i64 1
+ %tmp21763 = getelementptr inbounds float* %tmp21762, i64 1
+ %tmp21764 = getelementptr inbounds float* %tmp21763, i64 1
+ %tmp21765 = getelementptr inbounds float* %tmp21764, i64 1
+ %tmp21766 = getelementptr inbounds float* %tmp21765, i64 1
+ %tmp21767 = getelementptr inbounds float* %tmp21766, i64 1
+ %tmp21768 = getelementptr inbounds float* %tmp21767, i64 1
+ %tmp21769 = getelementptr inbounds float* %tmp21768, i64 1
+ %tmp21770 = getelementptr inbounds float* %tmp21769, i64 1
+ %tmp21771 = getelementptr inbounds float* %tmp21770, i64 1
+ %tmp21772 = getelementptr inbounds float* %tmp21771, i64 1
+ %tmp21773 = getelementptr inbounds float* %tmp21772, i64 1
+ %tmp21774 = getelementptr inbounds float* %tmp21773, i64 1
+ %tmp21775 = getelementptr inbounds float* %tmp21774, i64 1
+ %tmp21776 = getelementptr inbounds float* %tmp21775, i64 1
+ %tmp21777 = getelementptr inbounds float* %tmp21776, i64 1
+ %tmp21778 = getelementptr inbounds float* %tmp21777, i64 1
+ %tmp21779 = getelementptr inbounds float* %tmp21778, i64 1
+ %tmp21780 = getelementptr inbounds float* %tmp21779, i64 1
+ %tmp21781 = getelementptr inbounds float* %tmp21780, i64 1
+ %tmp21782 = getelementptr inbounds float* %tmp21781, i64 1
+ %tmp21783 = getelementptr inbounds float* %tmp21782, i64 1
+ %tmp21784 = getelementptr inbounds float* %tmp21783, i64 1
+ %tmp21785 = getelementptr inbounds float* %tmp21784, i64 1
+ %tmp21786 = getelementptr inbounds float* %tmp21785, i64 1
+ %tmp21787 = getelementptr inbounds float* %tmp21786, i64 1
+ %tmp21788 = getelementptr inbounds float* %tmp21787, i64 1
+ %tmp21789 = getelementptr inbounds float* %tmp21788, i64 1
+ %tmp21790 = getelementptr inbounds float* %tmp21789, i64 1
+ %tmp21791 = getelementptr inbounds float* %tmp21790, i64 1
+ %tmp21792 = getelementptr inbounds float* %tmp21791, i64 1
+ %tmp21793 = getelementptr inbounds float* %tmp21792, i64 1
+ %tmp21794 = getelementptr inbounds float* %tmp21793, i64 1
+ %tmp21795 = getelementptr inbounds float* %tmp21794, i64 1
+ %tmp21796 = getelementptr inbounds float* %tmp21795, i64 1
+ %tmp21797 = getelementptr inbounds float* %tmp21796, i64 1
+ %tmp21798 = getelementptr inbounds float* %tmp21797, i64 1
+ %tmp21799 = getelementptr inbounds float* %tmp21798, i64 1
+ %tmp21800 = getelementptr inbounds float* %tmp21799, i64 1
+ %tmp21801 = getelementptr inbounds float* %tmp21800, i64 1
+ %tmp21802 = getelementptr inbounds float* %tmp21801, i64 1
+ %tmp21803 = getelementptr inbounds float* %tmp21802, i64 1
+ %tmp21804 = getelementptr inbounds float* %tmp21803, i64 1
+ %tmp21805 = getelementptr inbounds float* %tmp21804, i64 1
+ %tmp21806 = getelementptr inbounds float* %tmp21805, i64 1
+ %tmp21807 = getelementptr inbounds float* %tmp21806, i64 1
+ %tmp21808 = getelementptr inbounds float* %tmp21807, i64 1
+ %tmp21809 = getelementptr inbounds float* %tmp21808, i64 1
+ %tmp21810 = getelementptr inbounds float* %tmp21809, i64 1
+ %tmp21811 = getelementptr inbounds float* %tmp21810, i64 1
+ %tmp21812 = getelementptr inbounds float* %tmp21811, i64 1
+ %tmp21813 = getelementptr inbounds float* %tmp21812, i64 1
+ %tmp21814 = getelementptr inbounds float* %tmp21813, i64 1
+ %tmp21815 = getelementptr inbounds float* %tmp21814, i64 1
+ %tmp21816 = getelementptr inbounds float* %tmp21815, i64 1
+ %tmp21817 = getelementptr inbounds float* %tmp21816, i64 1
+ %tmp21818 = getelementptr inbounds float* %tmp21817, i64 1
+ %tmp21819 = getelementptr inbounds float* %tmp21818, i64 1
+ %tmp21820 = getelementptr inbounds float* %tmp21819, i64 1
+ %tmp21821 = getelementptr inbounds float* %tmp21820, i64 1
+ %tmp21822 = getelementptr inbounds float* %tmp21821, i64 1
+ %tmp21823 = getelementptr inbounds float* %tmp21822, i64 1
+ %tmp21824 = getelementptr inbounds float* %tmp21823, i64 1
+ %tmp21825 = getelementptr inbounds float* %tmp21824, i64 1
+ %tmp21826 = getelementptr inbounds float* %tmp21825, i64 1
+ %tmp21827 = getelementptr inbounds float* %tmp21826, i64 1
+ %tmp21828 = getelementptr inbounds float* %tmp21827, i64 1
+ %tmp21829 = getelementptr inbounds float* %tmp21828, i64 1
+ %tmp21830 = getelementptr inbounds float* %tmp21829, i64 1
+ %tmp21831 = getelementptr inbounds float* %tmp21830, i64 1
+ %tmp21832 = getelementptr inbounds float* %tmp21831, i64 1
+ %tmp21833 = getelementptr inbounds float* %tmp21832, i64 1
+ %tmp21834 = getelementptr inbounds float* %tmp21833, i64 1
+ %tmp21835 = getelementptr inbounds float* %tmp21834, i64 1
+ %tmp21836 = getelementptr inbounds float* %tmp21835, i64 1
+ %tmp21837 = getelementptr inbounds float* %tmp21836, i64 1
+ %tmp21838 = getelementptr inbounds float* %tmp21837, i64 1
+ %tmp21839 = getelementptr inbounds float* %tmp21838, i64 1
+ %tmp21840 = getelementptr inbounds float* %tmp21839, i64 1
+ %tmp21841 = getelementptr inbounds float* %tmp21840, i64 1
+ %tmp21842 = getelementptr inbounds float* %tmp21841, i64 1
+ %tmp21843 = getelementptr inbounds float* %tmp21842, i64 1
+ %tmp21844 = getelementptr inbounds float* %tmp21843, i64 1
+ %tmp21845 = getelementptr inbounds float* %tmp21844, i64 1
+ %tmp21846 = getelementptr inbounds float* %tmp21845, i64 1
+ %tmp21847 = getelementptr inbounds float* %tmp21846, i64 1
+ %tmp21848 = getelementptr inbounds float* %tmp21847, i64 1
+ %tmp21849 = getelementptr inbounds float* %tmp21848, i64 1
+ %tmp21850 = getelementptr inbounds float* %tmp21849, i64 1
+ %tmp21851 = getelementptr inbounds float* %tmp21850, i64 1
+ %tmp21852 = getelementptr inbounds float* %tmp21851, i64 1
+ %tmp21853 = getelementptr inbounds float* %tmp21852, i64 1
+ %tmp21854 = getelementptr inbounds float* %tmp21853, i64 1
+ %tmp21855 = getelementptr inbounds float* %tmp21854, i64 1
+ %tmp21856 = getelementptr inbounds float* %tmp21855, i64 1
+ %tmp21857 = getelementptr inbounds float* %tmp21856, i64 1
+ %tmp21858 = getelementptr inbounds float* %tmp21857, i64 1
+ %tmp21859 = getelementptr inbounds float* %tmp21858, i64 1
+ %tmp21860 = getelementptr inbounds float* %tmp21859, i64 1
+ %tmp21861 = getelementptr inbounds float* %tmp21860, i64 1
+ %tmp21862 = getelementptr inbounds float* %tmp21861, i64 1
+ %tmp21863 = getelementptr inbounds float* %tmp21862, i64 1
+ %tmp21864 = getelementptr inbounds float* %tmp21863, i64 1
+ %tmp21865 = getelementptr inbounds float* %tmp21864, i64 1
+ %tmp21866 = getelementptr inbounds float* %tmp21865, i64 1
+ %tmp21867 = getelementptr inbounds float* %tmp21866, i64 1
+ %tmp21868 = getelementptr inbounds float* %tmp21867, i64 1
+ %tmp21869 = getelementptr inbounds float* %tmp21868, i64 1
+ %tmp21870 = getelementptr inbounds float* %tmp21869, i64 1
+ %tmp21871 = getelementptr inbounds float* %tmp21870, i64 1
+ %tmp21872 = getelementptr inbounds float* %tmp21871, i64 1
+ %tmp21873 = getelementptr inbounds float* %tmp21872, i64 1
+ %tmp21874 = getelementptr inbounds float* %tmp21873, i64 1
+ %tmp21875 = getelementptr inbounds float* %tmp21874, i64 1
+ %tmp21876 = getelementptr inbounds float* %tmp21875, i64 1
+ %tmp21877 = getelementptr inbounds float* %tmp21876, i64 1
+ %tmp21878 = getelementptr inbounds float* %tmp21877, i64 1
+ %tmp21879 = getelementptr inbounds float* %tmp21878, i64 1
+ %tmp21880 = getelementptr inbounds float* %tmp21879, i64 1
+ %tmp21881 = getelementptr inbounds float* %tmp21880, i64 1
+ %tmp21882 = getelementptr inbounds float* %tmp21881, i64 1
+ %tmp21883 = getelementptr inbounds float* %tmp21882, i64 1
+ %tmp21884 = getelementptr inbounds float* %tmp21883, i64 1
+ %tmp21885 = getelementptr inbounds float* %tmp21884, i64 1
+ %tmp21886 = getelementptr inbounds float* %tmp21885, i64 1
+ %tmp21887 = getelementptr inbounds float* %tmp21886, i64 1
+ %tmp21888 = getelementptr inbounds float* %tmp21887, i64 1
+ %tmp21889 = getelementptr inbounds float* %tmp21888, i64 1
+ %tmp21890 = getelementptr inbounds float* %tmp21889, i64 1
+ %tmp21891 = getelementptr inbounds float* %tmp21890, i64 1
+ %tmp21892 = getelementptr inbounds float* %tmp21891, i64 1
+ %tmp21893 = getelementptr inbounds float* %tmp21892, i64 1
+ %tmp21894 = getelementptr inbounds float* %tmp21893, i64 1
+ %tmp21895 = getelementptr inbounds float* %tmp21894, i64 1
+ %tmp21896 = getelementptr inbounds float* %tmp21895, i64 1
+ %tmp21897 = getelementptr inbounds float* %tmp21896, i64 1
+ %tmp21898 = getelementptr inbounds float* %tmp21897, i64 1
+ %tmp21899 = getelementptr inbounds float* %tmp21898, i64 1
+ %tmp21900 = getelementptr inbounds float* %tmp21899, i64 1
+ %tmp21901 = getelementptr inbounds float* %tmp21900, i64 1
+ %tmp21902 = getelementptr inbounds float* %tmp21901, i64 1
+ %tmp21903 = getelementptr inbounds float* %tmp21902, i64 1
+ %tmp21904 = getelementptr inbounds float* %tmp21903, i64 1
+ %tmp21905 = getelementptr inbounds float* %tmp21904, i64 1
+ %tmp21906 = getelementptr inbounds float* %tmp21905, i64 1
+ %tmp21907 = getelementptr inbounds float* %tmp21906, i64 1
+ %tmp21908 = getelementptr inbounds float* %tmp21907, i64 1
+ %tmp21909 = getelementptr inbounds float* %tmp21908, i64 1
+ %tmp21910 = getelementptr inbounds float* %tmp21909, i64 1
+ %tmp21911 = getelementptr inbounds float* %tmp21910, i64 1
+ %tmp21912 = getelementptr inbounds float* %tmp21911, i64 1
+ %tmp21913 = getelementptr inbounds float* %tmp21912, i64 1
+ %tmp21914 = getelementptr inbounds float* %tmp21913, i64 1
+ %tmp21915 = getelementptr inbounds float* %tmp21914, i64 1
+ %tmp21916 = getelementptr inbounds float* %tmp21915, i64 1
+ %tmp21917 = getelementptr inbounds float* %tmp21916, i64 1
+ %tmp21918 = getelementptr inbounds float* %tmp21917, i64 1
+ %tmp21919 = getelementptr inbounds float* %tmp21918, i64 1
+ %tmp21920 = getelementptr inbounds float* %tmp21919, i64 1
+ %tmp21921 = getelementptr inbounds float* %tmp21920, i64 1
+ %tmp21922 = getelementptr inbounds float* %tmp21921, i64 1
+ %tmp21923 = getelementptr inbounds float* %tmp21922, i64 1
+ %tmp21924 = getelementptr inbounds float* %tmp21923, i64 1
+ %tmp21925 = getelementptr inbounds float* %tmp21924, i64 1
+ %tmp21926 = getelementptr inbounds float* %tmp21925, i64 1
+ %tmp21927 = getelementptr inbounds float* %tmp21926, i64 1
+ %tmp21928 = getelementptr inbounds float* %tmp21927, i64 1
+ %tmp21929 = getelementptr inbounds float* %tmp21928, i64 1
+ %tmp21930 = getelementptr inbounds float* %tmp21929, i64 1
+ %tmp21931 = getelementptr inbounds float* %tmp21930, i64 1
+ %tmp21932 = getelementptr inbounds float* %tmp21931, i64 1
+ %tmp21933 = getelementptr inbounds float* %tmp21932, i64 1
+ %tmp21934 = getelementptr inbounds float* %tmp21933, i64 1
+ %tmp21935 = getelementptr inbounds float* %tmp21934, i64 1
+ %tmp21936 = getelementptr inbounds float* %tmp21935, i64 1
+ %tmp21937 = getelementptr inbounds float* %tmp21936, i64 1
+ %tmp21938 = getelementptr inbounds float* %tmp21937, i64 1
+ %tmp21939 = getelementptr inbounds float* %tmp21938, i64 1
+ %tmp21940 = getelementptr inbounds float* %tmp21939, i64 1
+ %tmp21941 = getelementptr inbounds float* %tmp21940, i64 1
+ %tmp21942 = getelementptr inbounds float* %tmp21941, i64 1
+ %tmp21943 = getelementptr inbounds float* %tmp21942, i64 1
+ %tmp21944 = getelementptr inbounds float* %tmp21943, i64 1
+ %tmp21945 = getelementptr inbounds float* %tmp21944, i64 1
+ %tmp21946 = getelementptr inbounds float* %tmp21945, i64 1
+ %tmp21947 = getelementptr inbounds float* %tmp21946, i64 1
+ %tmp21948 = getelementptr inbounds float* %tmp21947, i64 1
+ %tmp21949 = getelementptr inbounds float* %tmp21948, i64 1
+ %tmp21950 = getelementptr inbounds float* %tmp21949, i64 1
+ %tmp21951 = getelementptr inbounds float* %tmp21950, i64 1
+ %tmp21952 = getelementptr inbounds float* %tmp21951, i64 1
+ %tmp21953 = getelementptr inbounds float* %tmp21952, i64 1
+ %tmp21954 = getelementptr inbounds float* %tmp21953, i64 1
+ %tmp21955 = getelementptr inbounds float* %tmp21954, i64 1
+ %tmp21956 = getelementptr inbounds float* %tmp21955, i64 1
+ %tmp21957 = getelementptr inbounds float* %tmp21956, i64 1
+ %tmp21958 = getelementptr inbounds float* %tmp21957, i64 1
+ %tmp21959 = getelementptr inbounds float* %tmp21958, i64 1
+ %tmp21960 = getelementptr inbounds float* %tmp21959, i64 1
+ %tmp21961 = getelementptr inbounds float* %tmp21960, i64 1
+ %tmp21962 = getelementptr inbounds float* %tmp21961, i64 1
+ %tmp21963 = getelementptr inbounds float* %tmp21962, i64 1
+ %tmp21964 = getelementptr inbounds float* %tmp21963, i64 1
+ %tmp21965 = getelementptr inbounds float* %tmp21964, i64 1
+ %tmp21966 = getelementptr inbounds float* %tmp21965, i64 1
+ %tmp21967 = getelementptr inbounds float* %tmp21966, i64 1
+ %tmp21968 = getelementptr inbounds float* %tmp21967, i64 1
+ %tmp21969 = getelementptr inbounds float* %tmp21968, i64 1
+ %tmp21970 = getelementptr inbounds float* %tmp21969, i64 1
+ %tmp21971 = getelementptr inbounds float* %tmp21970, i64 1
+ %tmp21972 = getelementptr inbounds float* %tmp21971, i64 1
+ %tmp21973 = getelementptr inbounds float* %tmp21972, i64 1
+ %tmp21974 = getelementptr inbounds float* %tmp21973, i64 1
+ %tmp21975 = getelementptr inbounds float* %tmp21974, i64 1
+ %tmp21976 = getelementptr inbounds float* %tmp21975, i64 1
+ %tmp21977 = getelementptr inbounds float* %tmp21976, i64 1
+ %tmp21978 = getelementptr inbounds float* %tmp21977, i64 1
+ %tmp21979 = getelementptr inbounds float* %tmp21978, i64 1
+ %tmp21980 = getelementptr inbounds float* %tmp21979, i64 1
+ %tmp21981 = getelementptr inbounds float* %tmp21980, i64 1
+ %tmp21982 = getelementptr inbounds float* %tmp21981, i64 1
+ %tmp21983 = getelementptr inbounds float* %tmp21982, i64 1
+ %tmp21984 = getelementptr inbounds float* %tmp21983, i64 1
+ %tmp21985 = getelementptr inbounds float* %tmp21984, i64 1
+ %tmp21986 = getelementptr inbounds float* %tmp21985, i64 1
+ %tmp21987 = getelementptr inbounds float* %tmp21986, i64 1
+ %tmp21988 = getelementptr inbounds float* %tmp21987, i64 1
+ %tmp21989 = getelementptr inbounds float* %tmp21988, i64 1
+ %tmp21990 = getelementptr inbounds float* %tmp21989, i64 1
+ %tmp21991 = getelementptr inbounds float* %tmp21990, i64 1
+ %tmp21992 = getelementptr inbounds float* %tmp21991, i64 1
+ %tmp21993 = getelementptr inbounds float* %tmp21992, i64 1
+ %tmp21994 = getelementptr inbounds float* %tmp21993, i64 1
+ %tmp21995 = getelementptr inbounds float* %tmp21994, i64 1
+ %tmp21996 = getelementptr inbounds float* %tmp21995, i64 1
+ %tmp21997 = getelementptr inbounds float* %tmp21996, i64 1
+ %tmp21998 = getelementptr inbounds float* %tmp21997, i64 1
+ %tmp21999 = getelementptr inbounds float* %tmp21998, i64 1
+ %tmp22000 = getelementptr inbounds float* %tmp21999, i64 1
+ %tmp22001 = getelementptr inbounds float* %tmp22000, i64 1
+ %tmp22002 = getelementptr inbounds float* %tmp22001, i64 1
+ %tmp22003 = getelementptr inbounds float* %tmp22002, i64 1
+ %tmp22004 = getelementptr inbounds float* %tmp22003, i64 1
+ %tmp22005 = getelementptr inbounds float* %tmp22004, i64 1
+ %tmp22006 = getelementptr inbounds float* %tmp22005, i64 1
+ %tmp22007 = getelementptr inbounds float* %tmp22006, i64 1
+ %tmp22008 = getelementptr inbounds float* %tmp22007, i64 1
+ %tmp22009 = getelementptr inbounds float* %tmp22008, i64 1
+ %tmp22010 = getelementptr inbounds float* %tmp22009, i64 1
+ %tmp22011 = getelementptr inbounds float* %tmp22010, i64 1
+ %tmp22012 = getelementptr inbounds float* %tmp22011, i64 1
+ %tmp22013 = getelementptr inbounds float* %tmp22012, i64 1
+ %tmp22014 = getelementptr inbounds float* %tmp22013, i64 1
+ %tmp22015 = getelementptr inbounds float* %tmp22014, i64 1
+ %tmp22016 = getelementptr inbounds float* %tmp22015, i64 1
+ %tmp22017 = getelementptr inbounds float* %tmp22016, i64 1
+ %tmp22018 = getelementptr inbounds float* %tmp22017, i64 1
+ %tmp22019 = getelementptr inbounds float* %tmp22018, i64 1
+ %tmp22020 = getelementptr inbounds float* %tmp22019, i64 1
+ %tmp22021 = getelementptr inbounds float* %tmp22020, i64 1
+ %tmp22022 = getelementptr inbounds float* %tmp22021, i64 1
+ %tmp22023 = getelementptr inbounds float* %tmp22022, i64 1
+ %tmp22024 = getelementptr inbounds float* %tmp22023, i64 1
+ %tmp22025 = getelementptr inbounds float* %tmp22024, i64 1
+ %tmp22026 = getelementptr inbounds float* %tmp22025, i64 1
+ %tmp22027 = getelementptr inbounds float* %tmp22026, i64 1
+ %tmp22028 = getelementptr inbounds float* %tmp22027, i64 1
+ %tmp22029 = getelementptr inbounds float* %tmp22028, i64 1
+ %tmp22030 = getelementptr inbounds float* %tmp22029, i64 1
+ %tmp22031 = getelementptr inbounds float* %tmp22030, i64 1
+ %tmp22032 = getelementptr inbounds float* %tmp22031, i64 1
+ %tmp22033 = getelementptr inbounds float* %tmp22032, i64 1
+ %tmp22034 = getelementptr inbounds float* %tmp22033, i64 1
+ %tmp22035 = getelementptr inbounds float* %tmp22034, i64 1
+ %tmp22036 = getelementptr inbounds float* %tmp22035, i64 1
+ %tmp22037 = getelementptr inbounds float* %tmp22036, i64 1
+ %tmp22038 = getelementptr inbounds float* %tmp22037, i64 1
+ %tmp22039 = getelementptr inbounds float* %tmp22038, i64 1
+ %tmp22040 = getelementptr inbounds float* %tmp22039, i64 1
+ %tmp22041 = getelementptr inbounds float* %tmp22040, i64 1
+ %tmp22042 = getelementptr inbounds float* %tmp22041, i64 1
+ %tmp22043 = getelementptr inbounds float* %tmp22042, i64 1
+ %tmp22044 = getelementptr inbounds float* %tmp22043, i64 1
+ %tmp22045 = getelementptr inbounds float* %tmp22044, i64 1
+ %tmp22046 = getelementptr inbounds float* %tmp22045, i64 1
+ %tmp22047 = getelementptr inbounds float* %tmp22046, i64 1
+ %tmp22048 = getelementptr inbounds float* %tmp22047, i64 1
+ %tmp22049 = getelementptr inbounds float* %tmp22048, i64 1
+ %tmp22050 = getelementptr inbounds float* %tmp22049, i64 1
+ %tmp22051 = getelementptr inbounds float* %tmp22050, i64 1
+ %tmp22052 = getelementptr inbounds float* %tmp22051, i64 1
+ %tmp22053 = getelementptr inbounds float* %tmp22052, i64 1
+ %tmp22054 = getelementptr inbounds float* %tmp22053, i64 1
+ %tmp22055 = getelementptr inbounds float* %tmp22054, i64 1
+ %tmp22056 = getelementptr inbounds float* %tmp22055, i64 1
+ %tmp22057 = getelementptr inbounds float* %tmp22056, i64 1
+ %tmp22058 = getelementptr inbounds float* %tmp22057, i64 1
+ %tmp22059 = getelementptr inbounds float* %tmp22058, i64 1
+ %tmp22060 = getelementptr inbounds float* %tmp22059, i64 1
+ %tmp22061 = getelementptr inbounds float* %tmp22060, i64 1
+ %tmp22062 = getelementptr inbounds float* %tmp22061, i64 1
+ %tmp22063 = getelementptr inbounds float* %tmp22062, i64 1
+ %tmp22064 = getelementptr inbounds float* %tmp22063, i64 1
+ %tmp22065 = getelementptr inbounds float* %tmp22064, i64 1
+ %tmp22066 = getelementptr inbounds float* %tmp22065, i64 1
+ %tmp22067 = getelementptr inbounds float* %tmp22066, i64 1
+ %tmp22068 = getelementptr inbounds float* %tmp22067, i64 1
+ %tmp22069 = getelementptr inbounds float* %tmp22068, i64 1
+ %tmp22070 = getelementptr inbounds float* %tmp22069, i64 1
+ %tmp22071 = getelementptr inbounds float* %tmp22070, i64 1
+ %tmp22072 = getelementptr inbounds float* %tmp22071, i64 1
+ %tmp22073 = getelementptr inbounds float* %tmp22072, i64 1
+ %tmp22074 = getelementptr inbounds float* %tmp22073, i64 1
+ %tmp22075 = getelementptr inbounds float* %tmp22074, i64 1
+ %tmp22076 = getelementptr inbounds float* %tmp22075, i64 1
+ %tmp22077 = getelementptr inbounds float* %tmp22076, i64 1
+ %tmp22078 = getelementptr inbounds float* %tmp22077, i64 1
+ %tmp22079 = getelementptr inbounds float* %tmp22078, i64 1
+ %tmp22080 = getelementptr inbounds float* %tmp22079, i64 1
+ %tmp22081 = getelementptr inbounds float* %tmp22080, i64 1
+ %tmp22082 = getelementptr inbounds float* %tmp22081, i64 1
+ %tmp22083 = getelementptr inbounds float* %tmp22082, i64 1
+ %tmp22084 = getelementptr inbounds float* %tmp22083, i64 1
+ %tmp22085 = getelementptr inbounds float* %tmp22084, i64 1
+ %tmp22086 = getelementptr inbounds float* %tmp22085, i64 1
+ %tmp22087 = getelementptr inbounds float* %tmp22086, i64 1
+ %tmp22088 = getelementptr inbounds float* %tmp22087, i64 1
+ %tmp22089 = getelementptr inbounds float* %tmp22088, i64 1
+ %tmp22090 = getelementptr inbounds float* %tmp22089, i64 1
+ %tmp22091 = getelementptr inbounds float* %tmp22090, i64 1
+ %tmp22092 = getelementptr inbounds float* %tmp22091, i64 1
+ %tmp22093 = getelementptr inbounds float* %tmp22092, i64 1
+ %tmp22094 = getelementptr inbounds float* %tmp22093, i64 1
+ %tmp22095 = getelementptr inbounds float* %tmp22094, i64 1
+ %tmp22096 = getelementptr inbounds float* %tmp22095, i64 1
+ %tmp22097 = getelementptr inbounds float* %tmp22096, i64 1
+ %tmp22098 = getelementptr inbounds float* %tmp22097, i64 1
+ %tmp22099 = getelementptr inbounds float* %tmp22098, i64 1
+ %tmp22100 = getelementptr inbounds float* %tmp22099, i64 1
+ %tmp22101 = getelementptr inbounds float* %tmp22100, i64 1
+ %tmp22102 = getelementptr inbounds float* %tmp22101, i64 1
+ %tmp22103 = getelementptr inbounds float* %tmp22102, i64 1
+ %tmp22104 = getelementptr inbounds float* %tmp22103, i64 1
+ %tmp22105 = getelementptr inbounds float* %tmp22104, i64 1
+ %tmp22106 = getelementptr inbounds float* %tmp22105, i64 1
+ %tmp22107 = getelementptr inbounds float* %tmp22106, i64 1
+ %tmp22108 = getelementptr inbounds float* %tmp22107, i64 1
+ %tmp22109 = getelementptr inbounds float* %tmp22108, i64 1
+ %tmp22110 = getelementptr inbounds float* %tmp22109, i64 1
+ %tmp22111 = getelementptr inbounds float* %tmp22110, i64 1
+ %tmp22112 = getelementptr inbounds float* %tmp22111, i64 1
+ %tmp22113 = getelementptr inbounds float* %tmp22112, i64 1
+ %tmp22114 = getelementptr inbounds float* %tmp22113, i64 1
+ %tmp22115 = getelementptr inbounds float* %tmp22114, i64 1
+ %tmp22116 = getelementptr inbounds float* %tmp22115, i64 1
+ %tmp22117 = getelementptr inbounds float* %tmp22116, i64 1
+ %tmp22118 = getelementptr inbounds float* %tmp22117, i64 1
+ %tmp22119 = getelementptr inbounds float* %tmp22118, i64 1
+ %tmp22120 = getelementptr inbounds float* %tmp22119, i64 1
+ %tmp22121 = getelementptr inbounds float* %tmp22120, i64 1
+ %tmp22122 = getelementptr inbounds float* %tmp22121, i64 1
+ %tmp22123 = getelementptr inbounds float* %tmp22122, i64 1
+ %tmp22124 = getelementptr inbounds float* %tmp22123, i64 1
+ %tmp22125 = getelementptr inbounds float* %tmp22124, i64 1
+ %tmp22126 = getelementptr inbounds float* %tmp22125, i64 1
+ %tmp22127 = getelementptr inbounds float* %tmp22126, i64 1
+ %tmp22128 = getelementptr inbounds float* %tmp22127, i64 1
+ %tmp22129 = getelementptr inbounds float* %tmp22128, i64 1
+ %tmp22130 = getelementptr inbounds float* %tmp22129, i64 1
+ %tmp22131 = getelementptr inbounds float* %tmp22130, i64 1
+ %tmp22132 = getelementptr inbounds float* %tmp22131, i64 1
+ %tmp22133 = getelementptr inbounds float* %tmp22132, i64 1
+ %tmp22134 = getelementptr inbounds float* %tmp22133, i64 1
+ %tmp22135 = getelementptr inbounds float* %tmp22134, i64 1
+ %tmp22136 = getelementptr inbounds float* %tmp22135, i64 1
+ %tmp22137 = getelementptr inbounds float* %tmp22136, i64 1
+ %tmp22138 = getelementptr inbounds float* %tmp22137, i64 1
+ %tmp22139 = getelementptr inbounds float* %tmp22138, i64 1
+ %tmp22140 = getelementptr inbounds float* %tmp22139, i64 1
+ %tmp22141 = getelementptr inbounds float* %tmp22140, i64 1
+ %tmp22142 = getelementptr inbounds float* %tmp22141, i64 1
+ %tmp22143 = getelementptr inbounds float* %tmp22142, i64 1
+ %tmp22144 = getelementptr inbounds float* %tmp22143, i64 1
+ %tmp22145 = getelementptr inbounds float* %tmp22144, i64 1
+ %tmp22146 = getelementptr inbounds float* %tmp22145, i64 1
+ %tmp22147 = getelementptr inbounds float* %tmp22146, i64 1
+ %tmp22148 = getelementptr inbounds float* %tmp22147, i64 1
+ %tmp22149 = getelementptr inbounds float* %tmp22148, i64 1
+ %tmp22150 = getelementptr inbounds float* %tmp22149, i64 1
+ %tmp22151 = getelementptr inbounds float* %tmp22150, i64 1
+ %tmp22152 = getelementptr inbounds float* %tmp22151, i64 1
+ %tmp22153 = getelementptr inbounds float* %tmp22152, i64 1
+ %tmp22154 = getelementptr inbounds float* %tmp22153, i64 1
+ %tmp22155 = getelementptr inbounds float* %tmp22154, i64 1
+ %tmp22156 = getelementptr inbounds float* %tmp22155, i64 1
+ %tmp22157 = getelementptr inbounds float* %tmp22156, i64 1
+ %tmp22158 = getelementptr inbounds float* %tmp22157, i64 1
+ %tmp22159 = getelementptr inbounds float* %tmp22158, i64 1
+ %tmp22160 = getelementptr inbounds float* %tmp22159, i64 1
+ %tmp22161 = getelementptr inbounds float* %tmp22160, i64 1
+ %tmp22162 = getelementptr inbounds float* %tmp22161, i64 1
+ %tmp22163 = getelementptr inbounds float* %tmp22162, i64 1
+ %tmp22164 = getelementptr inbounds float* %tmp22163, i64 1
+ %tmp22165 = getelementptr inbounds float* %tmp22164, i64 1
+ %tmp22166 = getelementptr inbounds float* %tmp22165, i64 1
+ %tmp22167 = getelementptr inbounds float* %tmp22166, i64 1
+ %tmp22168 = getelementptr inbounds float* %tmp22167, i64 1
+ %tmp22169 = getelementptr inbounds float* %tmp22168, i64 1
+ %tmp22170 = getelementptr inbounds float* %tmp22169, i64 1
+ %tmp22171 = getelementptr inbounds float* %tmp22170, i64 1
+ %tmp22172 = getelementptr inbounds float* %tmp22171, i64 1
+ %tmp22173 = getelementptr inbounds float* %tmp22172, i64 1
+ %tmp22174 = getelementptr inbounds float* %tmp22173, i64 1
+ %tmp22175 = getelementptr inbounds float* %tmp22174, i64 1
+ %tmp22176 = getelementptr inbounds float* %tmp22175, i64 1
+ %tmp22177 = getelementptr inbounds float* %tmp22176, i64 1
+ %tmp22178 = getelementptr inbounds float* %tmp22177, i64 1
+ %tmp22179 = getelementptr inbounds float* %tmp22178, i64 1
+ %tmp22180 = getelementptr inbounds float* %tmp22179, i64 1
+ %tmp22181 = getelementptr inbounds float* %tmp22180, i64 1
+ %tmp22182 = getelementptr inbounds float* %tmp22181, i64 1
+ %tmp22183 = getelementptr inbounds float* %tmp22182, i64 1
+ %tmp22184 = getelementptr inbounds float* %tmp22183, i64 1
+ %tmp22185 = getelementptr inbounds float* %tmp22184, i64 1
+ %tmp22186 = getelementptr inbounds float* %tmp22185, i64 1
+ %tmp22187 = getelementptr inbounds float* %tmp22186, i64 1
+ %tmp22188 = getelementptr inbounds float* %tmp22187, i64 1
+ %tmp22189 = getelementptr inbounds float* %tmp22188, i64 1
+ %tmp22190 = getelementptr inbounds float* %tmp22189, i64 1
+ %tmp22191 = getelementptr inbounds float* %tmp22190, i64 1
+ %tmp22192 = getelementptr inbounds float* %tmp22191, i64 1
+ %tmp22193 = getelementptr inbounds float* %tmp22192, i64 1
+ %tmp22194 = getelementptr inbounds float* %tmp22193, i64 1
+ %tmp22195 = getelementptr inbounds float* %tmp22194, i64 1
+ %tmp22196 = getelementptr inbounds float* %tmp22195, i64 1
+ %tmp22197 = getelementptr inbounds float* %tmp22196, i64 1
+ %tmp22198 = getelementptr inbounds float* %tmp22197, i64 1
+ %tmp22199 = getelementptr inbounds float* %tmp22198, i64 1
+ %tmp22200 = getelementptr inbounds float* %tmp22199, i64 1
+ %tmp22201 = getelementptr inbounds float* %tmp22200, i64 1
+ %tmp22202 = getelementptr inbounds float* %tmp22201, i64 1
+ %tmp22203 = getelementptr inbounds float* %tmp22202, i64 1
+ %tmp22204 = getelementptr inbounds float* %tmp22203, i64 1
+ %tmp22205 = getelementptr inbounds float* %tmp22204, i64 1
+ %tmp22206 = getelementptr inbounds float* %tmp22205, i64 1
+ %tmp22207 = getelementptr inbounds float* %tmp22206, i64 1
+ %tmp22208 = getelementptr inbounds float* %tmp22207, i64 1
+ %tmp22209 = getelementptr inbounds float* %tmp22208, i64 1
+ %tmp22210 = getelementptr inbounds float* %tmp22209, i64 1
+ %tmp22211 = getelementptr inbounds float* %tmp22210, i64 1
+ %tmp22212 = getelementptr inbounds float* %tmp22211, i64 1
+ %tmp22213 = getelementptr inbounds float* %tmp22212, i64 1
+ %tmp22214 = getelementptr inbounds float* %tmp22213, i64 1
+ %tmp22215 = getelementptr inbounds float* %tmp22214, i64 1
+ %tmp22216 = getelementptr inbounds float* %tmp22215, i64 1
+ %tmp22217 = getelementptr inbounds float* %tmp22216, i64 1
+ %tmp22218 = getelementptr inbounds float* %tmp22217, i64 1
+ %tmp22219 = getelementptr inbounds float* %tmp22218, i64 1
+ %tmp22220 = getelementptr inbounds float* %tmp22219, i64 1
+ %tmp22221 = getelementptr inbounds float* %tmp22220, i64 1
+ %tmp22222 = getelementptr inbounds float* %tmp22221, i64 1
+ %tmp22223 = getelementptr inbounds float* %tmp22222, i64 1
+ %tmp22224 = getelementptr inbounds float* %tmp22223, i64 1
+ %tmp22225 = getelementptr inbounds float* %tmp22224, i64 1
+ %tmp22226 = getelementptr inbounds float* %tmp22225, i64 1
+ %tmp22227 = getelementptr inbounds float* %tmp22226, i64 1
+ %tmp22228 = getelementptr inbounds float* %tmp22227, i64 1
+ %tmp22229 = getelementptr inbounds float* %tmp22228, i64 1
+ %tmp22230 = getelementptr inbounds float* %tmp22229, i64 1
+ %tmp22231 = getelementptr inbounds float* %tmp22230, i64 1
+ %tmp22232 = getelementptr inbounds float* %tmp22231, i64 1
+ %tmp22233 = getelementptr inbounds float* %tmp22232, i64 1
+ %tmp22234 = getelementptr inbounds float* %tmp22233, i64 1
+ %tmp22235 = getelementptr inbounds float* %tmp22234, i64 1
+ %tmp22236 = getelementptr inbounds float* %tmp22235, i64 1
+ %tmp22237 = getelementptr inbounds float* %tmp22236, i64 1
+ %tmp22238 = getelementptr inbounds float* %tmp22237, i64 1
+ %tmp22239 = getelementptr inbounds float* %tmp22238, i64 1
+ %tmp22240 = getelementptr inbounds float* %tmp22239, i64 1
+ %tmp22241 = getelementptr inbounds float* %tmp22240, i64 1
+ %tmp22242 = getelementptr inbounds float* %tmp22241, i64 1
+ %tmp22243 = getelementptr inbounds float* %tmp22242, i64 1
+ %tmp22244 = getelementptr inbounds float* %tmp22243, i64 1
+ %tmp22245 = getelementptr inbounds float* %tmp22244, i64 1
+ %tmp22246 = getelementptr inbounds float* %tmp22245, i64 1
+ %tmp22247 = getelementptr inbounds float* %tmp22246, i64 1
+ %tmp22248 = getelementptr inbounds float* %tmp22247, i64 1
+ %tmp22249 = getelementptr inbounds float* %tmp22248, i64 1
+ %tmp22250 = getelementptr inbounds float* %tmp22249, i64 1
+ %tmp22251 = getelementptr inbounds float* %tmp22250, i64 1
+ %tmp22252 = getelementptr inbounds float* %tmp22251, i64 1
+ %tmp22253 = getelementptr inbounds float* %tmp22252, i64 1
+ %tmp22254 = getelementptr inbounds float* %tmp22253, i64 1
+ %tmp22255 = getelementptr inbounds float* %tmp22254, i64 1
+ %tmp22256 = getelementptr inbounds float* %tmp22255, i64 1
+ %tmp22257 = getelementptr inbounds float* %tmp22256, i64 1
+ %tmp22258 = getelementptr inbounds float* %tmp22257, i64 1
+ %tmp22259 = getelementptr inbounds float* %tmp22258, i64 1
+ %tmp22260 = getelementptr inbounds float* %tmp22259, i64 1
+ %tmp22261 = getelementptr inbounds float* %tmp22260, i64 1
+ %tmp22262 = getelementptr inbounds float* %tmp22261, i64 1
+ %tmp22263 = getelementptr inbounds float* %tmp22262, i64 1
+ %tmp22264 = getelementptr inbounds float* %tmp22263, i64 1
+ %tmp22265 = getelementptr inbounds float* %tmp22264, i64 1
+ %tmp22266 = getelementptr inbounds float* %tmp22265, i64 1
+ %tmp22267 = getelementptr inbounds float* %tmp22266, i64 1
+ %tmp22268 = getelementptr inbounds float* %tmp22267, i64 1
+ %tmp22269 = getelementptr inbounds float* %tmp22268, i64 1
+ %tmp22270 = getelementptr inbounds float* %tmp22269, i64 1
+ %tmp22271 = getelementptr inbounds float* %tmp22270, i64 1
+ %tmp22272 = getelementptr inbounds float* %tmp22271, i64 1
+ %tmp22273 = getelementptr inbounds float* %tmp22272, i64 1
+ %tmp22274 = getelementptr inbounds float* %tmp22273, i64 1
+ %tmp22275 = getelementptr inbounds float* %tmp22274, i64 1
+ %tmp22276 = getelementptr inbounds float* %tmp22275, i64 1
+ %tmp22277 = getelementptr inbounds float* %tmp22276, i64 1
+ %tmp22278 = getelementptr inbounds float* %tmp22277, i64 1
+ %tmp22279 = getelementptr inbounds float* %tmp22278, i64 1
+ %tmp22280 = getelementptr inbounds float* %tmp22279, i64 1
+ %tmp22281 = getelementptr inbounds float* %tmp22280, i64 1
+ %tmp22282 = getelementptr inbounds float* %tmp22281, i64 1
+ %tmp22283 = getelementptr inbounds float* %tmp22282, i64 1
+ %tmp22284 = getelementptr inbounds float* %tmp22283, i64 1
+ %tmp22285 = getelementptr inbounds float* %tmp22284, i64 1
+ %tmp22286 = getelementptr inbounds float* %tmp22285, i64 1
+ %tmp22287 = getelementptr inbounds float* %tmp22286, i64 1
+ %tmp22288 = getelementptr inbounds float* %tmp22287, i64 1
+ %tmp22289 = getelementptr inbounds float* %tmp22288, i64 1
+ %tmp22290 = getelementptr inbounds float* %tmp22289, i64 1
+ %tmp22291 = getelementptr inbounds float* %tmp22290, i64 1
+ %tmp22292 = getelementptr inbounds float* %tmp22291, i64 1
+ %tmp22293 = getelementptr inbounds float* %tmp22292, i64 1
+ %tmp22294 = getelementptr inbounds float* %tmp22293, i64 1
+ %tmp22295 = getelementptr inbounds float* %tmp22294, i64 1
+ %tmp22296 = getelementptr inbounds float* %tmp22295, i64 1
+ %tmp22297 = getelementptr inbounds float* %tmp22296, i64 1
+ %tmp22298 = getelementptr inbounds float* %tmp22297, i64 1
+ %tmp22299 = getelementptr inbounds float* %tmp22298, i64 1
+ %tmp22300 = getelementptr inbounds float* %tmp22299, i64 1
+ %tmp22301 = getelementptr inbounds float* %tmp22300, i64 1
+ %tmp22302 = getelementptr inbounds float* %tmp22301, i64 1
+ %tmp22303 = getelementptr inbounds float* %tmp22302, i64 1
+ %tmp22304 = getelementptr inbounds float* %tmp22303, i64 1
+ %tmp22305 = getelementptr inbounds float* %tmp22304, i64 1
+ %tmp22306 = getelementptr inbounds float* %tmp22305, i64 1
+ %tmp22307 = getelementptr inbounds float* %tmp22306, i64 1
+ %tmp22308 = getelementptr inbounds float* %tmp22307, i64 1
+ %tmp22309 = getelementptr inbounds float* %tmp22308, i64 1
+ %tmp22310 = getelementptr inbounds float* %tmp22309, i64 1
+ %tmp22311 = getelementptr inbounds float* %tmp22310, i64 1
+ %tmp22312 = getelementptr inbounds float* %tmp22311, i64 1
+ %tmp22313 = getelementptr inbounds float* %tmp22312, i64 1
+ %tmp22314 = getelementptr inbounds float* %tmp22313, i64 1
+ %tmp22315 = getelementptr inbounds float* %tmp22314, i64 1
+ %tmp22316 = getelementptr inbounds float* %tmp22315, i64 1
+ %tmp22317 = getelementptr inbounds float* %tmp22316, i64 1
+ %tmp22318 = getelementptr inbounds float* %tmp22317, i64 1
+ %tmp22319 = getelementptr inbounds float* %tmp22318, i64 1
+ %tmp22320 = getelementptr inbounds float* %tmp22319, i64 1
+ %tmp22321 = getelementptr inbounds float* %tmp22320, i64 1
+ %tmp22322 = getelementptr inbounds float* %tmp22321, i64 1
+ %tmp22323 = getelementptr inbounds float* %tmp22322, i64 1
+ %tmp22324 = getelementptr inbounds float* %tmp22323, i64 1
+ %tmp22325 = getelementptr inbounds float* %tmp22324, i64 1
+ %tmp22326 = getelementptr inbounds float* %tmp22325, i64 1
+ %tmp22327 = getelementptr inbounds float* %tmp22326, i64 1
+ %tmp22328 = getelementptr inbounds float* %tmp22327, i64 1
+ %tmp22329 = getelementptr inbounds float* %tmp22328, i64 1
+ %tmp22330 = getelementptr inbounds float* %tmp22329, i64 1
+ %tmp22331 = getelementptr inbounds float* %tmp22330, i64 1
+ %tmp22332 = getelementptr inbounds float* %tmp22331, i64 1
+ %tmp22333 = getelementptr inbounds float* %tmp22332, i64 1
+ %tmp22334 = getelementptr inbounds float* %tmp22333, i64 1
+ %tmp22335 = getelementptr inbounds float* %tmp22334, i64 1
+ %tmp22336 = getelementptr inbounds float* %tmp22335, i64 1
+ %tmp22337 = getelementptr inbounds float* %tmp22336, i64 1
+ %tmp22338 = getelementptr inbounds float* %tmp22337, i64 1
+ %tmp22339 = getelementptr inbounds float* %tmp22338, i64 1
+ %tmp22340 = getelementptr inbounds float* %tmp22339, i64 1
+ %tmp22341 = getelementptr inbounds float* %tmp22340, i64 1
+ %tmp22342 = getelementptr inbounds float* %tmp22341, i64 1
+ %tmp22343 = getelementptr inbounds float* %tmp22342, i64 1
+ %tmp22344 = getelementptr inbounds float* %tmp22343, i64 1
+ %tmp22345 = getelementptr inbounds float* %tmp22344, i64 1
+ %tmp22346 = getelementptr inbounds float* %tmp22345, i64 1
+ %tmp22347 = getelementptr inbounds float* %tmp22346, i64 1
+ %tmp22348 = getelementptr inbounds float* %tmp22347, i64 1
+ %tmp22349 = getelementptr inbounds float* %tmp22348, i64 1
+ %tmp22350 = getelementptr inbounds float* %tmp22349, i64 1
+ %tmp22351 = getelementptr inbounds float* %tmp22350, i64 1
+ %tmp22352 = getelementptr inbounds float* %tmp22351, i64 1
+ %tmp22353 = getelementptr inbounds float* %tmp22352, i64 1
+ %tmp22354 = getelementptr inbounds float* %tmp22353, i64 1
+ %tmp22355 = getelementptr inbounds float* %tmp22354, i64 1
+ %tmp22356 = getelementptr inbounds float* %tmp22355, i64 1
+ %tmp22357 = getelementptr inbounds float* %tmp22356, i64 1
+ %tmp22358 = getelementptr inbounds float* %tmp22357, i64 1
+ %tmp22359 = getelementptr inbounds float* %tmp22358, i64 1
+ %tmp22360 = getelementptr inbounds float* %tmp22359, i64 1
+ %tmp22361 = getelementptr inbounds float* %tmp22360, i64 1
+ %tmp22362 = getelementptr inbounds float* %tmp22361, i64 1
+ %tmp22363 = getelementptr inbounds float* %tmp22362, i64 1
+ %tmp22364 = getelementptr inbounds float* %tmp22363, i64 1
+ %tmp22365 = getelementptr inbounds float* %tmp22364, i64 1
+ %tmp22366 = getelementptr inbounds float* %tmp22365, i64 1
+ %tmp22367 = getelementptr inbounds float* %tmp22366, i64 1
+ %tmp22368 = getelementptr inbounds float* %tmp22367, i64 1
+ %tmp22369 = getelementptr inbounds float* %tmp22368, i64 1
+ %tmp22370 = getelementptr inbounds float* %tmp22369, i64 1
+ %tmp22371 = getelementptr inbounds float* %tmp22370, i64 1
+ %tmp22372 = getelementptr inbounds float* %tmp22371, i64 1
+ %tmp22373 = getelementptr inbounds float* %tmp22372, i64 1
+ %tmp22374 = getelementptr inbounds float* %tmp22373, i64 1
+ %tmp22375 = getelementptr inbounds float* %tmp22374, i64 1
+ %tmp22376 = getelementptr inbounds float* %tmp22375, i64 1
+ %tmp22377 = getelementptr inbounds float* %tmp22376, i64 1
+ %tmp22378 = getelementptr inbounds float* %tmp22377, i64 1
+ %tmp22379 = getelementptr inbounds float* %tmp22378, i64 1
+ %tmp22380 = getelementptr inbounds float* %tmp22379, i64 1
+ %tmp22381 = getelementptr inbounds float* %tmp22380, i64 1
+ %tmp22382 = getelementptr inbounds float* %tmp22381, i64 1
+ %tmp22383 = getelementptr inbounds float* %tmp22382, i64 1
+ %tmp22384 = getelementptr inbounds float* %tmp22383, i64 1
+ %tmp22385 = getelementptr inbounds float* %tmp22384, i64 1
+ %tmp22386 = getelementptr inbounds float* %tmp22385, i64 1
+ %tmp22387 = getelementptr inbounds float* %tmp22386, i64 1
+ %tmp22388 = getelementptr inbounds float* %tmp22387, i64 1
+ %tmp22389 = getelementptr inbounds float* %tmp22388, i64 1
+ %tmp22390 = getelementptr inbounds float* %tmp22389, i64 1
+ %tmp22391 = getelementptr inbounds float* %tmp22390, i64 1
+ %tmp22392 = getelementptr inbounds float* %tmp22391, i64 1
+ %tmp22393 = getelementptr inbounds float* %tmp22392, i64 1
+ %tmp22394 = getelementptr inbounds float* %tmp22393, i64 1
+ %tmp22395 = getelementptr inbounds float* %tmp22394, i64 1
+ %tmp22396 = getelementptr inbounds float* %tmp22395, i64 1
+ %tmp22397 = getelementptr inbounds float* %tmp22396, i64 1
+ %tmp22398 = getelementptr inbounds float* %tmp22397, i64 1
+ %tmp22399 = getelementptr inbounds float* %tmp22398, i64 1
+ %tmp22400 = getelementptr inbounds float* %tmp22399, i64 1
+ %tmp22401 = getelementptr inbounds float* %tmp22400, i64 1
+ %tmp22402 = getelementptr inbounds float* %tmp22401, i64 1
+ %tmp22403 = getelementptr inbounds float* %tmp22402, i64 1
+ %tmp22404 = getelementptr inbounds float* %tmp22403, i64 1
+ %tmp22405 = getelementptr inbounds float* %tmp22404, i64 1
+ %tmp22406 = getelementptr inbounds float* %tmp22405, i64 1
+ %tmp22407 = getelementptr inbounds float* %tmp22406, i64 1
+ %tmp22408 = getelementptr inbounds float* %tmp22407, i64 1
+ %tmp22409 = getelementptr inbounds float* %tmp22408, i64 1
+ %tmp22410 = getelementptr inbounds float* %tmp22409, i64 1
+ %tmp22411 = getelementptr inbounds float* %tmp22410, i64 1
+ %tmp22412 = getelementptr inbounds float* %tmp22411, i64 1
+ %tmp22413 = getelementptr inbounds float* %tmp22412, i64 1
+ %tmp22414 = getelementptr inbounds float* %tmp22413, i64 1
+ %tmp22415 = getelementptr inbounds float* %tmp22414, i64 1
+ %tmp22416 = getelementptr inbounds float* %tmp22415, i64 1
+ %tmp22417 = getelementptr inbounds float* %tmp22416, i64 1
+ %tmp22418 = getelementptr inbounds float* %tmp22417, i64 1
+ %tmp22419 = getelementptr inbounds float* %tmp22418, i64 1
+ %tmp22420 = getelementptr inbounds float* %tmp22419, i64 1
+ %tmp22421 = getelementptr inbounds float* %tmp22420, i64 1
+ %tmp22422 = getelementptr inbounds float* %tmp22421, i64 1
+ %tmp22423 = getelementptr inbounds float* %tmp22422, i64 1
+ %tmp22424 = getelementptr inbounds float* %tmp22423, i64 1
+ %tmp22425 = getelementptr inbounds float* %tmp22424, i64 1
+ %tmp22426 = getelementptr inbounds float* %tmp22425, i64 1
+ %tmp22427 = getelementptr inbounds float* %tmp22426, i64 1
+ %tmp22428 = getelementptr inbounds float* %tmp22427, i64 1
+ %tmp22429 = getelementptr inbounds float* %tmp22428, i64 1
+ %tmp22430 = getelementptr inbounds float* %tmp22429, i64 1
+ %tmp22431 = getelementptr inbounds float* %tmp22430, i64 1
+ %tmp22432 = getelementptr inbounds float* %tmp22431, i64 1
+ %tmp22433 = getelementptr inbounds float* %tmp22432, i64 1
+ %tmp22434 = getelementptr inbounds float* %tmp22433, i64 1
+ %tmp22435 = getelementptr inbounds float* %tmp22434, i64 1
+ %tmp22436 = getelementptr inbounds float* %tmp22435, i64 1
+ %tmp22437 = getelementptr inbounds float* %tmp22436, i64 1
+ %tmp22438 = getelementptr inbounds float* %tmp22437, i64 1
+ %tmp22439 = getelementptr inbounds float* %tmp22438, i64 1
+ %tmp22440 = getelementptr inbounds float* %tmp22439, i64 1
+ %tmp22441 = getelementptr inbounds float* %tmp22440, i64 1
+ %tmp22442 = getelementptr inbounds float* %tmp22441, i64 1
+ %tmp22443 = getelementptr inbounds float* %tmp22442, i64 1
+ %tmp22444 = getelementptr inbounds float* %tmp22443, i64 1
+ %tmp22445 = getelementptr inbounds float* %tmp22444, i64 1
+ %tmp22446 = getelementptr inbounds float* %tmp22445, i64 1
+ %tmp22447 = getelementptr inbounds float* %tmp22446, i64 1
+ %tmp22448 = getelementptr inbounds float* %tmp22447, i64 1
+ %tmp22449 = getelementptr inbounds float* %tmp22448, i64 1
+ %tmp22450 = getelementptr inbounds float* %tmp22449, i64 1
+ %tmp22451 = getelementptr inbounds float* %tmp22450, i64 1
+ %tmp22452 = getelementptr inbounds float* %tmp22451, i64 1
+ %tmp22453 = getelementptr inbounds float* %tmp22452, i64 1
+ %tmp22454 = getelementptr inbounds float* %tmp22453, i64 1
+ %tmp22455 = getelementptr inbounds float* %tmp22454, i64 1
+ %tmp22456 = getelementptr inbounds float* %tmp22455, i64 1
+ %tmp22457 = getelementptr inbounds float* %tmp22456, i64 1
+ %tmp22458 = getelementptr inbounds float* %tmp22457, i64 1
+ %tmp22459 = getelementptr inbounds float* %tmp22458, i64 1
+ %tmp22460 = getelementptr inbounds float* %tmp22459, i64 1
+ %tmp22461 = getelementptr inbounds float* %tmp22460, i64 1
+ %tmp22462 = getelementptr inbounds float* %tmp22461, i64 1
+ %tmp22463 = getelementptr inbounds float* %tmp22462, i64 1
+ %tmp22464 = getelementptr inbounds float* %tmp22463, i64 1
+ %tmp22465 = getelementptr inbounds float* %tmp22464, i64 1
+ %tmp22466 = getelementptr inbounds float* %tmp22465, i64 1
+ %tmp22467 = getelementptr inbounds float* %tmp22466, i64 1
+ %tmp22468 = getelementptr inbounds float* %tmp22467, i64 1
+ %tmp22469 = getelementptr inbounds float* %tmp22468, i64 1
+ %tmp22470 = getelementptr inbounds float* %tmp22469, i64 1
+ %tmp22471 = getelementptr inbounds float* %tmp22470, i64 1
+ %tmp22472 = getelementptr inbounds float* %tmp22471, i64 1
+ %tmp22473 = getelementptr inbounds float* %tmp22472, i64 1
+ %tmp22474 = getelementptr inbounds float* %tmp22473, i64 1
+ %tmp22475 = getelementptr inbounds float* %tmp22474, i64 1
+ %tmp22476 = getelementptr inbounds float* %tmp22475, i64 1
+ %tmp22477 = getelementptr inbounds float* %tmp22476, i64 1
+ %tmp22478 = getelementptr inbounds float* %tmp22477, i64 1
+ %tmp22479 = getelementptr inbounds float* %tmp22478, i64 1
+ %tmp22480 = getelementptr inbounds float* %tmp22479, i64 1
+ %tmp22481 = getelementptr inbounds float* %tmp22480, i64 1
+ %tmp22482 = getelementptr inbounds float* %tmp22481, i64 1
+ %tmp22483 = getelementptr inbounds float* %tmp22482, i64 1
+ %tmp22484 = getelementptr inbounds float* %tmp22483, i64 1
+ %tmp22485 = getelementptr inbounds float* %tmp22484, i64 1
+ %tmp22486 = getelementptr inbounds float* %tmp22485, i64 1
+ %tmp22487 = getelementptr inbounds float* %tmp22486, i64 1
+ %tmp22488 = getelementptr inbounds float* %tmp22487, i64 1
+ %tmp22489 = getelementptr inbounds float* %tmp22488, i64 1
+ %tmp22490 = getelementptr inbounds float* %tmp22489, i64 1
+ %tmp22491 = getelementptr inbounds float* %tmp22490, i64 1
+ %tmp22492 = getelementptr inbounds float* %tmp22491, i64 1
+ %tmp22493 = getelementptr inbounds float* %tmp22492, i64 1
+ %tmp22494 = getelementptr inbounds float* %tmp22493, i64 1
+ %tmp22495 = getelementptr inbounds float* %tmp22494, i64 1
+ %tmp22496 = getelementptr inbounds float* %tmp22495, i64 1
+ %tmp22497 = getelementptr inbounds float* %tmp22496, i64 1
+ %tmp22498 = getelementptr inbounds float* %tmp22497, i64 1
+ %tmp22499 = getelementptr inbounds float* %tmp22498, i64 1
+ %tmp22500 = getelementptr inbounds float* %tmp22499, i64 1
+ %tmp22501 = getelementptr inbounds float* %tmp22500, i64 1
+ %tmp22502 = getelementptr inbounds float* %tmp22501, i64 1
+ %tmp22503 = getelementptr inbounds float* %tmp22502, i64 1
+ %tmp22504 = getelementptr inbounds float* %tmp22503, i64 1
+ %tmp22505 = getelementptr inbounds float* %tmp22504, i64 1
+ %tmp22506 = getelementptr inbounds float* %tmp22505, i64 1
+ %tmp22507 = getelementptr inbounds float* %tmp22506, i64 1
+ %tmp22508 = getelementptr inbounds float* %tmp22507, i64 1
+ %tmp22509 = getelementptr inbounds float* %tmp22508, i64 1
+ %tmp22510 = getelementptr inbounds float* %tmp22509, i64 1
+ %tmp22511 = getelementptr inbounds float* %tmp22510, i64 1
+ %tmp22512 = getelementptr inbounds float* %tmp22511, i64 1
+ %tmp22513 = getelementptr inbounds float* %tmp22512, i64 1
+ %tmp22514 = getelementptr inbounds float* %tmp22513, i64 1
+ %tmp22515 = getelementptr inbounds float* %tmp22514, i64 1
+ %tmp22516 = getelementptr inbounds float* %tmp22515, i64 1
+ %tmp22517 = getelementptr inbounds float* %tmp22516, i64 1
+ %tmp22518 = getelementptr inbounds float* %tmp22517, i64 1
+ %tmp22519 = getelementptr inbounds float* %tmp22518, i64 1
+ %tmp22520 = getelementptr inbounds float* %tmp22519, i64 1
+ %tmp22521 = getelementptr inbounds float* %tmp22520, i64 1
+ %tmp22522 = getelementptr inbounds float* %tmp22521, i64 1
+ %tmp22523 = getelementptr inbounds float* %tmp22522, i64 1
+ %tmp22524 = getelementptr inbounds float* %tmp22523, i64 1
+ %tmp22525 = getelementptr inbounds float* %tmp22524, i64 1
+ %tmp22526 = getelementptr inbounds float* %tmp22525, i64 1
+ %tmp22527 = getelementptr inbounds float* %tmp22526, i64 1
+ %tmp22528 = getelementptr inbounds float* %tmp22527, i64 1
+ %tmp22529 = getelementptr inbounds float* %tmp22528, i64 1
+ %tmp22530 = getelementptr inbounds float* %tmp22529, i64 1
+ %tmp22531 = getelementptr inbounds float* %tmp22530, i64 1
+ %tmp22532 = getelementptr inbounds float* %tmp22531, i64 1
+ %tmp22533 = getelementptr inbounds float* %tmp22532, i64 1
+ %tmp22534 = getelementptr inbounds float* %tmp22533, i64 1
+ %tmp22535 = getelementptr inbounds float* %tmp22534, i64 1
+ %tmp22536 = getelementptr inbounds float* %tmp22535, i64 1
+ %tmp22537 = getelementptr inbounds float* %tmp22536, i64 1
+ %tmp22538 = getelementptr inbounds float* %tmp22537, i64 1
+ %tmp22539 = getelementptr inbounds float* %tmp22538, i64 1
+ %tmp22540 = getelementptr inbounds float* %tmp22539, i64 1
+ %tmp22541 = getelementptr inbounds float* %tmp22540, i64 1
+ %tmp22542 = getelementptr inbounds float* %tmp22541, i64 1
+ %tmp22543 = getelementptr inbounds float* %tmp22542, i64 1
+ %tmp22544 = getelementptr inbounds float* %tmp22543, i64 1
+ %tmp22545 = getelementptr inbounds float* %tmp22544, i64 1
+ %tmp22546 = getelementptr inbounds float* %tmp22545, i64 1
+ %tmp22547 = getelementptr inbounds float* %tmp22546, i64 1
+ %tmp22548 = getelementptr inbounds float* %tmp22547, i64 1
+ %tmp22549 = getelementptr inbounds float* %tmp22548, i64 1
+ %tmp22550 = getelementptr inbounds float* %tmp22549, i64 1
+ %tmp22551 = getelementptr inbounds float* %tmp22550, i64 1
+ %tmp22552 = getelementptr inbounds float* %tmp22551, i64 1
+ %tmp22553 = getelementptr inbounds float* %tmp22552, i64 1
+ %tmp22554 = getelementptr inbounds float* %tmp22553, i64 1
+ %tmp22555 = getelementptr inbounds float* %tmp22554, i64 1
+ %tmp22556 = getelementptr inbounds float* %tmp22555, i64 1
+ %tmp22557 = getelementptr inbounds float* %tmp22556, i64 1
+ %tmp22558 = getelementptr inbounds float* %tmp22557, i64 1
+ %tmp22559 = getelementptr inbounds float* %tmp22558, i64 1
+ %tmp22560 = getelementptr inbounds float* %tmp22559, i64 1
+ %tmp22561 = getelementptr inbounds float* %tmp22560, i64 1
+ %tmp22562 = getelementptr inbounds float* %tmp22561, i64 1
+ %tmp22563 = getelementptr inbounds float* %tmp22562, i64 1
+ %tmp22564 = getelementptr inbounds float* %tmp22563, i64 1
+ %tmp22565 = getelementptr inbounds float* %tmp22564, i64 1
+ %tmp22566 = getelementptr inbounds float* %tmp22565, i64 1
+ %tmp22567 = getelementptr inbounds float* %tmp22566, i64 1
+ %tmp22568 = getelementptr inbounds float* %tmp22567, i64 1
+ %tmp22569 = getelementptr inbounds float* %tmp22568, i64 1
+ %tmp22570 = getelementptr inbounds float* %tmp22569, i64 1
+ %tmp22571 = getelementptr inbounds float* %tmp22570, i64 1
+ %tmp22572 = getelementptr inbounds float* %tmp22571, i64 1
+ %tmp22573 = getelementptr inbounds float* %tmp22572, i64 1
+ %tmp22574 = getelementptr inbounds float* %tmp22573, i64 1
+ %tmp22575 = getelementptr inbounds float* %tmp22574, i64 1
+ %tmp22576 = getelementptr inbounds float* %tmp22575, i64 1
+ %tmp22577 = getelementptr inbounds float* %tmp22576, i64 1
+ %tmp22578 = getelementptr inbounds float* %tmp22577, i64 1
+ %tmp22579 = getelementptr inbounds float* %tmp22578, i64 1
+ %tmp22580 = getelementptr inbounds float* %tmp22579, i64 1
+ %tmp22581 = getelementptr inbounds float* %tmp22580, i64 1
+ %tmp22582 = getelementptr inbounds float* %tmp22581, i64 1
+ %tmp22583 = getelementptr inbounds float* %tmp22582, i64 1
+ %tmp22584 = getelementptr inbounds float* %tmp22583, i64 1
+ %tmp22585 = getelementptr inbounds float* %tmp22584, i64 1
+ %tmp22586 = getelementptr inbounds float* %tmp22585, i64 1
+ %tmp22587 = getelementptr inbounds float* %tmp22586, i64 1
+ %tmp22588 = getelementptr inbounds float* %tmp22587, i64 1
+ %tmp22589 = getelementptr inbounds float* %tmp22588, i64 1
+ %tmp22590 = getelementptr inbounds float* %tmp22589, i64 1
+ %tmp22591 = getelementptr inbounds float* %tmp22590, i64 1
+ %tmp22592 = getelementptr inbounds float* %tmp22591, i64 1
+ %tmp22593 = getelementptr inbounds float* %tmp22592, i64 1
+ %tmp22594 = getelementptr inbounds float* %tmp22593, i64 1
+ %tmp22595 = getelementptr inbounds float* %tmp22594, i64 1
+ %tmp22596 = getelementptr inbounds float* %tmp22595, i64 1
+ %tmp22597 = getelementptr inbounds float* %tmp22596, i64 1
+ %tmp22598 = getelementptr inbounds float* %tmp22597, i64 1
+ %tmp22599 = getelementptr inbounds float* %tmp22598, i64 1
+ %tmp22600 = getelementptr inbounds float* %tmp22599, i64 1
+ %tmp22601 = getelementptr inbounds float* %tmp22600, i64 1
+ %tmp22602 = getelementptr inbounds float* %tmp22601, i64 1
+ %tmp22603 = getelementptr inbounds float* %tmp22602, i64 1
+ %tmp22604 = getelementptr inbounds float* %tmp22603, i64 1
+ %tmp22605 = getelementptr inbounds float* %tmp22604, i64 1
+ %tmp22606 = getelementptr inbounds float* %tmp22605, i64 1
+ %tmp22607 = getelementptr inbounds float* %tmp22606, i64 1
+ %tmp22608 = getelementptr inbounds float* %tmp22607, i64 1
+ %tmp22609 = getelementptr inbounds float* %tmp22608, i64 1
+ %tmp22610 = getelementptr inbounds float* %tmp22609, i64 1
+ %tmp22611 = getelementptr inbounds float* %tmp22610, i64 1
+ %tmp22612 = getelementptr inbounds float* %tmp22611, i64 1
+ %tmp22613 = getelementptr inbounds float* %tmp22612, i64 1
+ %tmp22614 = getelementptr inbounds float* %tmp22613, i64 1
+ %tmp22615 = getelementptr inbounds float* %tmp22614, i64 1
+ %tmp22616 = getelementptr inbounds float* %tmp22615, i64 1
+ %tmp22617 = getelementptr inbounds float* %tmp22616, i64 1
+ %tmp22618 = getelementptr inbounds float* %tmp22617, i64 1
+ %tmp22619 = getelementptr inbounds float* %tmp22618, i64 1
+ %tmp22620 = getelementptr inbounds float* %tmp22619, i64 1
+ %tmp22621 = getelementptr inbounds float* %tmp22620, i64 1
+ %tmp22622 = getelementptr inbounds float* %tmp22621, i64 1
+ %tmp22623 = getelementptr inbounds float* %tmp22622, i64 1
+ %tmp22624 = getelementptr inbounds float* %tmp22623, i64 1
+ %tmp22625 = getelementptr inbounds float* %tmp22624, i64 1
+ %tmp22626 = getelementptr inbounds float* %tmp22625, i64 1
+ %tmp22627 = getelementptr inbounds float* %tmp22626, i64 1
+ %tmp22628 = getelementptr inbounds float* %tmp22627, i64 1
+ %tmp22629 = getelementptr inbounds float* %tmp22628, i64 1
+ %tmp22630 = getelementptr inbounds float* %tmp22629, i64 1
+ %tmp22631 = getelementptr inbounds float* %tmp22630, i64 1
+ %tmp22632 = getelementptr inbounds float* %tmp22631, i64 1
+ %tmp22633 = getelementptr inbounds float* %tmp22632, i64 1
+ %tmp22634 = getelementptr inbounds float* %tmp22633, i64 1
+ %tmp22635 = getelementptr inbounds float* %tmp22634, i64 1
+ %tmp22636 = getelementptr inbounds float* %tmp22635, i64 1
+ %tmp22637 = getelementptr inbounds float* %tmp22636, i64 1
+ %tmp22638 = getelementptr inbounds float* %tmp22637, i64 1
+ %tmp22639 = getelementptr inbounds float* %tmp22638, i64 1
+ %tmp22640 = getelementptr inbounds float* %tmp22639, i64 1
+ %tmp22641 = getelementptr inbounds float* %tmp22640, i64 1
+ %tmp22642 = getelementptr inbounds float* %tmp22641, i64 1
+ %tmp22643 = getelementptr inbounds float* %tmp22642, i64 1
+ %tmp22644 = getelementptr inbounds float* %tmp22643, i64 1
+ %tmp22645 = getelementptr inbounds float* %tmp22644, i64 1
+ %tmp22646 = getelementptr inbounds float* %tmp22645, i64 1
+ %tmp22647 = getelementptr inbounds float* %tmp22646, i64 1
+ %tmp22648 = getelementptr inbounds float* %tmp22647, i64 1
+ %tmp22649 = getelementptr inbounds float* %tmp22648, i64 1
+ %tmp22650 = getelementptr inbounds float* %tmp22649, i64 1
+ %tmp22651 = getelementptr inbounds float* %tmp22650, i64 1
+ %tmp22652 = getelementptr inbounds float* %tmp22651, i64 1
+ %tmp22653 = getelementptr inbounds float* %tmp22652, i64 1
+ %tmp22654 = getelementptr inbounds float* %tmp22653, i64 1
+ %tmp22655 = getelementptr inbounds float* %tmp22654, i64 1
+ %tmp22656 = getelementptr inbounds float* %tmp22655, i64 1
+ %tmp22657 = getelementptr inbounds float* %tmp22656, i64 1
+ %tmp22658 = getelementptr inbounds float* %tmp22657, i64 1
+ %tmp22659 = getelementptr inbounds float* %tmp22658, i64 1
+ %tmp22660 = getelementptr inbounds float* %tmp22659, i64 1
+ %tmp22661 = getelementptr inbounds float* %tmp22660, i64 1
+ %tmp22662 = getelementptr inbounds float* %tmp22661, i64 1
+ %tmp22663 = getelementptr inbounds float* %tmp22662, i64 1
+ %tmp22664 = getelementptr inbounds float* %tmp22663, i64 1
+ %tmp22665 = getelementptr inbounds float* %tmp22664, i64 1
+ %tmp22666 = getelementptr inbounds float* %tmp22665, i64 1
+ %tmp22667 = getelementptr inbounds float* %tmp22666, i64 1
+ %tmp22668 = getelementptr inbounds float* %tmp22667, i64 1
+ %tmp22669 = getelementptr inbounds float* %tmp22668, i64 1
+ %tmp22670 = getelementptr inbounds float* %tmp22669, i64 1
+ %tmp22671 = getelementptr inbounds float* %tmp22670, i64 1
+ %tmp22672 = getelementptr inbounds float* %tmp22671, i64 1
+ %tmp22673 = getelementptr inbounds float* %tmp22672, i64 1
+ %tmp22674 = getelementptr inbounds float* %tmp22673, i64 1
+ %tmp22675 = getelementptr inbounds float* %tmp22674, i64 1
+ %tmp22676 = getelementptr inbounds float* %tmp22675, i64 1
+ %tmp22677 = getelementptr inbounds float* %tmp22676, i64 1
+ %tmp22678 = getelementptr inbounds float* %tmp22677, i64 1
+ %tmp22679 = getelementptr inbounds float* %tmp22678, i64 1
+ %tmp22680 = getelementptr inbounds float* %tmp22679, i64 1
+ %tmp22681 = getelementptr inbounds float* %tmp22680, i64 1
+ %tmp22682 = getelementptr inbounds float* %tmp22681, i64 1
+ %tmp22683 = getelementptr inbounds float* %tmp22682, i64 1
+ %tmp22684 = getelementptr inbounds float* %tmp22683, i64 1
+ %tmp22685 = getelementptr inbounds float* %tmp22684, i64 1
+ %tmp22686 = getelementptr inbounds float* %tmp22685, i64 1
+ %tmp22687 = getelementptr inbounds float* %tmp22686, i64 1
+ %tmp22688 = getelementptr inbounds float* %tmp22687, i64 1
+ %tmp22689 = getelementptr inbounds float* %tmp22688, i64 1
+ %tmp22690 = getelementptr inbounds float* %tmp22689, i64 1
+ %tmp22691 = getelementptr inbounds float* %tmp22690, i64 1
+ %tmp22692 = getelementptr inbounds float* %tmp22691, i64 1
+ %tmp22693 = getelementptr inbounds float* %tmp22692, i64 1
+ %tmp22694 = getelementptr inbounds float* %tmp22693, i64 1
+ %tmp22695 = getelementptr inbounds float* %tmp22694, i64 1
+ %tmp22696 = getelementptr inbounds float* %tmp22695, i64 1
+ %tmp22697 = getelementptr inbounds float* %tmp22696, i64 1
+ %tmp22698 = getelementptr inbounds float* %tmp22697, i64 1
+ %tmp22699 = getelementptr inbounds float* %tmp22698, i64 1
+ %tmp22700 = getelementptr inbounds float* %tmp22699, i64 1
+ %tmp22701 = getelementptr inbounds float* %tmp22700, i64 1
+ %tmp22702 = getelementptr inbounds float* %tmp22701, i64 1
+ %tmp22703 = getelementptr inbounds float* %tmp22702, i64 1
+ %tmp22704 = getelementptr inbounds float* %tmp22703, i64 1
+ %tmp22705 = getelementptr inbounds float* %tmp22704, i64 1
+ %tmp22706 = getelementptr inbounds float* %tmp22705, i64 1
+ %tmp22707 = getelementptr inbounds float* %tmp22706, i64 1
+ %tmp22708 = getelementptr inbounds float* %tmp22707, i64 1
+ %tmp22709 = getelementptr inbounds float* %tmp22708, i64 1
+ %tmp22710 = getelementptr inbounds float* %tmp22709, i64 1
+ %tmp22711 = getelementptr inbounds float* %tmp22710, i64 1
+ %tmp22712 = getelementptr inbounds float* %tmp22711, i64 1
+ %tmp22713 = getelementptr inbounds float* %tmp22712, i64 1
+ %tmp22714 = getelementptr inbounds float* %tmp22713, i64 1
+ %tmp22715 = getelementptr inbounds float* %tmp22714, i64 1
+ %tmp22716 = getelementptr inbounds float* %tmp22715, i64 1
+ %tmp22717 = getelementptr inbounds float* %tmp22716, i64 1
+ %tmp22718 = getelementptr inbounds float* %tmp22717, i64 1
+ %tmp22719 = getelementptr inbounds float* %tmp22718, i64 1
+ %tmp22720 = getelementptr inbounds float* %tmp22719, i64 1
+ %tmp22721 = getelementptr inbounds float* %tmp22720, i64 1
+ %tmp22722 = getelementptr inbounds float* %tmp22721, i64 1
+ %tmp22723 = getelementptr inbounds float* %tmp22722, i64 1
+ %tmp22724 = getelementptr inbounds float* %tmp22723, i64 1
+ %tmp22725 = getelementptr inbounds float* %tmp22724, i64 1
+ %tmp22726 = getelementptr inbounds float* %tmp22725, i64 1
+ %tmp22727 = getelementptr inbounds float* %tmp22726, i64 1
+ %tmp22728 = getelementptr inbounds float* %tmp22727, i64 1
+ %tmp22729 = getelementptr inbounds float* %tmp22728, i64 1
+ %tmp22730 = getelementptr inbounds float* %tmp22729, i64 1
+ %tmp22731 = getelementptr inbounds float* %tmp22730, i64 1
+ %tmp22732 = getelementptr inbounds float* %tmp22731, i64 1
+ %tmp22733 = getelementptr inbounds float* %tmp22732, i64 1
+ %tmp22734 = getelementptr inbounds float* %tmp22733, i64 1
+ %tmp22735 = getelementptr inbounds float* %tmp22734, i64 1
+ %tmp22736 = getelementptr inbounds float* %tmp22735, i64 1
+ %tmp22737 = getelementptr inbounds float* %tmp22736, i64 1
+ %tmp22738 = getelementptr inbounds float* %tmp22737, i64 1
+ %tmp22739 = getelementptr inbounds float* %tmp22738, i64 1
+ %tmp22740 = getelementptr inbounds float* %tmp22739, i64 1
+ %tmp22741 = getelementptr inbounds float* %tmp22740, i64 1
+ %tmp22742 = getelementptr inbounds float* %tmp22741, i64 1
+ %tmp22743 = getelementptr inbounds float* %tmp22742, i64 1
+ %tmp22744 = getelementptr inbounds float* %tmp22743, i64 1
+ %tmp22745 = getelementptr inbounds float* %tmp22744, i64 1
+ %tmp22746 = getelementptr inbounds float* %tmp22745, i64 1
+ %tmp22747 = getelementptr inbounds float* %tmp22746, i64 1
+ %tmp22748 = getelementptr inbounds float* %tmp22747, i64 1
+ %tmp22749 = getelementptr inbounds float* %tmp22748, i64 1
+ %tmp22750 = getelementptr inbounds float* %tmp22749, i64 1
+ %tmp22751 = getelementptr inbounds float* %tmp22750, i64 1
+ %tmp22752 = getelementptr inbounds float* %tmp22751, i64 1
+ %tmp22753 = getelementptr inbounds float* %tmp22752, i64 1
+ %tmp22754 = getelementptr inbounds float* %tmp22753, i64 1
+ %tmp22755 = getelementptr inbounds float* %tmp22754, i64 1
+ %tmp22756 = getelementptr inbounds float* %tmp22755, i64 1
+ %tmp22757 = getelementptr inbounds float* %tmp22756, i64 1
+ %tmp22758 = getelementptr inbounds float* %tmp22757, i64 1
+ %tmp22759 = getelementptr inbounds float* %tmp22758, i64 1
+ %tmp22760 = getelementptr inbounds float* %tmp22759, i64 1
+ %tmp22761 = getelementptr inbounds float* %tmp22760, i64 1
+ %tmp22762 = getelementptr inbounds float* %tmp22761, i64 1
+ %tmp22763 = getelementptr inbounds float* %tmp22762, i64 1
+ %tmp22764 = getelementptr inbounds float* %tmp22763, i64 1
+ %tmp22765 = getelementptr inbounds float* %tmp22764, i64 1
+ %tmp22766 = getelementptr inbounds float* %tmp22765, i64 1
+ %tmp22767 = getelementptr inbounds float* %tmp22766, i64 1
+ %tmp22768 = getelementptr inbounds float* %tmp22767, i64 1
+ %tmp22769 = getelementptr inbounds float* %tmp22768, i64 1
+ %tmp22770 = getelementptr inbounds float* %tmp22769, i64 1
+ %tmp22771 = getelementptr inbounds float* %tmp22770, i64 1
+ %tmp22772 = getelementptr inbounds float* %tmp22771, i64 1
+ %tmp22773 = getelementptr inbounds float* %tmp22772, i64 1
+ %tmp22774 = getelementptr inbounds float* %tmp22773, i64 1
+ %tmp22775 = getelementptr inbounds float* %tmp22774, i64 1
+ %tmp22776 = getelementptr inbounds float* %tmp22775, i64 1
+ %tmp22777 = getelementptr inbounds float* %tmp22776, i64 1
+ %tmp22778 = getelementptr inbounds float* %tmp22777, i64 1
+ %tmp22779 = getelementptr inbounds float* %tmp22778, i64 1
+ %tmp22780 = getelementptr inbounds float* %tmp22779, i64 1
+ %tmp22781 = getelementptr inbounds float* %tmp22780, i64 1
+ %tmp22782 = getelementptr inbounds float* %tmp22781, i64 1
+ %tmp22783 = getelementptr inbounds float* %tmp22782, i64 1
+ %tmp22784 = getelementptr inbounds float* %tmp22783, i64 1
+ %tmp22785 = getelementptr inbounds float* %tmp22784, i64 1
+ %tmp22786 = getelementptr inbounds float* %tmp22785, i64 1
+ %tmp22787 = getelementptr inbounds float* %tmp22786, i64 1
+ %tmp22788 = getelementptr inbounds float* %tmp22787, i64 1
+ %tmp22789 = getelementptr inbounds float* %tmp22788, i64 1
+ %tmp22790 = getelementptr inbounds float* %tmp22789, i64 1
+ %tmp22791 = getelementptr inbounds float* %tmp22790, i64 1
+ %tmp22792 = getelementptr inbounds float* %tmp22791, i64 1
+ %tmp22793 = getelementptr inbounds float* %tmp22792, i64 1
+ %tmp22794 = getelementptr inbounds float* %tmp22793, i64 1
+ %tmp22795 = getelementptr inbounds float* %tmp22794, i64 1
+ %tmp22796 = getelementptr inbounds float* %tmp22795, i64 1
+ %tmp22797 = getelementptr inbounds float* %tmp22796, i64 1
+ %tmp22798 = getelementptr inbounds float* %tmp22797, i64 1
+ %tmp22799 = getelementptr inbounds float* %tmp22798, i64 1
+ %tmp22800 = getelementptr inbounds float* %tmp22799, i64 1
+ %tmp22801 = getelementptr inbounds float* %tmp22800, i64 1
+ %tmp22802 = getelementptr inbounds float* %tmp22801, i64 1
+ %tmp22803 = getelementptr inbounds float* %tmp22802, i64 1
+ %tmp22804 = getelementptr inbounds float* %tmp22803, i64 1
+ %tmp22805 = getelementptr inbounds float* %tmp22804, i64 1
+ %tmp22806 = getelementptr inbounds float* %tmp22805, i64 1
+ %tmp22807 = getelementptr inbounds float* %tmp22806, i64 1
+ %tmp22808 = getelementptr inbounds float* %tmp22807, i64 1
+ %tmp22809 = getelementptr inbounds float* %tmp22808, i64 1
+ %tmp22810 = getelementptr inbounds float* %tmp22809, i64 1
+ %tmp22811 = getelementptr inbounds float* %tmp22810, i64 1
+ %tmp22812 = getelementptr inbounds float* %tmp22811, i64 1
+ %tmp22813 = getelementptr inbounds float* %tmp22812, i64 1
+ %tmp22814 = getelementptr inbounds float* %tmp22813, i64 1
+ %tmp22815 = getelementptr inbounds float* %tmp22814, i64 1
+ %tmp22816 = getelementptr inbounds float* %tmp22815, i64 1
+ %tmp22817 = getelementptr inbounds float* %tmp22816, i64 1
+ %tmp22818 = getelementptr inbounds float* %tmp22817, i64 1
+ %tmp22819 = getelementptr inbounds float* %tmp22818, i64 1
+ %tmp22820 = getelementptr inbounds float* %tmp22819, i64 1
+ %tmp22821 = getelementptr inbounds float* %tmp22820, i64 1
+ %tmp22822 = getelementptr inbounds float* %tmp22821, i64 1
+ %tmp22823 = getelementptr inbounds float* %tmp22822, i64 1
+ %tmp22824 = getelementptr inbounds float* %tmp22823, i64 1
+ %tmp22825 = getelementptr inbounds float* %tmp22824, i64 1
+ %tmp22826 = getelementptr inbounds float* %tmp22825, i64 1
+ %tmp22827 = getelementptr inbounds float* %tmp22826, i64 1
+ %tmp22828 = getelementptr inbounds float* %tmp22827, i64 1
+ %tmp22829 = getelementptr inbounds float* %tmp22828, i64 1
+ %tmp22830 = getelementptr inbounds float* %tmp22829, i64 1
+ %tmp22831 = getelementptr inbounds float* %tmp22830, i64 1
+ %tmp22832 = getelementptr inbounds float* %tmp22831, i64 1
+ %tmp22833 = getelementptr inbounds float* %tmp22832, i64 1
+ %tmp22834 = getelementptr inbounds float* %tmp22833, i64 1
+ %tmp22835 = getelementptr inbounds float* %tmp22834, i64 1
+ %tmp22836 = getelementptr inbounds float* %tmp22835, i64 1
+ %tmp22837 = getelementptr inbounds float* %tmp22836, i64 1
+ %tmp22838 = getelementptr inbounds float* %tmp22837, i64 1
+ %tmp22839 = getelementptr inbounds float* %tmp22838, i64 1
+ %tmp22840 = getelementptr inbounds float* %tmp22839, i64 1
+ %tmp22841 = getelementptr inbounds float* %tmp22840, i64 1
+ %tmp22842 = getelementptr inbounds float* %tmp22841, i64 1
+ %tmp22843 = getelementptr inbounds float* %tmp22842, i64 1
+ %tmp22844 = getelementptr inbounds float* %tmp22843, i64 1
+ %tmp22845 = getelementptr inbounds float* %tmp22844, i64 1
+ %tmp22846 = getelementptr inbounds float* %tmp22845, i64 1
+ %tmp22847 = getelementptr inbounds float* %tmp22846, i64 1
+ %tmp22848 = getelementptr inbounds float* %tmp22847, i64 1
+ %tmp22849 = getelementptr inbounds float* %tmp22848, i64 1
+ %tmp22850 = getelementptr inbounds float* %tmp22849, i64 1
+ %tmp22851 = getelementptr inbounds float* %tmp22850, i64 1
+ %tmp22852 = getelementptr inbounds float* %tmp22851, i64 1
+ %tmp22853 = getelementptr inbounds float* %tmp22852, i64 1
+ %tmp22854 = getelementptr inbounds float* %tmp22853, i64 1
+ %tmp22855 = getelementptr inbounds float* %tmp22854, i64 1
+ %tmp22856 = getelementptr inbounds float* %tmp22855, i64 1
+ %tmp22857 = getelementptr inbounds float* %tmp22856, i64 1
+ %tmp22858 = getelementptr inbounds float* %tmp22857, i64 1
+ %tmp22859 = getelementptr inbounds float* %tmp22858, i64 1
+ %tmp22860 = getelementptr inbounds float* %tmp22859, i64 1
+ %tmp22861 = getelementptr inbounds float* %tmp22860, i64 1
+ %tmp22862 = getelementptr inbounds float* %tmp22861, i64 1
+ %tmp22863 = getelementptr inbounds float* %tmp22862, i64 1
+ %tmp22864 = getelementptr inbounds float* %tmp22863, i64 1
+ %tmp22865 = getelementptr inbounds float* %tmp22864, i64 1
+ %tmp22866 = getelementptr inbounds float* %tmp22865, i64 1
+ %tmp22867 = getelementptr inbounds float* %tmp22866, i64 1
+ %tmp22868 = getelementptr inbounds float* %tmp22867, i64 1
+ %tmp22869 = getelementptr inbounds float* %tmp22868, i64 1
+ %tmp22870 = getelementptr inbounds float* %tmp22869, i64 1
+ %tmp22871 = getelementptr inbounds float* %tmp22870, i64 1
+ %tmp22872 = getelementptr inbounds float* %tmp22871, i64 1
+ %tmp22873 = getelementptr inbounds float* %tmp22872, i64 1
+ %tmp22874 = getelementptr inbounds float* %tmp22873, i64 1
+ %tmp22875 = getelementptr inbounds float* %tmp22874, i64 1
+ %tmp22876 = getelementptr inbounds float* %tmp22875, i64 1
+ %tmp22877 = getelementptr inbounds float* %tmp22876, i64 1
+ %tmp22878 = getelementptr inbounds float* %tmp22877, i64 1
+ %tmp22879 = getelementptr inbounds float* %tmp22878, i64 1
+ %tmp22880 = getelementptr inbounds float* %tmp22879, i64 1
+ %tmp22881 = getelementptr inbounds float* %tmp22880, i64 1
+ %tmp22882 = getelementptr inbounds float* %tmp22881, i64 1
+ %tmp22883 = getelementptr inbounds float* %tmp22882, i64 1
+ %tmp22884 = getelementptr inbounds float* %tmp22883, i64 1
+ %tmp22885 = getelementptr inbounds float* %tmp22884, i64 1
+ %tmp22886 = getelementptr inbounds float* %tmp22885, i64 1
+ %tmp22887 = getelementptr inbounds float* %tmp22886, i64 1
+ %tmp22888 = getelementptr inbounds float* %tmp22887, i64 1
+ %tmp22889 = getelementptr inbounds float* %tmp22888, i64 1
+ %tmp22890 = getelementptr inbounds float* %tmp22889, i64 1
+ %tmp22891 = getelementptr inbounds float* %tmp22890, i64 1
+ %tmp22892 = getelementptr inbounds float* %tmp22891, i64 1
+ %tmp22893 = getelementptr inbounds float* %tmp22892, i64 1
+ %tmp22894 = getelementptr inbounds float* %tmp22893, i64 1
+ %tmp22895 = getelementptr inbounds float* %tmp22894, i64 1
+ %tmp22896 = getelementptr inbounds float* %tmp22895, i64 1
+ %tmp22897 = getelementptr inbounds float* %tmp22896, i64 1
+ %tmp22898 = getelementptr inbounds float* %tmp22897, i64 1
+ %tmp22899 = getelementptr inbounds float* %tmp22898, i64 1
+ %tmp22900 = getelementptr inbounds float* %tmp22899, i64 1
+ %tmp22901 = getelementptr inbounds float* %tmp22900, i64 1
+ %tmp22902 = getelementptr inbounds float* %tmp22901, i64 1
+ %tmp22903 = getelementptr inbounds float* %tmp22902, i64 1
+ %tmp22904 = getelementptr inbounds float* %tmp22903, i64 1
+ %tmp22905 = getelementptr inbounds float* %tmp22904, i64 1
+ %tmp22906 = getelementptr inbounds float* %tmp22905, i64 1
+ %tmp22907 = getelementptr inbounds float* %tmp22906, i64 1
+ %tmp22908 = getelementptr inbounds float* %tmp22907, i64 1
+ %tmp22909 = getelementptr inbounds float* %tmp22908, i64 1
+ %tmp22910 = getelementptr inbounds float* %tmp22909, i64 1
+ %tmp22911 = getelementptr inbounds float* %tmp22910, i64 1
+ %tmp22912 = getelementptr inbounds float* %tmp22911, i64 1
+ %tmp22913 = getelementptr inbounds float* %tmp22912, i64 1
+ %tmp22914 = getelementptr inbounds float* %tmp22913, i64 1
+ %tmp22915 = getelementptr inbounds float* %tmp22914, i64 1
+ %tmp22916 = getelementptr inbounds float* %tmp22915, i64 1
+ %tmp22917 = getelementptr inbounds float* %tmp22916, i64 1
+ %tmp22918 = getelementptr inbounds float* %tmp22917, i64 1
+ %tmp22919 = getelementptr inbounds float* %tmp22918, i64 1
+ %tmp22920 = getelementptr inbounds float* %tmp22919, i64 1
+ %tmp22921 = getelementptr inbounds float* %tmp22920, i64 1
+ %tmp22922 = getelementptr inbounds float* %tmp22921, i64 1
+ %tmp22923 = getelementptr inbounds float* %tmp22922, i64 1
+ %tmp22924 = getelementptr inbounds float* %tmp22923, i64 1
+ %tmp22925 = getelementptr inbounds float* %tmp22924, i64 1
+ %tmp22926 = getelementptr inbounds float* %tmp22925, i64 1
+ %tmp22927 = getelementptr inbounds float* %tmp22926, i64 1
+ %tmp22928 = getelementptr inbounds float* %tmp22927, i64 1
+ %tmp22929 = getelementptr inbounds float* %tmp22928, i64 1
+ %tmp22930 = getelementptr inbounds float* %tmp22929, i64 1
+ %tmp22931 = getelementptr inbounds float* %tmp22930, i64 1
+ %tmp22932 = getelementptr inbounds float* %tmp22931, i64 1
+ %tmp22933 = getelementptr inbounds float* %tmp22932, i64 1
+ %tmp22934 = getelementptr inbounds float* %tmp22933, i64 1
+ %tmp22935 = getelementptr inbounds float* %tmp22934, i64 1
+ %tmp22936 = getelementptr inbounds float* %tmp22935, i64 1
+ %tmp22937 = getelementptr inbounds float* %tmp22936, i64 1
+ %tmp22938 = getelementptr inbounds float* %tmp22937, i64 1
+ %tmp22939 = getelementptr inbounds float* %tmp22938, i64 1
+ %tmp22940 = getelementptr inbounds float* %tmp22939, i64 1
+ %tmp22941 = getelementptr inbounds float* %tmp22940, i64 1
+ %tmp22942 = getelementptr inbounds float* %tmp22941, i64 1
+ %tmp22943 = getelementptr inbounds float* %tmp22942, i64 1
+ %tmp22944 = getelementptr inbounds float* %tmp22943, i64 1
+ %tmp22945 = getelementptr inbounds float* %tmp22944, i64 1
+ %tmp22946 = getelementptr inbounds float* %tmp22945, i64 1
+ %tmp22947 = getelementptr inbounds float* %tmp22946, i64 1
+ %tmp22948 = getelementptr inbounds float* %tmp22947, i64 1
+ %tmp22949 = getelementptr inbounds float* %tmp22948, i64 1
+ %tmp22950 = getelementptr inbounds float* %tmp22949, i64 1
+ %tmp22951 = getelementptr inbounds float* %tmp22950, i64 1
+ %tmp22952 = getelementptr inbounds float* %tmp22951, i64 1
+ %tmp22953 = getelementptr inbounds float* %tmp22952, i64 1
+ %tmp22954 = getelementptr inbounds float* %tmp22953, i64 1
+ %tmp22955 = getelementptr inbounds float* %tmp22954, i64 1
+ %tmp22956 = getelementptr inbounds float* %tmp22955, i64 1
+ %tmp22957 = getelementptr inbounds float* %tmp22956, i64 1
+ %tmp22958 = getelementptr inbounds float* %tmp22957, i64 1
+ %tmp22959 = getelementptr inbounds float* %tmp22958, i64 1
+ %tmp22960 = getelementptr inbounds float* %tmp22959, i64 1
+ %tmp22961 = getelementptr inbounds float* %tmp22960, i64 1
+ %tmp22962 = getelementptr inbounds float* %tmp22961, i64 1
+ %tmp22963 = getelementptr inbounds float* %tmp22962, i64 1
+ %tmp22964 = getelementptr inbounds float* %tmp22963, i64 1
+ %tmp22965 = getelementptr inbounds float* %tmp22964, i64 1
+ %tmp22966 = getelementptr inbounds float* %tmp22965, i64 1
+ %tmp22967 = getelementptr inbounds float* %tmp22966, i64 1
+ %tmp22968 = getelementptr inbounds float* %tmp22967, i64 1
+ %tmp22969 = getelementptr inbounds float* %tmp22968, i64 1
+ %tmp22970 = getelementptr inbounds float* %tmp22969, i64 1
+ %tmp22971 = getelementptr inbounds float* %tmp22970, i64 1
+ %tmp22972 = getelementptr inbounds float* %tmp22971, i64 1
+ %tmp22973 = getelementptr inbounds float* %tmp22972, i64 1
+ %tmp22974 = getelementptr inbounds float* %tmp22973, i64 1
+ %tmp22975 = getelementptr inbounds float* %tmp22974, i64 1
+ %tmp22976 = getelementptr inbounds float* %tmp22975, i64 1
+ %tmp22977 = getelementptr inbounds float* %tmp22976, i64 1
+ %tmp22978 = getelementptr inbounds float* %tmp22977, i64 1
+ %tmp22979 = getelementptr inbounds float* %tmp22978, i64 1
+ %tmp22980 = getelementptr inbounds float* %tmp22979, i64 1
+ %tmp22981 = getelementptr inbounds float* %tmp22980, i64 1
+ %tmp22982 = getelementptr inbounds float* %tmp22981, i64 1
+ %tmp22983 = getelementptr inbounds float* %tmp22982, i64 1
+ %tmp22984 = getelementptr inbounds float* %tmp22983, i64 1
+ %tmp22985 = getelementptr inbounds float* %tmp22984, i64 1
+ %tmp22986 = getelementptr inbounds float* %tmp22985, i64 1
+ %tmp22987 = getelementptr inbounds float* %tmp22986, i64 1
+ %tmp22988 = getelementptr inbounds float* %tmp22987, i64 1
+ %tmp22989 = getelementptr inbounds float* %tmp22988, i64 1
+ %tmp22990 = getelementptr inbounds float* %tmp22989, i64 1
+ %tmp22991 = getelementptr inbounds float* %tmp22990, i64 1
+ %tmp22992 = getelementptr inbounds float* %tmp22991, i64 1
+ %tmp22993 = getelementptr inbounds float* %tmp22992, i64 1
+ %tmp22994 = getelementptr inbounds float* %tmp22993, i64 1
+ %tmp22995 = getelementptr inbounds float* %tmp22994, i64 1
+ %tmp22996 = getelementptr inbounds float* %tmp22995, i64 1
+ %tmp22997 = getelementptr inbounds float* %tmp22996, i64 1
+ %tmp22998 = getelementptr inbounds float* %tmp22997, i64 1
+ %tmp22999 = getelementptr inbounds float* %tmp22998, i64 1
+ %tmp23000 = getelementptr inbounds float* %tmp22999, i64 1
+ %tmp23001 = getelementptr inbounds float* %tmp23000, i64 1
+ %tmp23002 = getelementptr inbounds float* %tmp23001, i64 1
+ %tmp23003 = getelementptr inbounds float* %tmp23002, i64 1
+ %tmp23004 = getelementptr inbounds float* %tmp23003, i64 1
+ %tmp23005 = getelementptr inbounds float* %tmp23004, i64 1
+ %tmp23006 = getelementptr inbounds float* %tmp23005, i64 1
+ %tmp23007 = getelementptr inbounds float* %tmp23006, i64 1
+ %tmp23008 = getelementptr inbounds float* %tmp23007, i64 1
+ %tmp23009 = getelementptr inbounds float* %tmp23008, i64 1
+ %tmp23010 = getelementptr inbounds float* %tmp23009, i64 1
+ %tmp23011 = getelementptr inbounds float* %tmp23010, i64 1
+ %tmp23012 = getelementptr inbounds float* %tmp23011, i64 1
+ %tmp23013 = getelementptr inbounds float* %tmp23012, i64 1
+ %tmp23014 = getelementptr inbounds float* %tmp23013, i64 1
+ %tmp23015 = getelementptr inbounds float* %tmp23014, i64 1
+ %tmp23016 = getelementptr inbounds float* %tmp23015, i64 1
+ %tmp23017 = getelementptr inbounds float* %tmp23016, i64 1
+ %tmp23018 = getelementptr inbounds float* %tmp23017, i64 1
+ %tmp23019 = getelementptr inbounds float* %tmp23018, i64 1
+ %tmp23020 = getelementptr inbounds float* %tmp23019, i64 1
+ %tmp23021 = getelementptr inbounds float* %tmp23020, i64 1
+ %tmp23022 = getelementptr inbounds float* %tmp23021, i64 1
+ %tmp23023 = getelementptr inbounds float* %tmp23022, i64 1
+ %tmp23024 = getelementptr inbounds float* %tmp23023, i64 1
+ %tmp23025 = getelementptr inbounds float* %tmp23024, i64 1
+ %tmp23026 = getelementptr inbounds float* %tmp23025, i64 1
+ %tmp23027 = getelementptr inbounds float* %tmp23026, i64 1
+ %tmp23028 = getelementptr inbounds float* %tmp23027, i64 1
+ %tmp23029 = getelementptr inbounds float* %tmp23028, i64 1
+ %tmp23030 = getelementptr inbounds float* %tmp23029, i64 1
+ %tmp23031 = getelementptr inbounds float* %tmp23030, i64 1
+ %tmp23032 = getelementptr inbounds float* %tmp23031, i64 1
+ %tmp23033 = getelementptr inbounds float* %tmp23032, i64 1
+ %tmp23034 = getelementptr inbounds float* %tmp23033, i64 1
+ %tmp23035 = getelementptr inbounds float* %tmp23034, i64 1
+ %tmp23036 = getelementptr inbounds float* %tmp23035, i64 1
+ %tmp23037 = getelementptr inbounds float* %tmp23036, i64 1
+ %tmp23038 = getelementptr inbounds float* %tmp23037, i64 1
+ %tmp23039 = getelementptr inbounds float* %tmp23038, i64 1
+ %tmp23040 = getelementptr inbounds float* %tmp23039, i64 1
+ %tmp23041 = getelementptr inbounds float* %tmp23040, i64 1
+ %tmp23042 = getelementptr inbounds float* %tmp23041, i64 1
+ %tmp23043 = getelementptr inbounds float* %tmp23042, i64 1
+ %tmp23044 = getelementptr inbounds float* %tmp23043, i64 1
+ %tmp23045 = getelementptr inbounds float* %tmp23044, i64 1
+ %tmp23046 = getelementptr inbounds float* %tmp23045, i64 1
+ %tmp23047 = getelementptr inbounds float* %tmp23046, i64 1
+ %tmp23048 = getelementptr inbounds float* %tmp23047, i64 1
+ %tmp23049 = getelementptr inbounds float* %tmp23048, i64 1
+ %tmp23050 = getelementptr inbounds float* %tmp23049, i64 1
+ %tmp23051 = getelementptr inbounds float* %tmp23050, i64 1
+ %tmp23052 = getelementptr inbounds float* %tmp23051, i64 1
+ %tmp23053 = getelementptr inbounds float* %tmp23052, i64 1
+ %tmp23054 = getelementptr inbounds float* %tmp23053, i64 1
+ %tmp23055 = getelementptr inbounds float* %tmp23054, i64 1
+ %tmp23056 = getelementptr inbounds float* %tmp23055, i64 1
+ %tmp23057 = getelementptr inbounds float* %tmp23056, i64 1
+ %tmp23058 = getelementptr inbounds float* %tmp23057, i64 1
+ %tmp23059 = getelementptr inbounds float* %tmp23058, i64 1
+ %tmp23060 = getelementptr inbounds float* %tmp23059, i64 1
+ %tmp23061 = getelementptr inbounds float* %tmp23060, i64 1
+ %tmp23062 = getelementptr inbounds float* %tmp23061, i64 1
+ %tmp23063 = getelementptr inbounds float* %tmp23062, i64 1
+ %tmp23064 = getelementptr inbounds float* %tmp23063, i64 1
+ %tmp23065 = getelementptr inbounds float* %tmp23064, i64 1
+ %tmp23066 = getelementptr inbounds float* %tmp23065, i64 1
+ %tmp23067 = getelementptr inbounds float* %tmp23066, i64 1
+ %tmp23068 = getelementptr inbounds float* %tmp23067, i64 1
+ %tmp23069 = getelementptr inbounds float* %tmp23068, i64 1
+ %tmp23070 = getelementptr inbounds float* %tmp23069, i64 1
+ %tmp23071 = getelementptr inbounds float* %tmp23070, i64 1
+ %tmp23072 = getelementptr inbounds float* %tmp23071, i64 1
+ %tmp23073 = getelementptr inbounds float* %tmp23072, i64 1
+ %tmp23074 = getelementptr inbounds float* %tmp23073, i64 1
+ %tmp23075 = getelementptr inbounds float* %tmp23074, i64 1
+ %tmp23076 = getelementptr inbounds float* %tmp23075, i64 1
+ %tmp23077 = getelementptr inbounds float* %tmp23076, i64 1
+ %tmp23078 = getelementptr inbounds float* %tmp23077, i64 1
+ %tmp23079 = getelementptr inbounds float* %tmp23078, i64 1
+ %tmp23080 = getelementptr inbounds float* %tmp23079, i64 1
+ %tmp23081 = getelementptr inbounds float* %tmp23080, i64 1
+ %tmp23082 = getelementptr inbounds float* %tmp23081, i64 1
+ %tmp23083 = getelementptr inbounds float* %tmp23082, i64 1
+ %tmp23084 = getelementptr inbounds float* %tmp23083, i64 1
+ %tmp23085 = getelementptr inbounds float* %tmp23084, i64 1
+ %tmp23086 = getelementptr inbounds float* %tmp23085, i64 1
+ %tmp23087 = getelementptr inbounds float* %tmp23086, i64 1
+ %tmp23088 = getelementptr inbounds float* %tmp23087, i64 1
+ %tmp23089 = getelementptr inbounds float* %tmp23088, i64 1
+ %tmp23090 = getelementptr inbounds float* %tmp23089, i64 1
+ %tmp23091 = getelementptr inbounds float* %tmp23090, i64 1
+ %tmp23092 = getelementptr inbounds float* %tmp23091, i64 1
+ %tmp23093 = getelementptr inbounds float* %tmp23092, i64 1
+ %tmp23094 = getelementptr inbounds float* %tmp23093, i64 1
+ %tmp23095 = getelementptr inbounds float* %tmp23094, i64 1
+ %tmp23096 = getelementptr inbounds float* %tmp23095, i64 1
+ %tmp23097 = getelementptr inbounds float* %tmp23096, i64 1
+ %tmp23098 = getelementptr inbounds float* %tmp23097, i64 1
+ %tmp23099 = getelementptr inbounds float* %tmp23098, i64 1
+ %tmp23100 = getelementptr inbounds float* %tmp23099, i64 1
+ %tmp23101 = getelementptr inbounds float* %tmp23100, i64 1
+ %tmp23102 = getelementptr inbounds float* %tmp23101, i64 1
+ %tmp23103 = getelementptr inbounds float* %tmp23102, i64 1
+ %tmp23104 = getelementptr inbounds float* %tmp23103, i64 1
+ %tmp23105 = getelementptr inbounds float* %tmp23104, i64 1
+ %tmp23106 = getelementptr inbounds float* %tmp23105, i64 1
+ %tmp23107 = getelementptr inbounds float* %tmp23106, i64 1
+ %tmp23108 = getelementptr inbounds float* %tmp23107, i64 1
+ %tmp23109 = getelementptr inbounds float* %tmp23108, i64 1
+ %tmp23110 = getelementptr inbounds float* %tmp23109, i64 1
+ %tmp23111 = getelementptr inbounds float* %tmp23110, i64 1
+ %tmp23112 = getelementptr inbounds float* %tmp23111, i64 1
+ %tmp23113 = getelementptr inbounds float* %tmp23112, i64 1
+ %tmp23114 = getelementptr inbounds float* %tmp23113, i64 1
+ %tmp23115 = getelementptr inbounds float* %tmp23114, i64 1
+ %tmp23116 = getelementptr inbounds float* %tmp23115, i64 1
+ %tmp23117 = getelementptr inbounds float* %tmp23116, i64 1
+ %tmp23118 = getelementptr inbounds float* %tmp23117, i64 1
+ %tmp23119 = getelementptr inbounds float* %tmp23118, i64 1
+ %tmp23120 = getelementptr inbounds float* %tmp23119, i64 1
+ %tmp23121 = getelementptr inbounds float* %tmp23120, i64 1
+ %tmp23122 = getelementptr inbounds float* %tmp23121, i64 1
+ %tmp23123 = getelementptr inbounds float* %tmp23122, i64 1
+ %tmp23124 = getelementptr inbounds float* %tmp23123, i64 1
+ %tmp23125 = getelementptr inbounds float* %tmp23124, i64 1
+ %tmp23126 = getelementptr inbounds float* %tmp23125, i64 1
+ %tmp23127 = getelementptr inbounds float* %tmp23126, i64 1
+ %tmp23128 = getelementptr inbounds float* %tmp23127, i64 1
+ %tmp23129 = getelementptr inbounds float* %tmp23128, i64 1
+ %tmp23130 = getelementptr inbounds float* %tmp23129, i64 1
+ %tmp23131 = getelementptr inbounds float* %tmp23130, i64 1
+ %tmp23132 = getelementptr inbounds float* %tmp23131, i64 1
+ %tmp23133 = getelementptr inbounds float* %tmp23132, i64 1
+ %tmp23134 = getelementptr inbounds float* %tmp23133, i64 1
+ %tmp23135 = getelementptr inbounds float* %tmp23134, i64 1
+ %tmp23136 = getelementptr inbounds float* %tmp23135, i64 1
+ %tmp23137 = getelementptr inbounds float* %tmp23136, i64 1
+ %tmp23138 = getelementptr inbounds float* %tmp23137, i64 1
+ %tmp23139 = getelementptr inbounds float* %tmp23138, i64 1
+ %tmp23140 = getelementptr inbounds float* %tmp23139, i64 1
+ %tmp23141 = getelementptr inbounds float* %tmp23140, i64 1
+ %tmp23142 = getelementptr inbounds float* %tmp23141, i64 1
+ %tmp23143 = getelementptr inbounds float* %tmp23142, i64 1
+ %tmp23144 = getelementptr inbounds float* %tmp23143, i64 1
+ %tmp23145 = getelementptr inbounds float* %tmp23144, i64 1
+ %tmp23146 = getelementptr inbounds float* %tmp23145, i64 1
+ %tmp23147 = getelementptr inbounds float* %tmp23146, i64 1
+ %tmp23148 = getelementptr inbounds float* %tmp23147, i64 1
+ %tmp23149 = getelementptr inbounds float* %tmp23148, i64 1
+ %tmp23150 = getelementptr inbounds float* %tmp23149, i64 1
+ %tmp23151 = getelementptr inbounds float* %tmp23150, i64 1
+ %tmp23152 = getelementptr inbounds float* %tmp23151, i64 1
+ %tmp23153 = getelementptr inbounds float* %tmp23152, i64 1
+ %tmp23154 = getelementptr inbounds float* %tmp23153, i64 1
+ %tmp23155 = getelementptr inbounds float* %tmp23154, i64 1
+ %tmp23156 = getelementptr inbounds float* %tmp23155, i64 1
+ %tmp23157 = getelementptr inbounds float* %tmp23156, i64 1
+ %tmp23158 = getelementptr inbounds float* %tmp23157, i64 1
+ %tmp23159 = getelementptr inbounds float* %tmp23158, i64 1
+ %tmp23160 = getelementptr inbounds float* %tmp23159, i64 1
+ %tmp23161 = getelementptr inbounds float* %tmp23160, i64 1
+ %tmp23162 = getelementptr inbounds float* %tmp23161, i64 1
+ %tmp23163 = getelementptr inbounds float* %tmp23162, i64 1
+ %tmp23164 = getelementptr inbounds float* %tmp23163, i64 1
+ %tmp23165 = getelementptr inbounds float* %tmp23164, i64 1
+ %tmp23166 = getelementptr inbounds float* %tmp23165, i64 1
+ %tmp23167 = getelementptr inbounds float* %tmp23166, i64 1
+ %tmp23168 = getelementptr inbounds float* %tmp23167, i64 1
+ %tmp23169 = getelementptr inbounds float* %tmp23168, i64 1
+ %tmp23170 = getelementptr inbounds float* %tmp23169, i64 1
+ %tmp23171 = getelementptr inbounds float* %tmp23170, i64 1
+ %tmp23172 = getelementptr inbounds float* %tmp23171, i64 1
+ %tmp23173 = getelementptr inbounds float* %tmp23172, i64 1
+ %tmp23174 = getelementptr inbounds float* %tmp23173, i64 1
+ %tmp23175 = getelementptr inbounds float* %tmp23174, i64 1
+ %tmp23176 = getelementptr inbounds float* %tmp23175, i64 1
+ %tmp23177 = getelementptr inbounds float* %tmp23176, i64 1
+ %tmp23178 = getelementptr inbounds float* %tmp23177, i64 1
+ %tmp23179 = getelementptr inbounds float* %tmp23178, i64 1
+ %tmp23180 = getelementptr inbounds float* %tmp23179, i64 1
+ %tmp23181 = getelementptr inbounds float* %tmp23180, i64 1
+ %tmp23182 = getelementptr inbounds float* %tmp23181, i64 1
+ %tmp23183 = getelementptr inbounds float* %tmp23182, i64 1
+ %tmp23184 = getelementptr inbounds float* %tmp23183, i64 1
+ %tmp23185 = getelementptr inbounds float* %tmp23184, i64 1
+ %tmp23186 = getelementptr inbounds float* %tmp23185, i64 1
+ %tmp23187 = getelementptr inbounds float* %tmp23186, i64 1
+ %tmp23188 = getelementptr inbounds float* %tmp23187, i64 1
+ %tmp23189 = getelementptr inbounds float* %tmp23188, i64 1
+ %tmp23190 = getelementptr inbounds float* %tmp23189, i64 1
+ %tmp23191 = getelementptr inbounds float* %tmp23190, i64 1
+ %tmp23192 = getelementptr inbounds float* %tmp23191, i64 1
+ %tmp23193 = getelementptr inbounds float* %tmp23192, i64 1
+ %tmp23194 = getelementptr inbounds float* %tmp23193, i64 1
+ %tmp23195 = getelementptr inbounds float* %tmp23194, i64 1
+ %tmp23196 = getelementptr inbounds float* %tmp23195, i64 1
+ %tmp23197 = getelementptr inbounds float* %tmp23196, i64 1
+ %tmp23198 = getelementptr inbounds float* %tmp23197, i64 1
+ %tmp23199 = getelementptr inbounds float* %tmp23198, i64 1
+ %tmp23200 = getelementptr inbounds float* %tmp23199, i64 1
+ %tmp23201 = getelementptr inbounds float* %tmp23200, i64 1
+ %tmp23202 = getelementptr inbounds float* %tmp23201, i64 1
+ %tmp23203 = getelementptr inbounds float* %tmp23202, i64 1
+ %tmp23204 = getelementptr inbounds float* %tmp23203, i64 1
+ %tmp23205 = getelementptr inbounds float* %tmp23204, i64 1
+ %tmp23206 = getelementptr inbounds float* %tmp23205, i64 1
+ %tmp23207 = getelementptr inbounds float* %tmp23206, i64 1
+ %tmp23208 = getelementptr inbounds float* %tmp23207, i64 1
+ %tmp23209 = getelementptr inbounds float* %tmp23208, i64 1
+ %tmp23210 = getelementptr inbounds float* %tmp23209, i64 1
+ %tmp23211 = getelementptr inbounds float* %tmp23210, i64 1
+ %tmp23212 = getelementptr inbounds float* %tmp23211, i64 1
+ %tmp23213 = getelementptr inbounds float* %tmp23212, i64 1
+ %tmp23214 = getelementptr inbounds float* %tmp23213, i64 1
+ %tmp23215 = getelementptr inbounds float* %tmp23214, i64 1
+ %tmp23216 = getelementptr inbounds float* %tmp23215, i64 1
+ %tmp23217 = getelementptr inbounds float* %tmp23216, i64 1
+ %tmp23218 = getelementptr inbounds float* %tmp23217, i64 1
+ %tmp23219 = getelementptr inbounds float* %tmp23218, i64 1
+ %tmp23220 = getelementptr inbounds float* %tmp23219, i64 1
+ %tmp23221 = getelementptr inbounds float* %tmp23220, i64 1
+ %tmp23222 = getelementptr inbounds float* %tmp23221, i64 1
+ %tmp23223 = getelementptr inbounds float* %tmp23222, i64 1
+ %tmp23224 = getelementptr inbounds float* %tmp23223, i64 1
+ %tmp23225 = getelementptr inbounds float* %tmp23224, i64 1
+ %tmp23226 = getelementptr inbounds float* %tmp23225, i64 1
+ %tmp23227 = getelementptr inbounds float* %tmp23226, i64 1
+ %tmp23228 = getelementptr inbounds float* %tmp23227, i64 1
+ %tmp23229 = getelementptr inbounds float* %tmp23228, i64 1
+ %tmp23230 = getelementptr inbounds float* %tmp23229, i64 1
+ %tmp23231 = getelementptr inbounds float* %tmp23230, i64 1
+ %tmp23232 = getelementptr inbounds float* %tmp23231, i64 1
+ %tmp23233 = getelementptr inbounds float* %tmp23232, i64 1
+ %tmp23234 = getelementptr inbounds float* %tmp23233, i64 1
+ %tmp23235 = getelementptr inbounds float* %tmp23234, i64 1
+ %tmp23236 = getelementptr inbounds float* %tmp23235, i64 1
+ %tmp23237 = getelementptr inbounds float* %tmp23236, i64 1
+ %tmp23238 = getelementptr inbounds float* %tmp23237, i64 1
+ %tmp23239 = getelementptr inbounds float* %tmp23238, i64 1
+ %tmp23240 = getelementptr inbounds float* %tmp23239, i64 1
+ %tmp23241 = getelementptr inbounds float* %tmp23240, i64 1
+ %tmp23242 = getelementptr inbounds float* %tmp23241, i64 1
+ %tmp23243 = getelementptr inbounds float* %tmp23242, i64 1
+ %tmp23244 = getelementptr inbounds float* %tmp23243, i64 1
+ %tmp23245 = getelementptr inbounds float* %tmp23244, i64 1
+ %tmp23246 = getelementptr inbounds float* %tmp23245, i64 1
+ %tmp23247 = getelementptr inbounds float* %tmp23246, i64 1
+ %tmp23248 = getelementptr inbounds float* %tmp23247, i64 1
+ %tmp23249 = getelementptr inbounds float* %tmp23248, i64 1
+ %tmp23250 = getelementptr inbounds float* %tmp23249, i64 1
+ %tmp23251 = getelementptr inbounds float* %tmp23250, i64 1
+ %tmp23252 = getelementptr inbounds float* %tmp23251, i64 1
+ %tmp23253 = getelementptr inbounds float* %tmp23252, i64 1
+ %tmp23254 = getelementptr inbounds float* %tmp23253, i64 1
+ %tmp23255 = getelementptr inbounds float* %tmp23254, i64 1
+ %tmp23256 = getelementptr inbounds float* %tmp23255, i64 1
+ %tmp23257 = getelementptr inbounds float* %tmp23256, i64 1
+ %tmp23258 = getelementptr inbounds float* %tmp23257, i64 1
+ %tmp23259 = getelementptr inbounds float* %tmp23258, i64 1
+ %tmp23260 = getelementptr inbounds float* %tmp23259, i64 1
+ %tmp23261 = getelementptr inbounds float* %tmp23260, i64 1
+ %tmp23262 = getelementptr inbounds float* %tmp23261, i64 1
+ %tmp23263 = getelementptr inbounds float* %tmp23262, i64 1
+ %tmp23264 = getelementptr inbounds float* %tmp23263, i64 1
+ %tmp23265 = getelementptr inbounds float* %tmp23264, i64 1
+ %tmp23266 = getelementptr inbounds float* %tmp23265, i64 1
+ %tmp23267 = getelementptr inbounds float* %tmp23266, i64 1
+ %tmp23268 = getelementptr inbounds float* %tmp23267, i64 1
+ %tmp23269 = getelementptr inbounds float* %tmp23268, i64 1
+ %tmp23270 = getelementptr inbounds float* %tmp23269, i64 1
+ %tmp23271 = getelementptr inbounds float* %tmp23270, i64 1
+ %tmp23272 = getelementptr inbounds float* %tmp23271, i64 1
+ %tmp23273 = getelementptr inbounds float* %tmp23272, i64 1
+ %tmp23274 = getelementptr inbounds float* %tmp23273, i64 1
+ %tmp23275 = getelementptr inbounds float* %tmp23274, i64 1
+ %tmp23276 = getelementptr inbounds float* %tmp23275, i64 1
+ %tmp23277 = getelementptr inbounds float* %tmp23276, i64 1
+ %tmp23278 = getelementptr inbounds float* %tmp23277, i64 1
+ %tmp23279 = getelementptr inbounds float* %tmp23278, i64 1
+ %tmp23280 = getelementptr inbounds float* %tmp23279, i64 1
+ %tmp23281 = getelementptr inbounds float* %tmp23280, i64 1
+ %tmp23282 = getelementptr inbounds float* %tmp23281, i64 1
+ %tmp23283 = getelementptr inbounds float* %tmp23282, i64 1
+ %tmp23284 = getelementptr inbounds float* %tmp23283, i64 1
+ %tmp23285 = getelementptr inbounds float* %tmp23284, i64 1
+ %tmp23286 = getelementptr inbounds float* %tmp23285, i64 1
+ %tmp23287 = getelementptr inbounds float* %tmp23286, i64 1
+ %tmp23288 = getelementptr inbounds float* %tmp23287, i64 1
+ %tmp23289 = getelementptr inbounds float* %tmp23288, i64 1
+ %tmp23290 = getelementptr inbounds float* %tmp23289, i64 1
+ %tmp23291 = getelementptr inbounds float* %tmp23290, i64 1
+ %tmp23292 = getelementptr inbounds float* %tmp23291, i64 1
+ %tmp23293 = getelementptr inbounds float* %tmp23292, i64 1
+ %tmp23294 = getelementptr inbounds float* %tmp23293, i64 1
+ %tmp23295 = getelementptr inbounds float* %tmp23294, i64 1
+ %tmp23296 = getelementptr inbounds float* %tmp23295, i64 1
+ %tmp23297 = getelementptr inbounds float* %tmp23296, i64 1
+ %tmp23298 = getelementptr inbounds float* %tmp23297, i64 1
+ %tmp23299 = getelementptr inbounds float* %tmp23298, i64 1
+ %tmp23300 = getelementptr inbounds float* %tmp23299, i64 1
+ %tmp23301 = getelementptr inbounds float* %tmp23300, i64 1
+ %tmp23302 = getelementptr inbounds float* %tmp23301, i64 1
+ %tmp23303 = getelementptr inbounds float* %tmp23302, i64 1
+ %tmp23304 = getelementptr inbounds float* %tmp23303, i64 1
+ %tmp23305 = getelementptr inbounds float* %tmp23304, i64 1
+ %tmp23306 = getelementptr inbounds float* %tmp23305, i64 1
+ %tmp23307 = getelementptr inbounds float* %tmp23306, i64 1
+ %tmp23308 = getelementptr inbounds float* %tmp23307, i64 1
+ %tmp23309 = getelementptr inbounds float* %tmp23308, i64 1
+ %tmp23310 = getelementptr inbounds float* %tmp23309, i64 1
+ %tmp23311 = getelementptr inbounds float* %tmp23310, i64 1
+ %tmp23312 = getelementptr inbounds float* %tmp23311, i64 1
+ %tmp23313 = getelementptr inbounds float* %tmp23312, i64 1
+ %tmp23314 = getelementptr inbounds float* %tmp23313, i64 1
+ %tmp23315 = getelementptr inbounds float* %tmp23314, i64 1
+ %tmp23316 = getelementptr inbounds float* %tmp23315, i64 1
+ %tmp23317 = getelementptr inbounds float* %tmp23316, i64 1
+ %tmp23318 = getelementptr inbounds float* %tmp23317, i64 1
+ %tmp23319 = getelementptr inbounds float* %tmp23318, i64 1
+ %tmp23320 = getelementptr inbounds float* %tmp23319, i64 1
+ %tmp23321 = getelementptr inbounds float* %tmp23320, i64 1
+ %tmp23322 = getelementptr inbounds float* %tmp23321, i64 1
+ %tmp23323 = getelementptr inbounds float* %tmp23322, i64 1
+ %tmp23324 = getelementptr inbounds float* %tmp23323, i64 1
+ %tmp23325 = getelementptr inbounds float* %tmp23324, i64 1
+ %tmp23326 = getelementptr inbounds float* %tmp23325, i64 1
+ %tmp23327 = getelementptr inbounds float* %tmp23326, i64 1
+ %tmp23328 = getelementptr inbounds float* %tmp23327, i64 1
+ %tmp23329 = getelementptr inbounds float* %tmp23328, i64 1
+ %tmp23330 = getelementptr inbounds float* %tmp23329, i64 1
+ %tmp23331 = getelementptr inbounds float* %tmp23330, i64 1
+ %tmp23332 = getelementptr inbounds float* %tmp23331, i64 1
+ %tmp23333 = getelementptr inbounds float* %tmp23332, i64 1
+ %tmp23334 = getelementptr inbounds float* %tmp23333, i64 1
+ %tmp23335 = getelementptr inbounds float* %tmp23334, i64 1
+ %tmp23336 = getelementptr inbounds float* %tmp23335, i64 1
+ %tmp23337 = getelementptr inbounds float* %tmp23336, i64 1
+ %tmp23338 = getelementptr inbounds float* %tmp23337, i64 1
+ %tmp23339 = getelementptr inbounds float* %tmp23338, i64 1
+ %tmp23340 = getelementptr inbounds float* %tmp23339, i64 1
+ %tmp23341 = getelementptr inbounds float* %tmp23340, i64 1
+ %tmp23342 = getelementptr inbounds float* %tmp23341, i64 1
+ %tmp23343 = getelementptr inbounds float* %tmp23342, i64 1
+ %tmp23344 = getelementptr inbounds float* %tmp23343, i64 1
+ %tmp23345 = getelementptr inbounds float* %tmp23344, i64 1
+ %tmp23346 = getelementptr inbounds float* %tmp23345, i64 1
+ %tmp23347 = getelementptr inbounds float* %tmp23346, i64 1
+ %tmp23348 = getelementptr inbounds float* %tmp23347, i64 1
+ %tmp23349 = getelementptr inbounds float* %tmp23348, i64 1
+ %tmp23350 = getelementptr inbounds float* %tmp23349, i64 1
+ %tmp23351 = getelementptr inbounds float* %tmp23350, i64 1
+ %tmp23352 = getelementptr inbounds float* %tmp23351, i64 1
+ %tmp23353 = getelementptr inbounds float* %tmp23352, i64 1
+ %tmp23354 = getelementptr inbounds float* %tmp23353, i64 1
+ %tmp23355 = getelementptr inbounds float* %tmp23354, i64 1
+ %tmp23356 = getelementptr inbounds float* %tmp23355, i64 1
+ %tmp23357 = getelementptr inbounds float* %tmp23356, i64 1
+ %tmp23358 = getelementptr inbounds float* %tmp23357, i64 1
+ %tmp23359 = getelementptr inbounds float* %tmp23358, i64 1
+ %tmp23360 = getelementptr inbounds float* %tmp23359, i64 1
+ %tmp23361 = getelementptr inbounds float* %tmp23360, i64 1
+ %tmp23362 = getelementptr inbounds float* %tmp23361, i64 1
+ %tmp23363 = getelementptr inbounds float* %tmp23362, i64 1
+ %tmp23364 = getelementptr inbounds float* %tmp23363, i64 1
+ %tmp23365 = getelementptr inbounds float* %tmp23364, i64 1
+ %tmp23366 = getelementptr inbounds float* %tmp23365, i64 1
+ %tmp23367 = getelementptr inbounds float* %tmp23366, i64 1
+ %tmp23368 = getelementptr inbounds float* %tmp23367, i64 1
+ %tmp23369 = getelementptr inbounds float* %tmp23368, i64 1
+ %tmp23370 = getelementptr inbounds float* %tmp23369, i64 1
+ %tmp23371 = getelementptr inbounds float* %tmp23370, i64 1
+ %tmp23372 = getelementptr inbounds float* %tmp23371, i64 1
+ %tmp23373 = getelementptr inbounds float* %tmp23372, i64 1
+ %tmp23374 = getelementptr inbounds float* %tmp23373, i64 1
+ %tmp23375 = getelementptr inbounds float* %tmp23374, i64 1
+ %tmp23376 = getelementptr inbounds float* %tmp23375, i64 1
+ %tmp23377 = getelementptr inbounds float* %tmp23376, i64 1
+ %tmp23378 = getelementptr inbounds float* %tmp23377, i64 1
+ %tmp23379 = getelementptr inbounds float* %tmp23378, i64 1
+ %tmp23380 = getelementptr inbounds float* %tmp23379, i64 1
+ %tmp23381 = getelementptr inbounds float* %tmp23380, i64 1
+ %tmp23382 = getelementptr inbounds float* %tmp23381, i64 1
+ %tmp23383 = getelementptr inbounds float* %tmp23382, i64 1
+ %tmp23384 = getelementptr inbounds float* %tmp23383, i64 1
+ %tmp23385 = getelementptr inbounds float* %tmp23384, i64 1
+ %tmp23386 = getelementptr inbounds float* %tmp23385, i64 1
+ %tmp23387 = getelementptr inbounds float* %tmp23386, i64 1
+ %tmp23388 = getelementptr inbounds float* %tmp23387, i64 1
+ %tmp23389 = getelementptr inbounds float* %tmp23388, i64 1
+ %tmp23390 = getelementptr inbounds float* %tmp23389, i64 1
+ %tmp23391 = getelementptr inbounds float* %tmp23390, i64 1
+ %tmp23392 = getelementptr inbounds float* %tmp23391, i64 1
+ %tmp23393 = getelementptr inbounds float* %tmp23392, i64 1
+ %tmp23394 = getelementptr inbounds float* %tmp23393, i64 1
+ %tmp23395 = getelementptr inbounds float* %tmp23394, i64 1
+ %tmp23396 = getelementptr inbounds float* %tmp23395, i64 1
+ %tmp23397 = getelementptr inbounds float* %tmp23396, i64 1
+ %tmp23398 = getelementptr inbounds float* %tmp23397, i64 1
+ %tmp23399 = getelementptr inbounds float* %tmp23398, i64 1
+ %tmp23400 = getelementptr inbounds float* %tmp23399, i64 1
+ %tmp23401 = getelementptr inbounds float* %tmp23400, i64 1
+ %tmp23402 = getelementptr inbounds float* %tmp23401, i64 1
+ %tmp23403 = getelementptr inbounds float* %tmp23402, i64 1
+ %tmp23404 = getelementptr inbounds float* %tmp23403, i64 1
+ %tmp23405 = getelementptr inbounds float* %tmp23404, i64 1
+ %tmp23406 = getelementptr inbounds float* %tmp23405, i64 1
+ %tmp23407 = getelementptr inbounds float* %tmp23406, i64 1
+ %tmp23408 = getelementptr inbounds float* %tmp23407, i64 1
+ %tmp23409 = getelementptr inbounds float* %tmp23408, i64 1
+ %tmp23410 = getelementptr inbounds float* %tmp23409, i64 1
+ %tmp23411 = getelementptr inbounds float* %tmp23410, i64 1
+ %tmp23412 = getelementptr inbounds float* %tmp23411, i64 1
+ %tmp23413 = getelementptr inbounds float* %tmp23412, i64 1
+ %tmp23414 = getelementptr inbounds float* %tmp23413, i64 1
+ %tmp23415 = getelementptr inbounds float* %tmp23414, i64 1
+ %tmp23416 = getelementptr inbounds float* %tmp23415, i64 1
+ %tmp23417 = getelementptr inbounds float* %tmp23416, i64 1
+ %tmp23418 = getelementptr inbounds float* %tmp23417, i64 1
+ %tmp23419 = getelementptr inbounds float* %tmp23418, i64 1
+ %tmp23420 = getelementptr inbounds float* %tmp23419, i64 1
+ %tmp23421 = getelementptr inbounds float* %tmp23420, i64 1
+ %tmp23422 = getelementptr inbounds float* %tmp23421, i64 1
+ %tmp23423 = getelementptr inbounds float* %tmp23422, i64 1
+ %tmp23424 = getelementptr inbounds float* %tmp23423, i64 1
+ %tmp23425 = getelementptr inbounds float* %tmp23424, i64 1
+ %tmp23426 = getelementptr inbounds float* %tmp23425, i64 1
+ %tmp23427 = getelementptr inbounds float* %tmp23426, i64 1
+ %tmp23428 = getelementptr inbounds float* %tmp23427, i64 1
+ %tmp23429 = getelementptr inbounds float* %tmp23428, i64 1
+ %tmp23430 = getelementptr inbounds float* %tmp23429, i64 1
+ %tmp23431 = getelementptr inbounds float* %tmp23430, i64 1
+ %tmp23432 = getelementptr inbounds float* %tmp23431, i64 1
+ %tmp23433 = getelementptr inbounds float* %tmp23432, i64 1
+ %tmp23434 = getelementptr inbounds float* %tmp23433, i64 1
+ %tmp23435 = getelementptr inbounds float* %tmp23434, i64 1
+ %tmp23436 = getelementptr inbounds float* %tmp23435, i64 1
+ %tmp23437 = getelementptr inbounds float* %tmp23436, i64 1
+ %tmp23438 = getelementptr inbounds float* %tmp23437, i64 1
+ %tmp23439 = getelementptr inbounds float* %tmp23438, i64 1
+ %tmp23440 = getelementptr inbounds float* %tmp23439, i64 1
+ %tmp23441 = getelementptr inbounds float* %tmp23440, i64 1
+ %tmp23442 = getelementptr inbounds float* %tmp23441, i64 1
+ %tmp23443 = getelementptr inbounds float* %tmp23442, i64 1
+ %tmp23444 = getelementptr inbounds float* %tmp23443, i64 1
+ %tmp23445 = getelementptr inbounds float* %tmp23444, i64 1
+ %tmp23446 = getelementptr inbounds float* %tmp23445, i64 1
+ %tmp23447 = getelementptr inbounds float* %tmp23446, i64 1
+ %tmp23448 = getelementptr inbounds float* %tmp23447, i64 1
+ %tmp23449 = getelementptr inbounds float* %tmp23448, i64 1
+ %tmp23450 = getelementptr inbounds float* %tmp23449, i64 1
+ %tmp23451 = getelementptr inbounds float* %tmp23450, i64 1
+ %tmp23452 = getelementptr inbounds float* %tmp23451, i64 1
+ %tmp23453 = getelementptr inbounds float* %tmp23452, i64 1
+ %tmp23454 = getelementptr inbounds float* %tmp23453, i64 1
+ %tmp23455 = getelementptr inbounds float* %tmp23454, i64 1
+ %tmp23456 = getelementptr inbounds float* %tmp23455, i64 1
+ %tmp23457 = getelementptr inbounds float* %tmp23456, i64 1
+ %tmp23458 = getelementptr inbounds float* %tmp23457, i64 1
+ %tmp23459 = getelementptr inbounds float* %tmp23458, i64 1
+ %tmp23460 = getelementptr inbounds float* %tmp23459, i64 1
+ %tmp23461 = getelementptr inbounds float* %tmp23460, i64 1
+ %tmp23462 = getelementptr inbounds float* %tmp23461, i64 1
+ %tmp23463 = getelementptr inbounds float* %tmp23462, i64 1
+ %tmp23464 = getelementptr inbounds float* %tmp23463, i64 1
+ %tmp23465 = getelementptr inbounds float* %tmp23464, i64 1
+ %tmp23466 = getelementptr inbounds float* %tmp23465, i64 1
+ %tmp23467 = getelementptr inbounds float* %tmp23466, i64 1
+ %tmp23468 = getelementptr inbounds float* %tmp23467, i64 1
+ %tmp23469 = getelementptr inbounds float* %tmp23468, i64 1
+ %tmp23470 = getelementptr inbounds float* %tmp23469, i64 1
+ %tmp23471 = getelementptr inbounds float* %tmp23470, i64 1
+ %tmp23472 = getelementptr inbounds float* %tmp23471, i64 1
+ %tmp23473 = getelementptr inbounds float* %tmp23472, i64 1
+ %tmp23474 = getelementptr inbounds float* %tmp23473, i64 1
+ %tmp23475 = getelementptr inbounds float* %tmp23474, i64 1
+ %tmp23476 = getelementptr inbounds float* %tmp23475, i64 1
+ %tmp23477 = getelementptr inbounds float* %tmp23476, i64 1
+ %tmp23478 = getelementptr inbounds float* %tmp23477, i64 1
+ %tmp23479 = getelementptr inbounds float* %tmp23478, i64 1
+ %tmp23480 = getelementptr inbounds float* %tmp23479, i64 1
+ %tmp23481 = getelementptr inbounds float* %tmp23480, i64 1
+ %tmp23482 = getelementptr inbounds float* %tmp23481, i64 1
+ %tmp23483 = getelementptr inbounds float* %tmp23482, i64 1
+ %tmp23484 = getelementptr inbounds float* %tmp23483, i64 1
+ %tmp23485 = getelementptr inbounds float* %tmp23484, i64 1
+ %tmp23486 = getelementptr inbounds float* %tmp23485, i64 1
+ %tmp23487 = getelementptr inbounds float* %tmp23486, i64 1
+ %tmp23488 = getelementptr inbounds float* %tmp23487, i64 1
+ %tmp23489 = getelementptr inbounds float* %tmp23488, i64 1
+ %tmp23490 = getelementptr inbounds float* %tmp23489, i64 1
+ %tmp23491 = getelementptr inbounds float* %tmp23490, i64 1
+ %tmp23492 = getelementptr inbounds float* %tmp23491, i64 1
+ %tmp23493 = getelementptr inbounds float* %tmp23492, i64 1
+ %tmp23494 = getelementptr inbounds float* %tmp23493, i64 1
+ %tmp23495 = getelementptr inbounds float* %tmp23494, i64 1
+ %tmp23496 = getelementptr inbounds float* %tmp23495, i64 1
+ %tmp23497 = getelementptr inbounds float* %tmp23496, i64 1
+ %tmp23498 = getelementptr inbounds float* %tmp23497, i64 1
+ %tmp23499 = getelementptr inbounds float* %tmp23498, i64 1
+ %tmp23500 = getelementptr inbounds float* %tmp23499, i64 1
+ %tmp23501 = getelementptr inbounds float* %tmp23500, i64 1
+ %tmp23502 = getelementptr inbounds float* %tmp23501, i64 1
+ %tmp23503 = getelementptr inbounds float* %tmp23502, i64 1
+ %tmp23504 = getelementptr inbounds float* %tmp23503, i64 1
+ %tmp23505 = getelementptr inbounds float* %tmp23504, i64 1
+ %tmp23506 = getelementptr inbounds float* %tmp23505, i64 1
+ %tmp23507 = getelementptr inbounds float* %tmp23506, i64 1
+ %tmp23508 = getelementptr inbounds float* %tmp23507, i64 1
+ %tmp23509 = getelementptr inbounds float* %tmp23508, i64 1
+ %tmp23510 = getelementptr inbounds float* %tmp23509, i64 1
+ %tmp23511 = getelementptr inbounds float* %tmp23510, i64 1
+ %tmp23512 = getelementptr inbounds float* %tmp23511, i64 1
+ %tmp23513 = getelementptr inbounds float* %tmp23512, i64 1
+ %tmp23514 = getelementptr inbounds float* %tmp23513, i64 1
+ %tmp23515 = getelementptr inbounds float* %tmp23514, i64 1
+ %tmp23516 = getelementptr inbounds float* %tmp23515, i64 1
+ %tmp23517 = getelementptr inbounds float* %tmp23516, i64 1
+ %tmp23518 = getelementptr inbounds float* %tmp23517, i64 1
+ %tmp23519 = getelementptr inbounds float* %tmp23518, i64 1
+ %tmp23520 = getelementptr inbounds float* %tmp23519, i64 1
+ %tmp23521 = getelementptr inbounds float* %tmp23520, i64 1
+ %tmp23522 = getelementptr inbounds float* %tmp23521, i64 1
+ %tmp23523 = getelementptr inbounds float* %tmp23522, i64 1
+ %tmp23524 = getelementptr inbounds float* %tmp23523, i64 1
+ %tmp23525 = getelementptr inbounds float* %tmp23524, i64 1
+ %tmp23526 = getelementptr inbounds float* %tmp23525, i64 1
+ %tmp23527 = getelementptr inbounds float* %tmp23526, i64 1
+ %tmp23528 = getelementptr inbounds float* %tmp23527, i64 1
+ %tmp23529 = getelementptr inbounds float* %tmp23528, i64 1
+ %tmp23530 = getelementptr inbounds float* %tmp23529, i64 1
+ %tmp23531 = getelementptr inbounds float* %tmp23530, i64 1
+ %tmp23532 = getelementptr inbounds float* %tmp23531, i64 1
+ %tmp23533 = getelementptr inbounds float* %tmp23532, i64 1
+ %tmp23534 = getelementptr inbounds float* %tmp23533, i64 1
+ %tmp23535 = getelementptr inbounds float* %tmp23534, i64 1
+ %tmp23536 = getelementptr inbounds float* %tmp23535, i64 1
+ %tmp23537 = getelementptr inbounds float* %tmp23536, i64 1
+ %tmp23538 = getelementptr inbounds float* %tmp23537, i64 1
+ %tmp23539 = getelementptr inbounds float* %tmp23538, i64 1
+ %tmp23540 = getelementptr inbounds float* %tmp23539, i64 1
+ %tmp23541 = getelementptr inbounds float* %tmp23540, i64 1
+ %tmp23542 = getelementptr inbounds float* %tmp23541, i64 1
+ %tmp23543 = getelementptr inbounds float* %tmp23542, i64 1
+ %tmp23544 = getelementptr inbounds float* %tmp23543, i64 1
+ %tmp23545 = getelementptr inbounds float* %tmp23544, i64 1
+ %tmp23546 = getelementptr inbounds float* %tmp23545, i64 1
+ %tmp23547 = getelementptr inbounds float* %tmp23546, i64 1
+ %tmp23548 = getelementptr inbounds float* %tmp23547, i64 1
+ %tmp23549 = getelementptr inbounds float* %tmp23548, i64 1
+ %tmp23550 = getelementptr inbounds float* %tmp23549, i64 1
+ %tmp23551 = getelementptr inbounds float* %tmp23550, i64 1
+ %tmp23552 = getelementptr inbounds float* %tmp23551, i64 1
+ %tmp23553 = getelementptr inbounds float* %tmp23552, i64 1
+ %tmp23554 = getelementptr inbounds float* %tmp23553, i64 1
+ %tmp23555 = getelementptr inbounds float* %tmp23554, i64 1
+ %tmp23556 = getelementptr inbounds float* %tmp23555, i64 1
+ %tmp23557 = getelementptr inbounds float* %tmp23556, i64 1
+ %tmp23558 = getelementptr inbounds float* %tmp23557, i64 1
+ %tmp23559 = getelementptr inbounds float* %tmp23558, i64 1
+ %tmp23560 = getelementptr inbounds float* %tmp23559, i64 1
+ %tmp23561 = getelementptr inbounds float* %tmp23560, i64 1
+ %tmp23562 = getelementptr inbounds float* %tmp23561, i64 1
+ %tmp23563 = getelementptr inbounds float* %tmp23562, i64 1
+ %tmp23564 = getelementptr inbounds float* %tmp23563, i64 1
+ %tmp23565 = getelementptr inbounds float* %tmp23564, i64 1
+ %tmp23566 = getelementptr inbounds float* %tmp23565, i64 1
+ %tmp23567 = getelementptr inbounds float* %tmp23566, i64 1
+ %tmp23568 = getelementptr inbounds float* %tmp23567, i64 1
+ %tmp23569 = getelementptr inbounds float* %tmp23568, i64 1
+ %tmp23570 = getelementptr inbounds float* %tmp23569, i64 1
+ %tmp23571 = getelementptr inbounds float* %tmp23570, i64 1
+ %tmp23572 = getelementptr inbounds float* %tmp23571, i64 1
+ %tmp23573 = getelementptr inbounds float* %tmp23572, i64 1
+ %tmp23574 = getelementptr inbounds float* %tmp23573, i64 1
+ %tmp23575 = getelementptr inbounds float* %tmp23574, i64 1
+ %tmp23576 = getelementptr inbounds float* %tmp23575, i64 1
+ %tmp23577 = getelementptr inbounds float* %tmp23576, i64 1
+ %tmp23578 = getelementptr inbounds float* %tmp23577, i64 1
+ %tmp23579 = getelementptr inbounds float* %tmp23578, i64 1
+ %tmp23580 = getelementptr inbounds float* %tmp23579, i64 1
+ %tmp23581 = getelementptr inbounds float* %tmp23580, i64 1
+ %tmp23582 = getelementptr inbounds float* %tmp23581, i64 1
+ %tmp23583 = getelementptr inbounds float* %tmp23582, i64 1
+ %tmp23584 = getelementptr inbounds float* %tmp23583, i64 1
+ %tmp23585 = getelementptr inbounds float* %tmp23584, i64 1
+ %tmp23586 = getelementptr inbounds float* %tmp23585, i64 1
+ %tmp23587 = getelementptr inbounds float* %tmp23586, i64 1
+ %tmp23588 = getelementptr inbounds float* %tmp23587, i64 1
+ %tmp23589 = getelementptr inbounds float* %tmp23588, i64 1
+ %tmp23590 = getelementptr inbounds float* %tmp23589, i64 1
+ %tmp23591 = getelementptr inbounds float* %tmp23590, i64 1
+ %tmp23592 = getelementptr inbounds float* %tmp23591, i64 1
+ %tmp23593 = getelementptr inbounds float* %tmp23592, i64 1
+ %tmp23594 = getelementptr inbounds float* %tmp23593, i64 1
+ %tmp23595 = getelementptr inbounds float* %tmp23594, i64 1
+ %tmp23596 = getelementptr inbounds float* %tmp23595, i64 1
+ %tmp23597 = getelementptr inbounds float* %tmp23596, i64 1
+ %tmp23598 = getelementptr inbounds float* %tmp23597, i64 1
+ %tmp23599 = getelementptr inbounds float* %tmp23598, i64 1
+ %tmp23600 = getelementptr inbounds float* %tmp23599, i64 1
+ %tmp23601 = getelementptr inbounds float* %tmp23600, i64 1
+ %tmp23602 = getelementptr inbounds float* %tmp23601, i64 1
+ %tmp23603 = getelementptr inbounds float* %tmp23602, i64 1
+ %tmp23604 = getelementptr inbounds float* %tmp23603, i64 1
+ %tmp23605 = getelementptr inbounds float* %tmp23604, i64 1
+ %tmp23606 = getelementptr inbounds float* %tmp23605, i64 1
+ %tmp23607 = getelementptr inbounds float* %tmp23606, i64 1
+ %tmp23608 = getelementptr inbounds float* %tmp23607, i64 1
+ %tmp23609 = getelementptr inbounds float* %tmp23608, i64 1
+ %tmp23610 = getelementptr inbounds float* %tmp23609, i64 1
+ %tmp23611 = getelementptr inbounds float* %tmp23610, i64 1
+ %tmp23612 = getelementptr inbounds float* %tmp23611, i64 1
+ %tmp23613 = getelementptr inbounds float* %tmp23612, i64 1
+ %tmp23614 = getelementptr inbounds float* %tmp23613, i64 1
+ %tmp23615 = getelementptr inbounds float* %tmp23614, i64 1
+ %tmp23616 = getelementptr inbounds float* %tmp23615, i64 1
+ %tmp23617 = getelementptr inbounds float* %tmp23616, i64 1
+ %tmp23618 = getelementptr inbounds float* %tmp23617, i64 1
+ %tmp23619 = getelementptr inbounds float* %tmp23618, i64 1
+ %tmp23620 = getelementptr inbounds float* %tmp23619, i64 1
+ %tmp23621 = getelementptr inbounds float* %tmp23620, i64 1
+ %tmp23622 = getelementptr inbounds float* %tmp23621, i64 1
+ %tmp23623 = getelementptr inbounds float* %tmp23622, i64 1
+ %tmp23624 = getelementptr inbounds float* %tmp23623, i64 1
+ %tmp23625 = getelementptr inbounds float* %tmp23624, i64 1
+ %tmp23626 = getelementptr inbounds float* %tmp23625, i64 1
+ %tmp23627 = getelementptr inbounds float* %tmp23626, i64 1
+ %tmp23628 = getelementptr inbounds float* %tmp23627, i64 1
+ %tmp23629 = getelementptr inbounds float* %tmp23628, i64 1
+ %tmp23630 = getelementptr inbounds float* %tmp23629, i64 1
+ %tmp23631 = getelementptr inbounds float* %tmp23630, i64 1
+ %tmp23632 = getelementptr inbounds float* %tmp23631, i64 1
+ %tmp23633 = getelementptr inbounds float* %tmp23632, i64 1
+ %tmp23634 = getelementptr inbounds float* %tmp23633, i64 1
+ %tmp23635 = getelementptr inbounds float* %tmp23634, i64 1
+ %tmp23636 = getelementptr inbounds float* %tmp23635, i64 1
+ %tmp23637 = getelementptr inbounds float* %tmp23636, i64 1
+ %tmp23638 = getelementptr inbounds float* %tmp23637, i64 1
+ %tmp23639 = getelementptr inbounds float* %tmp23638, i64 1
+ %tmp23640 = getelementptr inbounds float* %tmp23639, i64 1
+ %tmp23641 = getelementptr inbounds float* %tmp23640, i64 1
+ %tmp23642 = getelementptr inbounds float* %tmp23641, i64 1
+ %tmp23643 = getelementptr inbounds float* %tmp23642, i64 1
+ %tmp23644 = getelementptr inbounds float* %tmp23643, i64 1
+ %tmp23645 = getelementptr inbounds float* %tmp23644, i64 1
+ %tmp23646 = getelementptr inbounds float* %tmp23645, i64 1
+ %tmp23647 = getelementptr inbounds float* %tmp23646, i64 1
+ %tmp23648 = getelementptr inbounds float* %tmp23647, i64 1
+ %tmp23649 = getelementptr inbounds float* %tmp23648, i64 1
+ %tmp23650 = getelementptr inbounds float* %tmp23649, i64 1
+ %tmp23651 = getelementptr inbounds float* %tmp23650, i64 1
+ %tmp23652 = getelementptr inbounds float* %tmp23651, i64 1
+ %tmp23653 = getelementptr inbounds float* %tmp23652, i64 1
+ %tmp23654 = getelementptr inbounds float* %tmp23653, i64 1
+ %tmp23655 = getelementptr inbounds float* %tmp23654, i64 1
+ %tmp23656 = getelementptr inbounds float* %tmp23655, i64 1
+ %tmp23657 = getelementptr inbounds float* %tmp23656, i64 1
+ %tmp23658 = getelementptr inbounds float* %tmp23657, i64 1
+ %tmp23659 = getelementptr inbounds float* %tmp23658, i64 1
+ %tmp23660 = getelementptr inbounds float* %tmp23659, i64 1
+ %tmp23661 = getelementptr inbounds float* %tmp23660, i64 1
+ %tmp23662 = getelementptr inbounds float* %tmp23661, i64 1
+ %tmp23663 = getelementptr inbounds float* %tmp23662, i64 1
+ %tmp23664 = getelementptr inbounds float* %tmp23663, i64 1
+ %tmp23665 = getelementptr inbounds float* %tmp23664, i64 1
+ %tmp23666 = getelementptr inbounds float* %tmp23665, i64 1
+ %tmp23667 = getelementptr inbounds float* %tmp23666, i64 1
+ %tmp23668 = getelementptr inbounds float* %tmp23667, i64 1
+ %tmp23669 = getelementptr inbounds float* %tmp23668, i64 1
+ %tmp23670 = getelementptr inbounds float* %tmp23669, i64 1
+ %tmp23671 = getelementptr inbounds float* %tmp23670, i64 1
+ %tmp23672 = getelementptr inbounds float* %tmp23671, i64 1
+ %tmp23673 = getelementptr inbounds float* %tmp23672, i64 1
+ %tmp23674 = getelementptr inbounds float* %tmp23673, i64 1
+ %tmp23675 = getelementptr inbounds float* %tmp23674, i64 1
+ %tmp23676 = getelementptr inbounds float* %tmp23675, i64 1
+ %tmp23677 = getelementptr inbounds float* %tmp23676, i64 1
+ %tmp23678 = getelementptr inbounds float* %tmp23677, i64 1
+ %tmp23679 = getelementptr inbounds float* %tmp23678, i64 1
+ %tmp23680 = getelementptr inbounds float* %tmp23679, i64 1
+ %tmp23681 = getelementptr inbounds float* %tmp23680, i64 1
+ %tmp23682 = getelementptr inbounds float* %tmp23681, i64 1
+ %tmp23683 = getelementptr inbounds float* %tmp23682, i64 1
+ %tmp23684 = getelementptr inbounds float* %tmp23683, i64 1
+ %tmp23685 = getelementptr inbounds float* %tmp23684, i64 1
+ %tmp23686 = getelementptr inbounds float* %tmp23685, i64 1
+ %tmp23687 = getelementptr inbounds float* %tmp23686, i64 1
+ %tmp23688 = getelementptr inbounds float* %tmp23687, i64 1
+ %tmp23689 = getelementptr inbounds float* %tmp23688, i64 1
+ %tmp23690 = getelementptr inbounds float* %tmp23689, i64 1
+ %tmp23691 = getelementptr inbounds float* %tmp23690, i64 1
+ %tmp23692 = getelementptr inbounds float* %tmp23691, i64 1
+ %tmp23693 = getelementptr inbounds float* %tmp23692, i64 1
+ %tmp23694 = getelementptr inbounds float* %tmp23693, i64 1
+ %tmp23695 = getelementptr inbounds float* %tmp23694, i64 1
+ %tmp23696 = getelementptr inbounds float* %tmp23695, i64 1
+ %tmp23697 = getelementptr inbounds float* %tmp23696, i64 1
+ %tmp23698 = getelementptr inbounds float* %tmp23697, i64 1
+ %tmp23699 = getelementptr inbounds float* %tmp23698, i64 1
+ %tmp23700 = getelementptr inbounds float* %tmp23699, i64 1
+ %tmp23701 = getelementptr inbounds float* %tmp23700, i64 1
+ %tmp23702 = getelementptr inbounds float* %tmp23701, i64 1
+ %tmp23703 = getelementptr inbounds float* %tmp23702, i64 1
+ %tmp23704 = getelementptr inbounds float* %tmp23703, i64 1
+ %tmp23705 = getelementptr inbounds float* %tmp23704, i64 1
+ %tmp23706 = getelementptr inbounds float* %tmp23705, i64 1
+ %tmp23707 = getelementptr inbounds float* %tmp23706, i64 1
+ %tmp23708 = getelementptr inbounds float* %tmp23707, i64 1
+ %tmp23709 = getelementptr inbounds float* %tmp23708, i64 1
+ %tmp23710 = getelementptr inbounds float* %tmp23709, i64 1
+ %tmp23711 = getelementptr inbounds float* %tmp23710, i64 1
+ %tmp23712 = getelementptr inbounds float* %tmp23711, i64 1
+ %tmp23713 = getelementptr inbounds float* %tmp23712, i64 1
+ %tmp23714 = getelementptr inbounds float* %tmp23713, i64 1
+ %tmp23715 = getelementptr inbounds float* %tmp23714, i64 1
+ %tmp23716 = getelementptr inbounds float* %tmp23715, i64 1
+ %tmp23717 = getelementptr inbounds float* %tmp23716, i64 1
+ %tmp23718 = getelementptr inbounds float* %tmp23717, i64 1
+ %tmp23719 = getelementptr inbounds float* %tmp23718, i64 1
+ %tmp23720 = getelementptr inbounds float* %tmp23719, i64 1
+ %tmp23721 = getelementptr inbounds float* %tmp23720, i64 1
+ %tmp23722 = getelementptr inbounds float* %tmp23721, i64 1
+ %tmp23723 = getelementptr inbounds float* %tmp23722, i64 1
+ %tmp23724 = getelementptr inbounds float* %tmp23723, i64 1
+ %tmp23725 = getelementptr inbounds float* %tmp23724, i64 1
+ %tmp23726 = getelementptr inbounds float* %tmp23725, i64 1
+ %tmp23727 = getelementptr inbounds float* %tmp23726, i64 1
+ %tmp23728 = getelementptr inbounds float* %tmp23727, i64 1
+ %tmp23729 = getelementptr inbounds float* %tmp23728, i64 1
+ %tmp23730 = getelementptr inbounds float* %tmp23729, i64 1
+ %tmp23731 = getelementptr inbounds float* %tmp23730, i64 1
+ %tmp23732 = getelementptr inbounds float* %tmp23731, i64 1
+ %tmp23733 = getelementptr inbounds float* %tmp23732, i64 1
+ %tmp23734 = getelementptr inbounds float* %tmp23733, i64 1
+ %tmp23735 = getelementptr inbounds float* %tmp23734, i64 1
+ %tmp23736 = getelementptr inbounds float* %tmp23735, i64 1
+ %tmp23737 = getelementptr inbounds float* %tmp23736, i64 1
+ %tmp23738 = getelementptr inbounds float* %tmp23737, i64 1
+ %tmp23739 = getelementptr inbounds float* %tmp23738, i64 1
+ %tmp23740 = getelementptr inbounds float* %tmp23739, i64 1
+ %tmp23741 = getelementptr inbounds float* %tmp23740, i64 1
+ %tmp23742 = getelementptr inbounds float* %tmp23741, i64 1
+ %tmp23743 = getelementptr inbounds float* %tmp23742, i64 1
+ %tmp23744 = getelementptr inbounds float* %tmp23743, i64 1
+ %tmp23745 = getelementptr inbounds float* %tmp23744, i64 1
+ %tmp23746 = getelementptr inbounds float* %tmp23745, i64 1
+ %tmp23747 = getelementptr inbounds float* %tmp23746, i64 1
+ %tmp23748 = getelementptr inbounds float* %tmp23747, i64 1
+ %tmp23749 = getelementptr inbounds float* %tmp23748, i64 1
+ %tmp23750 = getelementptr inbounds float* %tmp23749, i64 1
+ %tmp23751 = getelementptr inbounds float* %tmp23750, i64 1
+ %tmp23752 = getelementptr inbounds float* %tmp23751, i64 1
+ %tmp23753 = getelementptr inbounds float* %tmp23752, i64 1
+ %tmp23754 = getelementptr inbounds float* %tmp23753, i64 1
+ %tmp23755 = getelementptr inbounds float* %tmp23754, i64 1
+ %tmp23756 = getelementptr inbounds float* %tmp23755, i64 1
+ %tmp23757 = getelementptr inbounds float* %tmp23756, i64 1
+ %tmp23758 = getelementptr inbounds float* %tmp23757, i64 1
+ %tmp23759 = getelementptr inbounds float* %tmp23758, i64 1
+ %tmp23760 = getelementptr inbounds float* %tmp23759, i64 1
+ %tmp23761 = getelementptr inbounds float* %tmp23760, i64 1
+ %tmp23762 = getelementptr inbounds float* %tmp23761, i64 1
+ %tmp23763 = getelementptr inbounds float* %tmp23762, i64 1
+ %tmp23764 = getelementptr inbounds float* %tmp23763, i64 1
+ %tmp23765 = getelementptr inbounds float* %tmp23764, i64 1
+ %tmp23766 = getelementptr inbounds float* %tmp23765, i64 1
+ %tmp23767 = getelementptr inbounds float* %tmp23766, i64 1
+ %tmp23768 = getelementptr inbounds float* %tmp23767, i64 1
+ %tmp23769 = getelementptr inbounds float* %tmp23768, i64 1
+ %tmp23770 = getelementptr inbounds float* %tmp23769, i64 1
+ %tmp23771 = getelementptr inbounds float* %tmp23770, i64 1
+ %tmp23772 = getelementptr inbounds float* %tmp23771, i64 1
+ %tmp23773 = getelementptr inbounds float* %tmp23772, i64 1
+ %tmp23774 = getelementptr inbounds float* %tmp23773, i64 1
+ %tmp23775 = getelementptr inbounds float* %tmp23774, i64 1
+ %tmp23776 = getelementptr inbounds float* %tmp23775, i64 1
+ %tmp23777 = getelementptr inbounds float* %tmp23776, i64 1
+ %tmp23778 = getelementptr inbounds float* %tmp23777, i64 1
+ %tmp23779 = getelementptr inbounds float* %tmp23778, i64 1
+ %tmp23780 = getelementptr inbounds float* %tmp23779, i64 1
+ %tmp23781 = getelementptr inbounds float* %tmp23780, i64 1
+ %tmp23782 = getelementptr inbounds float* %tmp23781, i64 1
+ %tmp23783 = getelementptr inbounds float* %tmp23782, i64 1
+ %tmp23784 = getelementptr inbounds float* %tmp23783, i64 1
+ %tmp23785 = getelementptr inbounds float* %tmp23784, i64 1
+ %tmp23786 = getelementptr inbounds float* %tmp23785, i64 1
+ %tmp23787 = getelementptr inbounds float* %tmp23786, i64 1
+ %tmp23788 = getelementptr inbounds float* %tmp23787, i64 1
+ %tmp23789 = getelementptr inbounds float* %tmp23788, i64 1
+ %tmp23790 = getelementptr inbounds float* %tmp23789, i64 1
+ %tmp23791 = getelementptr inbounds float* %tmp23790, i64 1
+ %tmp23792 = getelementptr inbounds float* %tmp23791, i64 1
+ %tmp23793 = getelementptr inbounds float* %tmp23792, i64 1
+ %tmp23794 = getelementptr inbounds float* %tmp23793, i64 1
+ %tmp23795 = getelementptr inbounds float* %tmp23794, i64 1
+ %tmp23796 = getelementptr inbounds float* %tmp23795, i64 1
+ %tmp23797 = getelementptr inbounds float* %tmp23796, i64 1
+ %tmp23798 = getelementptr inbounds float* %tmp23797, i64 1
+ %tmp23799 = getelementptr inbounds float* %tmp23798, i64 1
+ %tmp23800 = getelementptr inbounds float* %tmp23799, i64 1
+ %tmp23801 = getelementptr inbounds float* %tmp23800, i64 1
+ %tmp23802 = getelementptr inbounds float* %tmp23801, i64 1
+ %tmp23803 = getelementptr inbounds float* %tmp23802, i64 1
+ %tmp23804 = getelementptr inbounds float* %tmp23803, i64 1
+ %tmp23805 = getelementptr inbounds float* %tmp23804, i64 1
+ %tmp23806 = getelementptr inbounds float* %tmp23805, i64 1
+ %tmp23807 = getelementptr inbounds float* %tmp23806, i64 1
+ %tmp23808 = getelementptr inbounds float* %tmp23807, i64 1
+ %tmp23809 = getelementptr inbounds float* %tmp23808, i64 1
+ %tmp23810 = getelementptr inbounds float* %tmp23809, i64 1
+ %tmp23811 = getelementptr inbounds float* %tmp23810, i64 1
+ %tmp23812 = getelementptr inbounds float* %tmp23811, i64 1
+ %tmp23813 = getelementptr inbounds float* %tmp23812, i64 1
+ %tmp23814 = getelementptr inbounds float* %tmp23813, i64 1
+ %tmp23815 = getelementptr inbounds float* %tmp23814, i64 1
+ %tmp23816 = getelementptr inbounds float* %tmp23815, i64 1
+ %tmp23817 = getelementptr inbounds float* %tmp23816, i64 1
+ %tmp23818 = getelementptr inbounds float* %tmp23817, i64 1
+ %tmp23819 = getelementptr inbounds float* %tmp23818, i64 1
+ %tmp23820 = getelementptr inbounds float* %tmp23819, i64 1
+ %tmp23821 = getelementptr inbounds float* %tmp23820, i64 1
+ %tmp23822 = getelementptr inbounds float* %tmp23821, i64 1
+ %tmp23823 = getelementptr inbounds float* %tmp23822, i64 1
+ %tmp23824 = getelementptr inbounds float* %tmp23823, i64 1
+ %tmp23825 = getelementptr inbounds float* %tmp23824, i64 1
+ %tmp23826 = getelementptr inbounds float* %tmp23825, i64 1
+ %tmp23827 = getelementptr inbounds float* %tmp23826, i64 1
+ %tmp23828 = getelementptr inbounds float* %tmp23827, i64 1
+ %tmp23829 = getelementptr inbounds float* %tmp23828, i64 1
+ %tmp23830 = getelementptr inbounds float* %tmp23829, i64 1
+ %tmp23831 = getelementptr inbounds float* %tmp23830, i64 1
+ %tmp23832 = getelementptr inbounds float* %tmp23831, i64 1
+ %tmp23833 = getelementptr inbounds float* %tmp23832, i64 1
+ %tmp23834 = getelementptr inbounds float* %tmp23833, i64 1
+ %tmp23835 = getelementptr inbounds float* %tmp23834, i64 1
+ %tmp23836 = getelementptr inbounds float* %tmp23835, i64 1
+ %tmp23837 = getelementptr inbounds float* %tmp23836, i64 1
+ %tmp23838 = getelementptr inbounds float* %tmp23837, i64 1
+ %tmp23839 = getelementptr inbounds float* %tmp23838, i64 1
+ %tmp23840 = getelementptr inbounds float* %tmp23839, i64 1
+ %tmp23841 = getelementptr inbounds float* %tmp23840, i64 1
+ %tmp23842 = getelementptr inbounds float* %tmp23841, i64 1
+ %tmp23843 = getelementptr inbounds float* %tmp23842, i64 1
+ %tmp23844 = getelementptr inbounds float* %tmp23843, i64 1
+ %tmp23845 = getelementptr inbounds float* %tmp23844, i64 1
+ %tmp23846 = getelementptr inbounds float* %tmp23845, i64 1
+ %tmp23847 = getelementptr inbounds float* %tmp23846, i64 1
+ %tmp23848 = getelementptr inbounds float* %tmp23847, i64 1
+ %tmp23849 = getelementptr inbounds float* %tmp23848, i64 1
+ %tmp23850 = getelementptr inbounds float* %tmp23849, i64 1
+ %tmp23851 = getelementptr inbounds float* %tmp23850, i64 1
+ %tmp23852 = getelementptr inbounds float* %tmp23851, i64 1
+ %tmp23853 = getelementptr inbounds float* %tmp23852, i64 1
+ %tmp23854 = getelementptr inbounds float* %tmp23853, i64 1
+ %tmp23855 = getelementptr inbounds float* %tmp23854, i64 1
+ %tmp23856 = getelementptr inbounds float* %tmp23855, i64 1
+ %tmp23857 = getelementptr inbounds float* %tmp23856, i64 1
+ %tmp23858 = getelementptr inbounds float* %tmp23857, i64 1
+ %tmp23859 = getelementptr inbounds float* %tmp23858, i64 1
+ %tmp23860 = getelementptr inbounds float* %tmp23859, i64 1
+ %tmp23861 = getelementptr inbounds float* %tmp23860, i64 1
+ %tmp23862 = getelementptr inbounds float* %tmp23861, i64 1
+ %tmp23863 = getelementptr inbounds float* %tmp23862, i64 1
+ %tmp23864 = getelementptr inbounds float* %tmp23863, i64 1
+ %tmp23865 = getelementptr inbounds float* %tmp23864, i64 1
+ %tmp23866 = getelementptr inbounds float* %tmp23865, i64 1
+ %tmp23867 = getelementptr inbounds float* %tmp23866, i64 1
+ %tmp23868 = getelementptr inbounds float* %tmp23867, i64 1
+ %tmp23869 = getelementptr inbounds float* %tmp23868, i64 1
+ %tmp23870 = getelementptr inbounds float* %tmp23869, i64 1
+ %tmp23871 = getelementptr inbounds float* %tmp23870, i64 1
+ %tmp23872 = getelementptr inbounds float* %tmp23871, i64 1
+ %tmp23873 = getelementptr inbounds float* %tmp23872, i64 1
+ %tmp23874 = getelementptr inbounds float* %tmp23873, i64 1
+ %tmp23875 = getelementptr inbounds float* %tmp23874, i64 1
+ %tmp23876 = getelementptr inbounds float* %tmp23875, i64 1
+ %tmp23877 = getelementptr inbounds float* %tmp23876, i64 1
+ %tmp23878 = getelementptr inbounds float* %tmp23877, i64 1
+ %tmp23879 = getelementptr inbounds float* %tmp23878, i64 1
+ %tmp23880 = getelementptr inbounds float* %tmp23879, i64 1
+ %tmp23881 = getelementptr inbounds float* %tmp23880, i64 1
+ %tmp23882 = getelementptr inbounds float* %tmp23881, i64 1
+ %tmp23883 = getelementptr inbounds float* %tmp23882, i64 1
+ %tmp23884 = getelementptr inbounds float* %tmp23883, i64 1
+ %tmp23885 = getelementptr inbounds float* %tmp23884, i64 1
+ %tmp23886 = getelementptr inbounds float* %tmp23885, i64 1
+ %tmp23887 = getelementptr inbounds float* %tmp23886, i64 1
+ %tmp23888 = getelementptr inbounds float* %tmp23887, i64 1
+ %tmp23889 = getelementptr inbounds float* %tmp23888, i64 1
+ %tmp23890 = getelementptr inbounds float* %tmp23889, i64 1
+ %tmp23891 = getelementptr inbounds float* %tmp23890, i64 1
+ %tmp23892 = getelementptr inbounds float* %tmp23891, i64 1
+ %tmp23893 = getelementptr inbounds float* %tmp23892, i64 1
+ %tmp23894 = getelementptr inbounds float* %tmp23893, i64 1
+ %tmp23895 = getelementptr inbounds float* %tmp23894, i64 1
+ %tmp23896 = getelementptr inbounds float* %tmp23895, i64 1
+ %tmp23897 = getelementptr inbounds float* %tmp23896, i64 1
+ %tmp23898 = getelementptr inbounds float* %tmp23897, i64 1
+ %tmp23899 = getelementptr inbounds float* %tmp23898, i64 1
+ %tmp23900 = getelementptr inbounds float* %tmp23899, i64 1
+ %tmp23901 = getelementptr inbounds float* %tmp23900, i64 1
+ %tmp23902 = getelementptr inbounds float* %tmp23901, i64 1
+ %tmp23903 = getelementptr inbounds float* %tmp23902, i64 1
+ %tmp23904 = getelementptr inbounds float* %tmp23903, i64 1
+ %tmp23905 = getelementptr inbounds float* %tmp23904, i64 1
+ %tmp23906 = getelementptr inbounds float* %tmp23905, i64 1
+ %tmp23907 = getelementptr inbounds float* %tmp23906, i64 1
+ %tmp23908 = getelementptr inbounds float* %tmp23907, i64 1
+ %tmp23909 = getelementptr inbounds float* %tmp23908, i64 1
+ %tmp23910 = getelementptr inbounds float* %tmp23909, i64 1
+ %tmp23911 = getelementptr inbounds float* %tmp23910, i64 1
+ %tmp23912 = getelementptr inbounds float* %tmp23911, i64 1
+ %tmp23913 = getelementptr inbounds float* %tmp23912, i64 1
+ %tmp23914 = getelementptr inbounds float* %tmp23913, i64 1
+ %tmp23915 = getelementptr inbounds float* %tmp23914, i64 1
+ %tmp23916 = getelementptr inbounds float* %tmp23915, i64 1
+ %tmp23917 = getelementptr inbounds float* %tmp23916, i64 1
+ %tmp23918 = getelementptr inbounds float* %tmp23917, i64 1
+ %tmp23919 = getelementptr inbounds float* %tmp23918, i64 1
+ %tmp23920 = getelementptr inbounds float* %tmp23919, i64 1
+ %tmp23921 = getelementptr inbounds float* %tmp23920, i64 1
+ %tmp23922 = getelementptr inbounds float* %tmp23921, i64 1
+ %tmp23923 = getelementptr inbounds float* %tmp23922, i64 1
+ %tmp23924 = getelementptr inbounds float* %tmp23923, i64 1
+ %tmp23925 = getelementptr inbounds float* %tmp23924, i64 1
+ %tmp23926 = getelementptr inbounds float* %tmp23925, i64 1
+ %tmp23927 = getelementptr inbounds float* %tmp23926, i64 1
+ %tmp23928 = getelementptr inbounds float* %tmp23927, i64 1
+ %tmp23929 = getelementptr inbounds float* %tmp23928, i64 1
+ %tmp23930 = getelementptr inbounds float* %tmp23929, i64 1
+ %tmp23931 = getelementptr inbounds float* %tmp23930, i64 1
+ %tmp23932 = getelementptr inbounds float* %tmp23931, i64 1
+ %tmp23933 = getelementptr inbounds float* %tmp23932, i64 1
+ %tmp23934 = getelementptr inbounds float* %tmp23933, i64 1
+ %tmp23935 = getelementptr inbounds float* %tmp23934, i64 1
+ %tmp23936 = getelementptr inbounds float* %tmp23935, i64 1
+ %tmp23937 = getelementptr inbounds float* %tmp23936, i64 1
+ %tmp23938 = getelementptr inbounds float* %tmp23937, i64 1
+ %tmp23939 = getelementptr inbounds float* %tmp23938, i64 1
+ %tmp23940 = getelementptr inbounds float* %tmp23939, i64 1
+ %tmp23941 = getelementptr inbounds float* %tmp23940, i64 1
+ %tmp23942 = getelementptr inbounds float* %tmp23941, i64 1
+ %tmp23943 = getelementptr inbounds float* %tmp23942, i64 1
+ %tmp23944 = getelementptr inbounds float* %tmp23943, i64 1
+ %tmp23945 = getelementptr inbounds float* %tmp23944, i64 1
+ %tmp23946 = getelementptr inbounds float* %tmp23945, i64 1
+ %tmp23947 = getelementptr inbounds float* %tmp23946, i64 1
+ %tmp23948 = getelementptr inbounds float* %tmp23947, i64 1
+ %tmp23949 = getelementptr inbounds float* %tmp23948, i64 1
+ %tmp23950 = getelementptr inbounds float* %tmp23949, i64 1
+ %tmp23951 = getelementptr inbounds float* %tmp23950, i64 1
+ %tmp23952 = getelementptr inbounds float* %tmp23951, i64 1
+ %tmp23953 = getelementptr inbounds float* %tmp23952, i64 1
+ %tmp23954 = getelementptr inbounds float* %tmp23953, i64 1
+ %tmp23955 = getelementptr inbounds float* %tmp23954, i64 1
+ %tmp23956 = getelementptr inbounds float* %tmp23955, i64 1
+ %tmp23957 = getelementptr inbounds float* %tmp23956, i64 1
+ %tmp23958 = getelementptr inbounds float* %tmp23957, i64 1
+ %tmp23959 = getelementptr inbounds float* %tmp23958, i64 1
+ %tmp23960 = getelementptr inbounds float* %tmp23959, i64 1
+ %tmp23961 = getelementptr inbounds float* %tmp23960, i64 1
+ %tmp23962 = getelementptr inbounds float* %tmp23961, i64 1
+ %tmp23963 = getelementptr inbounds float* %tmp23962, i64 1
+ %tmp23964 = getelementptr inbounds float* %tmp23963, i64 1
+ %tmp23965 = getelementptr inbounds float* %tmp23964, i64 1
+ %tmp23966 = getelementptr inbounds float* %tmp23965, i64 1
+ %tmp23967 = getelementptr inbounds float* %tmp23966, i64 1
+ %tmp23968 = getelementptr inbounds float* %tmp23967, i64 1
+ %tmp23969 = getelementptr inbounds float* %tmp23968, i64 1
+ %tmp23970 = getelementptr inbounds float* %tmp23969, i64 1
+ %tmp23971 = getelementptr inbounds float* %tmp23970, i64 1
+ %tmp23972 = getelementptr inbounds float* %tmp23971, i64 1
+ %tmp23973 = getelementptr inbounds float* %tmp23972, i64 1
+ %tmp23974 = getelementptr inbounds float* %tmp23973, i64 1
+ %tmp23975 = getelementptr inbounds float* %tmp23974, i64 1
+ %tmp23976 = getelementptr inbounds float* %tmp23975, i64 1
+ %tmp23977 = getelementptr inbounds float* %tmp23976, i64 1
+ %tmp23978 = getelementptr inbounds float* %tmp23977, i64 1
+ %tmp23979 = getelementptr inbounds float* %tmp23978, i64 1
+ %tmp23980 = getelementptr inbounds float* %tmp23979, i64 1
+ %tmp23981 = getelementptr inbounds float* %tmp23980, i64 1
+ %tmp23982 = getelementptr inbounds float* %tmp23981, i64 1
+ %tmp23983 = getelementptr inbounds float* %tmp23982, i64 1
+ %tmp23984 = getelementptr inbounds float* %tmp23983, i64 1
+ %tmp23985 = getelementptr inbounds float* %tmp23984, i64 1
+ %tmp23986 = getelementptr inbounds float* %tmp23985, i64 1
+ %tmp23987 = getelementptr inbounds float* %tmp23986, i64 1
+ %tmp23988 = getelementptr inbounds float* %tmp23987, i64 1
+ %tmp23989 = getelementptr inbounds float* %tmp23988, i64 1
+ %tmp23990 = getelementptr inbounds float* %tmp23989, i64 1
+ %tmp23991 = getelementptr inbounds float* %tmp23990, i64 1
+ %tmp23992 = getelementptr inbounds float* %tmp23991, i64 1
+ %tmp23993 = getelementptr inbounds float* %tmp23992, i64 1
+ %tmp23994 = getelementptr inbounds float* %tmp23993, i64 1
+ %tmp23995 = getelementptr inbounds float* %tmp23994, i64 1
+ %tmp23996 = getelementptr inbounds float* %tmp23995, i64 1
+ %tmp23997 = getelementptr inbounds float* %tmp23996, i64 1
+ %tmp23998 = getelementptr inbounds float* %tmp23997, i64 1
+ %tmp23999 = getelementptr inbounds float* %tmp23998, i64 1
+ %tmp24000 = getelementptr inbounds float* %tmp23999, i64 1
+ %tmp24001 = getelementptr inbounds float* %tmp24000, i64 1
+ %tmp24002 = getelementptr inbounds float* %tmp24001, i64 1
+ %tmp24003 = getelementptr inbounds float* %tmp24002, i64 1
+ %tmp24004 = getelementptr inbounds float* %tmp24003, i64 1
+ %tmp24005 = getelementptr inbounds float* %tmp24004, i64 1
+ %tmp24006 = getelementptr inbounds float* %tmp24005, i64 1
+ %tmp24007 = getelementptr inbounds float* %tmp24006, i64 1
+ %tmp24008 = getelementptr inbounds float* %tmp24007, i64 1
+ %tmp24009 = getelementptr inbounds float* %tmp24008, i64 1
+ %tmp24010 = getelementptr inbounds float* %tmp24009, i64 1
+ %tmp24011 = getelementptr inbounds float* %tmp24010, i64 1
+ %tmp24012 = getelementptr inbounds float* %tmp24011, i64 1
+ %tmp24013 = getelementptr inbounds float* %tmp24012, i64 1
+ %tmp24014 = getelementptr inbounds float* %tmp24013, i64 1
+ %tmp24015 = getelementptr inbounds float* %tmp24014, i64 1
+ %tmp24016 = getelementptr inbounds float* %tmp24015, i64 1
+ %tmp24017 = getelementptr inbounds float* %tmp24016, i64 1
+ %tmp24018 = getelementptr inbounds float* %tmp24017, i64 1
+ %tmp24019 = getelementptr inbounds float* %tmp24018, i64 1
+ %tmp24020 = getelementptr inbounds float* %tmp24019, i64 1
+ %tmp24021 = getelementptr inbounds float* %tmp24020, i64 1
+ %tmp24022 = getelementptr inbounds float* %tmp24021, i64 1
+ %tmp24023 = getelementptr inbounds float* %tmp24022, i64 1
+ %tmp24024 = getelementptr inbounds float* %tmp24023, i64 1
+ %tmp24025 = getelementptr inbounds float* %tmp24024, i64 1
+ %tmp24026 = getelementptr inbounds float* %tmp24025, i64 1
+ %tmp24027 = getelementptr inbounds float* %tmp24026, i64 1
+ %tmp24028 = getelementptr inbounds float* %tmp24027, i64 1
+ %tmp24029 = getelementptr inbounds float* %tmp24028, i64 1
+ %tmp24030 = getelementptr inbounds float* %tmp24029, i64 1
+ %tmp24031 = getelementptr inbounds float* %tmp24030, i64 1
+ %tmp24032 = getelementptr inbounds float* %tmp24031, i64 1
+ %tmp24033 = getelementptr inbounds float* %tmp24032, i64 1
+ %tmp24034 = getelementptr inbounds float* %tmp24033, i64 1
+ %tmp24035 = getelementptr inbounds float* %tmp24034, i64 1
+ %tmp24036 = getelementptr inbounds float* %tmp24035, i64 1
+ %tmp24037 = getelementptr inbounds float* %tmp24036, i64 1
+ %tmp24038 = getelementptr inbounds float* %tmp24037, i64 1
+ %tmp24039 = getelementptr inbounds float* %tmp24038, i64 1
+ %tmp24040 = getelementptr inbounds float* %tmp24039, i64 1
+ %tmp24041 = getelementptr inbounds float* %tmp24040, i64 1
+ %tmp24042 = getelementptr inbounds float* %tmp24041, i64 1
+ %tmp24043 = getelementptr inbounds float* %tmp24042, i64 1
+ %tmp24044 = getelementptr inbounds float* %tmp24043, i64 1
+ %tmp24045 = getelementptr inbounds float* %tmp24044, i64 1
+ %tmp24046 = getelementptr inbounds float* %tmp24045, i64 1
+ %tmp24047 = getelementptr inbounds float* %tmp24046, i64 1
+ %tmp24048 = getelementptr inbounds float* %tmp24047, i64 1
+ %tmp24049 = getelementptr inbounds float* %tmp24048, i64 1
+ %tmp24050 = getelementptr inbounds float* %tmp24049, i64 1
+ %tmp24051 = getelementptr inbounds float* %tmp24050, i64 1
+ %tmp24052 = getelementptr inbounds float* %tmp24051, i64 1
+ %tmp24053 = getelementptr inbounds float* %tmp24052, i64 1
+ %tmp24054 = getelementptr inbounds float* %tmp24053, i64 1
+ %tmp24055 = getelementptr inbounds float* %tmp24054, i64 1
+ %tmp24056 = getelementptr inbounds float* %tmp24055, i64 1
+ %tmp24057 = getelementptr inbounds float* %tmp24056, i64 1
+ %tmp24058 = getelementptr inbounds float* %tmp24057, i64 1
+ %tmp24059 = getelementptr inbounds float* %tmp24058, i64 1
+ %tmp24060 = getelementptr inbounds float* %tmp24059, i64 1
+ %tmp24061 = getelementptr inbounds float* %tmp24060, i64 1
+ %tmp24062 = getelementptr inbounds float* %tmp24061, i64 1
+ %tmp24063 = getelementptr inbounds float* %tmp24062, i64 1
+ %tmp24064 = getelementptr inbounds float* %tmp24063, i64 1
+ %tmp24065 = getelementptr inbounds float* %tmp24064, i64 1
+ %tmp24066 = getelementptr inbounds float* %tmp24065, i64 1
+ %tmp24067 = getelementptr inbounds float* %tmp24066, i64 1
+ %tmp24068 = getelementptr inbounds float* %tmp24067, i64 1
+ %tmp24069 = getelementptr inbounds float* %tmp24068, i64 1
+ %tmp24070 = getelementptr inbounds float* %tmp24069, i64 1
+ %tmp24071 = getelementptr inbounds float* %tmp24070, i64 1
+ %tmp24072 = getelementptr inbounds float* %tmp24071, i64 1
+ %tmp24073 = getelementptr inbounds float* %tmp24072, i64 1
+ %tmp24074 = getelementptr inbounds float* %tmp24073, i64 1
+ %tmp24075 = getelementptr inbounds float* %tmp24074, i64 1
+ %tmp24076 = getelementptr inbounds float* %tmp24075, i64 1
+ %tmp24077 = getelementptr inbounds float* %tmp24076, i64 1
+ %tmp24078 = getelementptr inbounds float* %tmp24077, i64 1
+ %tmp24079 = getelementptr inbounds float* %tmp24078, i64 1
+ %tmp24080 = getelementptr inbounds float* %tmp24079, i64 1
+ %tmp24081 = getelementptr inbounds float* %tmp24080, i64 1
+ %tmp24082 = getelementptr inbounds float* %tmp24081, i64 1
+ %tmp24083 = getelementptr inbounds float* %tmp24082, i64 1
+ %tmp24084 = getelementptr inbounds float* %tmp24083, i64 1
+ %tmp24085 = getelementptr inbounds float* %tmp24084, i64 1
+ %tmp24086 = getelementptr inbounds float* %tmp24085, i64 1
+ %tmp24087 = getelementptr inbounds float* %tmp24086, i64 1
+ %tmp24088 = getelementptr inbounds float* %tmp24087, i64 1
+ %tmp24089 = getelementptr inbounds float* %tmp24088, i64 1
+ %tmp24090 = getelementptr inbounds float* %tmp24089, i64 1
+ %tmp24091 = getelementptr inbounds float* %tmp24090, i64 1
+ %tmp24092 = getelementptr inbounds float* %tmp24091, i64 1
+ %tmp24093 = getelementptr inbounds float* %tmp24092, i64 1
+ %tmp24094 = getelementptr inbounds float* %tmp24093, i64 1
+ %tmp24095 = getelementptr inbounds float* %tmp24094, i64 1
+ %tmp24096 = getelementptr inbounds float* %tmp24095, i64 1
+ %tmp24097 = getelementptr inbounds float* %tmp24096, i64 1
+ %tmp24098 = getelementptr inbounds float* %tmp24097, i64 1
+ %tmp24099 = getelementptr inbounds float* %tmp24098, i64 1
+ %tmp24100 = getelementptr inbounds float* %tmp24099, i64 1
+ %tmp24101 = getelementptr inbounds float* %tmp24100, i64 1
+ %tmp24102 = getelementptr inbounds float* %tmp24101, i64 1
+ %tmp24103 = getelementptr inbounds float* %tmp24102, i64 1
+ %tmp24104 = getelementptr inbounds float* %tmp24103, i64 1
+ %tmp24105 = getelementptr inbounds float* %tmp24104, i64 1
+ %tmp24106 = getelementptr inbounds float* %tmp24105, i64 1
+ %tmp24107 = getelementptr inbounds float* %tmp24106, i64 1
+ %tmp24108 = getelementptr inbounds float* %tmp24107, i64 1
+ %tmp24109 = getelementptr inbounds float* %tmp24108, i64 1
+ %tmp24110 = getelementptr inbounds float* %tmp24109, i64 1
+ %tmp24111 = getelementptr inbounds float* %tmp24110, i64 1
+ %tmp24112 = getelementptr inbounds float* %tmp24111, i64 1
+ %tmp24113 = getelementptr inbounds float* %tmp24112, i64 1
+ %tmp24114 = getelementptr inbounds float* %tmp24113, i64 1
+ %tmp24115 = getelementptr inbounds float* %tmp24114, i64 1
+ %tmp24116 = getelementptr inbounds float* %tmp24115, i64 1
+ %tmp24117 = getelementptr inbounds float* %tmp24116, i64 1
+ %tmp24118 = getelementptr inbounds float* %tmp24117, i64 1
+ %tmp24119 = getelementptr inbounds float* %tmp24118, i64 1
+ %tmp24120 = getelementptr inbounds float* %tmp24119, i64 1
+ %tmp24121 = getelementptr inbounds float* %tmp24120, i64 1
+ %tmp24122 = getelementptr inbounds float* %tmp24121, i64 1
+ %tmp24123 = getelementptr inbounds float* %tmp24122, i64 1
+ %tmp24124 = getelementptr inbounds float* %tmp24123, i64 1
+ %tmp24125 = getelementptr inbounds float* %tmp24124, i64 1
+ %tmp24126 = getelementptr inbounds float* %tmp24125, i64 1
+ %tmp24127 = getelementptr inbounds float* %tmp24126, i64 1
+ %tmp24128 = getelementptr inbounds float* %tmp24127, i64 1
+ %tmp24129 = getelementptr inbounds float* %tmp24128, i64 1
+ %tmp24130 = getelementptr inbounds float* %tmp24129, i64 1
+ %tmp24131 = getelementptr inbounds float* %tmp24130, i64 1
+ %tmp24132 = getelementptr inbounds float* %tmp24131, i64 1
+ %tmp24133 = getelementptr inbounds float* %tmp24132, i64 1
+ %tmp24134 = getelementptr inbounds float* %tmp24133, i64 1
+ %tmp24135 = getelementptr inbounds float* %tmp24134, i64 1
+ %tmp24136 = getelementptr inbounds float* %tmp24135, i64 1
+ %tmp24137 = getelementptr inbounds float* %tmp24136, i64 1
+ %tmp24138 = getelementptr inbounds float* %tmp24137, i64 1
+ %tmp24139 = getelementptr inbounds float* %tmp24138, i64 1
+ %tmp24140 = getelementptr inbounds float* %tmp24139, i64 1
+ %tmp24141 = getelementptr inbounds float* %tmp24140, i64 1
+ %tmp24142 = getelementptr inbounds float* %tmp24141, i64 1
+ %tmp24143 = getelementptr inbounds float* %tmp24142, i64 1
+ %tmp24144 = getelementptr inbounds float* %tmp24143, i64 1
+ %tmp24145 = getelementptr inbounds float* %tmp24144, i64 1
+ %tmp24146 = getelementptr inbounds float* %tmp24145, i64 1
+ %tmp24147 = getelementptr inbounds float* %tmp24146, i64 1
+ %tmp24148 = getelementptr inbounds float* %tmp24147, i64 1
+ %tmp24149 = getelementptr inbounds float* %tmp24148, i64 1
+ %tmp24150 = getelementptr inbounds float* %tmp24149, i64 1
+ %tmp24151 = getelementptr inbounds float* %tmp24150, i64 1
+ %tmp24152 = getelementptr inbounds float* %tmp24151, i64 1
+ %tmp24153 = getelementptr inbounds float* %tmp24152, i64 1
+ %tmp24154 = getelementptr inbounds float* %tmp24153, i64 1
+ %tmp24155 = getelementptr inbounds float* %tmp24154, i64 1
+ %tmp24156 = getelementptr inbounds float* %tmp24155, i64 1
+ %tmp24157 = getelementptr inbounds float* %tmp24156, i64 1
+ %tmp24158 = getelementptr inbounds float* %tmp24157, i64 1
+ %tmp24159 = getelementptr inbounds float* %tmp24158, i64 1
+ %tmp24160 = getelementptr inbounds float* %tmp24159, i64 1
+ %tmp24161 = getelementptr inbounds float* %tmp24160, i64 1
+ %tmp24162 = getelementptr inbounds float* %tmp24161, i64 1
+ %tmp24163 = getelementptr inbounds float* %tmp24162, i64 1
+ %tmp24164 = getelementptr inbounds float* %tmp24163, i64 1
+ %tmp24165 = getelementptr inbounds float* %tmp24164, i64 1
+ %tmp24166 = getelementptr inbounds float* %tmp24165, i64 1
+ %tmp24167 = getelementptr inbounds float* %tmp24166, i64 1
+ %tmp24168 = getelementptr inbounds float* %tmp24167, i64 1
+ %tmp24169 = getelementptr inbounds float* %tmp24168, i64 1
+ %tmp24170 = getelementptr inbounds float* %tmp24169, i64 1
+ %tmp24171 = getelementptr inbounds float* %tmp24170, i64 1
+ %tmp24172 = getelementptr inbounds float* %tmp24171, i64 1
+ %tmp24173 = getelementptr inbounds float* %tmp24172, i64 1
+ %tmp24174 = getelementptr inbounds float* %tmp24173, i64 1
+ %tmp24175 = getelementptr inbounds float* %tmp24174, i64 1
+ %tmp24176 = getelementptr inbounds float* %tmp24175, i64 1
+ %tmp24177 = getelementptr inbounds float* %tmp24176, i64 1
+ %tmp24178 = getelementptr inbounds float* %tmp24177, i64 1
+ %tmp24179 = getelementptr inbounds float* %tmp24178, i64 1
+ %tmp24180 = getelementptr inbounds float* %tmp24179, i64 1
+ %tmp24181 = getelementptr inbounds float* %tmp24180, i64 1
+ %tmp24182 = getelementptr inbounds float* %tmp24181, i64 1
+ %tmp24183 = getelementptr inbounds float* %tmp24182, i64 1
+ %tmp24184 = getelementptr inbounds float* %tmp24183, i64 1
+ %tmp24185 = getelementptr inbounds float* %tmp24184, i64 1
+ %tmp24186 = getelementptr inbounds float* %tmp24185, i64 1
+ %tmp24187 = getelementptr inbounds float* %tmp24186, i64 1
+ %tmp24188 = getelementptr inbounds float* %tmp24187, i64 1
+ %tmp24189 = getelementptr inbounds float* %tmp24188, i64 1
+ %tmp24190 = getelementptr inbounds float* %tmp24189, i64 1
+ %tmp24191 = getelementptr inbounds float* %tmp24190, i64 1
+ %tmp24192 = getelementptr inbounds float* %tmp24191, i64 1
+ %tmp24193 = getelementptr inbounds float* %tmp24192, i64 1
+ %tmp24194 = getelementptr inbounds float* %tmp24193, i64 1
+ %tmp24195 = getelementptr inbounds float* %tmp24194, i64 1
+ %tmp24196 = getelementptr inbounds float* %tmp24195, i64 1
+ %tmp24197 = getelementptr inbounds float* %tmp24196, i64 1
+ %tmp24198 = getelementptr inbounds float* %tmp24197, i64 1
+ %tmp24199 = getelementptr inbounds float* %tmp24198, i64 1
+ %tmp24200 = getelementptr inbounds float* %tmp24199, i64 1
+ %tmp24201 = getelementptr inbounds float* %tmp24200, i64 1
+ %tmp24202 = getelementptr inbounds float* %tmp24201, i64 1
+ %tmp24203 = getelementptr inbounds float* %tmp24202, i64 1
+ %tmp24204 = getelementptr inbounds float* %tmp24203, i64 1
+ %tmp24205 = getelementptr inbounds float* %tmp24204, i64 1
+ %tmp24206 = getelementptr inbounds float* %tmp24205, i64 1
+ %tmp24207 = getelementptr inbounds float* %tmp24206, i64 1
+ %tmp24208 = getelementptr inbounds float* %tmp24207, i64 1
+ %tmp24209 = getelementptr inbounds float* %tmp24208, i64 1
+ %tmp24210 = getelementptr inbounds float* %tmp24209, i64 1
+ %tmp24211 = getelementptr inbounds float* %tmp24210, i64 1
+ %tmp24212 = getelementptr inbounds float* %tmp24211, i64 1
+ %tmp24213 = getelementptr inbounds float* %tmp24212, i64 1
+ %tmp24214 = getelementptr inbounds float* %tmp24213, i64 1
+ %tmp24215 = getelementptr inbounds float* %tmp24214, i64 1
+ %tmp24216 = getelementptr inbounds float* %tmp24215, i64 1
+ %tmp24217 = getelementptr inbounds float* %tmp24216, i64 1
+ %tmp24218 = getelementptr inbounds float* %tmp24217, i64 1
+ %tmp24219 = getelementptr inbounds float* %tmp24218, i64 1
+ %tmp24220 = getelementptr inbounds float* %tmp24219, i64 1
+ %tmp24221 = getelementptr inbounds float* %tmp24220, i64 1
+ %tmp24222 = getelementptr inbounds float* %tmp24221, i64 1
+ %tmp24223 = getelementptr inbounds float* %tmp24222, i64 1
+ %tmp24224 = getelementptr inbounds float* %tmp24223, i64 1
+ %tmp24225 = getelementptr inbounds float* %tmp24224, i64 1
+ %tmp24226 = getelementptr inbounds float* %tmp24225, i64 1
+ %tmp24227 = getelementptr inbounds float* %tmp24226, i64 1
+ %tmp24228 = getelementptr inbounds float* %tmp24227, i64 1
+ %tmp24229 = getelementptr inbounds float* %tmp24228, i64 1
+ %tmp24230 = getelementptr inbounds float* %tmp24229, i64 1
+ %tmp24231 = getelementptr inbounds float* %tmp24230, i64 1
+ %tmp24232 = getelementptr inbounds float* %tmp24231, i64 1
+ %tmp24233 = getelementptr inbounds float* %tmp24232, i64 1
+ %tmp24234 = getelementptr inbounds float* %tmp24233, i64 1
+ %tmp24235 = getelementptr inbounds float* %tmp24234, i64 1
+ %tmp24236 = getelementptr inbounds float* %tmp24235, i64 1
+ %tmp24237 = getelementptr inbounds float* %tmp24236, i64 1
+ %tmp24238 = getelementptr inbounds float* %tmp24237, i64 1
+ %tmp24239 = getelementptr inbounds float* %tmp24238, i64 1
+ %tmp24240 = getelementptr inbounds float* %tmp24239, i64 1
+ %tmp24241 = getelementptr inbounds float* %tmp24240, i64 1
+ %tmp24242 = getelementptr inbounds float* %tmp24241, i64 1
+ %tmp24243 = getelementptr inbounds float* %tmp24242, i64 1
+ %tmp24244 = getelementptr inbounds float* %tmp24243, i64 1
+ %tmp24245 = getelementptr inbounds float* %tmp24244, i64 1
+ %tmp24246 = getelementptr inbounds float* %tmp24245, i64 1
+ %tmp24247 = getelementptr inbounds float* %tmp24246, i64 1
+ %tmp24248 = getelementptr inbounds float* %tmp24247, i64 1
+ %tmp24249 = getelementptr inbounds float* %tmp24248, i64 1
+ %tmp24250 = getelementptr inbounds float* %tmp24249, i64 1
+ %tmp24251 = getelementptr inbounds float* %tmp24250, i64 1
+ %tmp24252 = getelementptr inbounds float* %tmp24251, i64 1
+ %tmp24253 = getelementptr inbounds float* %tmp24252, i64 1
+ %tmp24254 = getelementptr inbounds float* %tmp24253, i64 1
+ %tmp24255 = getelementptr inbounds float* %tmp24254, i64 1
+ %tmp24256 = getelementptr inbounds float* %tmp24255, i64 1
+ %tmp24257 = getelementptr inbounds float* %tmp24256, i64 1
+ %tmp24258 = getelementptr inbounds float* %tmp24257, i64 1
+ %tmp24259 = getelementptr inbounds float* %tmp24258, i64 1
+ %tmp24260 = getelementptr inbounds float* %tmp24259, i64 1
+ %tmp24261 = getelementptr inbounds float* %tmp24260, i64 1
+ %tmp24262 = getelementptr inbounds float* %tmp24261, i64 1
+ %tmp24263 = getelementptr inbounds float* %tmp24262, i64 1
+ %tmp24264 = getelementptr inbounds float* %tmp24263, i64 1
+ %tmp24265 = getelementptr inbounds float* %tmp24264, i64 1
+ %tmp24266 = getelementptr inbounds float* %tmp24265, i64 1
+ %tmp24267 = getelementptr inbounds float* %tmp24266, i64 1
+ %tmp24268 = getelementptr inbounds float* %tmp24267, i64 1
+ %tmp24269 = getelementptr inbounds float* %tmp24268, i64 1
+ %tmp24270 = getelementptr inbounds float* %tmp24269, i64 1
+ %tmp24271 = getelementptr inbounds float* %tmp24270, i64 1
+ %tmp24272 = getelementptr inbounds float* %tmp24271, i64 1
+ %tmp24273 = getelementptr inbounds float* %tmp24272, i64 1
+ %tmp24274 = getelementptr inbounds float* %tmp24273, i64 1
+ %tmp24275 = getelementptr inbounds float* %tmp24274, i64 1
+ %tmp24276 = getelementptr inbounds float* %tmp24275, i64 1
+ %tmp24277 = getelementptr inbounds float* %tmp24276, i64 1
+ %tmp24278 = getelementptr inbounds float* %tmp24277, i64 1
+ %tmp24279 = getelementptr inbounds float* %tmp24278, i64 1
+ %tmp24280 = getelementptr inbounds float* %tmp24279, i64 1
+ %tmp24281 = getelementptr inbounds float* %tmp24280, i64 1
+ %tmp24282 = getelementptr inbounds float* %tmp24281, i64 1
+ %tmp24283 = getelementptr inbounds float* %tmp24282, i64 1
+ %tmp24284 = getelementptr inbounds float* %tmp24283, i64 1
+ %tmp24285 = getelementptr inbounds float* %tmp24284, i64 1
+ %tmp24286 = getelementptr inbounds float* %tmp24285, i64 1
+ %tmp24287 = getelementptr inbounds float* %tmp24286, i64 1
+ %tmp24288 = getelementptr inbounds float* %tmp24287, i64 1
+ %tmp24289 = getelementptr inbounds float* %tmp24288, i64 1
+ %tmp24290 = getelementptr inbounds float* %tmp24289, i64 1
+ %tmp24291 = getelementptr inbounds float* %tmp24290, i64 1
+ %tmp24292 = getelementptr inbounds float* %tmp24291, i64 1
+ %tmp24293 = getelementptr inbounds float* %tmp24292, i64 1
+ %tmp24294 = getelementptr inbounds float* %tmp24293, i64 1
+ %tmp24295 = getelementptr inbounds float* %tmp24294, i64 1
+ %tmp24296 = getelementptr inbounds float* %tmp24295, i64 1
+ %tmp24297 = getelementptr inbounds float* %tmp24296, i64 1
+ %tmp24298 = getelementptr inbounds float* %tmp24297, i64 1
+ %tmp24299 = getelementptr inbounds float* %tmp24298, i64 1
+ %tmp24300 = getelementptr inbounds float* %tmp24299, i64 1
+ %tmp24301 = getelementptr inbounds float* %tmp24300, i64 1
+ %tmp24302 = getelementptr inbounds float* %tmp24301, i64 1
+ %tmp24303 = getelementptr inbounds float* %tmp24302, i64 1
+ %tmp24304 = getelementptr inbounds float* %tmp24303, i64 1
+ %tmp24305 = getelementptr inbounds float* %tmp24304, i64 1
+ %tmp24306 = getelementptr inbounds float* %tmp24305, i64 1
+ %tmp24307 = getelementptr inbounds float* %tmp24306, i64 1
+ %tmp24308 = getelementptr inbounds float* %tmp24307, i64 1
+ %tmp24309 = getelementptr inbounds float* %tmp24308, i64 1
+ %tmp24310 = getelementptr inbounds float* %tmp24309, i64 1
+ %tmp24311 = getelementptr inbounds float* %tmp24310, i64 1
+ %tmp24312 = getelementptr inbounds float* %tmp24311, i64 1
+ %tmp24313 = getelementptr inbounds float* %tmp24312, i64 1
+ %tmp24314 = getelementptr inbounds float* %tmp24313, i64 1
+ %tmp24315 = getelementptr inbounds float* %tmp24314, i64 1
+ %tmp24316 = getelementptr inbounds float* %tmp24315, i64 1
+ %tmp24317 = getelementptr inbounds float* %tmp24316, i64 1
+ %tmp24318 = getelementptr inbounds float* %tmp24317, i64 1
+ %tmp24319 = getelementptr inbounds float* %tmp24318, i64 1
+ %tmp24320 = getelementptr inbounds float* %tmp24319, i64 1
+ %tmp24321 = getelementptr inbounds float* %tmp24320, i64 1
+ %tmp24322 = getelementptr inbounds float* %tmp24321, i64 1
+ %tmp24323 = getelementptr inbounds float* %tmp24322, i64 1
+ %tmp24324 = getelementptr inbounds float* %tmp24323, i64 1
+ %tmp24325 = getelementptr inbounds float* %tmp24324, i64 1
+ %tmp24326 = getelementptr inbounds float* %tmp24325, i64 1
+ %tmp24327 = getelementptr inbounds float* %tmp24326, i64 1
+ %tmp24328 = getelementptr inbounds float* %tmp24327, i64 1
+ %tmp24329 = getelementptr inbounds float* %tmp24328, i64 1
+ %tmp24330 = getelementptr inbounds float* %tmp24329, i64 1
+ %tmp24331 = getelementptr inbounds float* %tmp24330, i64 1
+ %tmp24332 = getelementptr inbounds float* %tmp24331, i64 1
+ %tmp24333 = getelementptr inbounds float* %tmp24332, i64 1
+ %tmp24334 = getelementptr inbounds float* %tmp24333, i64 1
+ %tmp24335 = getelementptr inbounds float* %tmp24334, i64 1
+ %tmp24336 = getelementptr inbounds float* %tmp24335, i64 1
+ %tmp24337 = getelementptr inbounds float* %tmp24336, i64 1
+ %tmp24338 = getelementptr inbounds float* %tmp24337, i64 1
+ %tmp24339 = getelementptr inbounds float* %tmp24338, i64 1
+ %tmp24340 = getelementptr inbounds float* %tmp24339, i64 1
+ %tmp24341 = getelementptr inbounds float* %tmp24340, i64 1
+ %tmp24342 = getelementptr inbounds float* %tmp24341, i64 1
+ %tmp24343 = getelementptr inbounds float* %tmp24342, i64 1
+ %tmp24344 = getelementptr inbounds float* %tmp24343, i64 1
+ %tmp24345 = getelementptr inbounds float* %tmp24344, i64 1
+ %tmp24346 = getelementptr inbounds float* %tmp24345, i64 1
+ %tmp24347 = getelementptr inbounds float* %tmp24346, i64 1
+ %tmp24348 = getelementptr inbounds float* %tmp24347, i64 1
+ %tmp24349 = getelementptr inbounds float* %tmp24348, i64 1
+ %tmp24350 = getelementptr inbounds float* %tmp24349, i64 1
+ %tmp24351 = getelementptr inbounds float* %tmp24350, i64 1
+ %tmp24352 = getelementptr inbounds float* %tmp24351, i64 1
+ %tmp24353 = getelementptr inbounds float* %tmp24352, i64 1
+ %tmp24354 = getelementptr inbounds float* %tmp24353, i64 1
+ %tmp24355 = getelementptr inbounds float* %tmp24354, i64 1
+ %tmp24356 = getelementptr inbounds float* %tmp24355, i64 1
+ %tmp24357 = getelementptr inbounds float* %tmp24356, i64 1
+ %tmp24358 = getelementptr inbounds float* %tmp24357, i64 1
+ %tmp24359 = getelementptr inbounds float* %tmp24358, i64 1
+ %tmp24360 = getelementptr inbounds float* %tmp24359, i64 1
+ %tmp24361 = getelementptr inbounds float* %tmp24360, i64 1
+ %tmp24362 = getelementptr inbounds float* %tmp24361, i64 1
+ %tmp24363 = getelementptr inbounds float* %tmp24362, i64 1
+ %tmp24364 = getelementptr inbounds float* %tmp24363, i64 1
+ %tmp24365 = getelementptr inbounds float* %tmp24364, i64 1
+ %tmp24366 = getelementptr inbounds float* %tmp24365, i64 1
+ %tmp24367 = getelementptr inbounds float* %tmp24366, i64 1
+ %tmp24368 = getelementptr inbounds float* %tmp24367, i64 1
+ %tmp24369 = getelementptr inbounds float* %tmp24368, i64 1
+ %tmp24370 = getelementptr inbounds float* %tmp24369, i64 1
+ %tmp24371 = getelementptr inbounds float* %tmp24370, i64 1
+ %tmp24372 = getelementptr inbounds float* %tmp24371, i64 1
+ %tmp24373 = getelementptr inbounds float* %tmp24372, i64 1
+ %tmp24374 = getelementptr inbounds float* %tmp24373, i64 1
+ %tmp24375 = getelementptr inbounds float* %tmp24374, i64 1
+ %tmp24376 = getelementptr inbounds float* %tmp24375, i64 1
+ %tmp24377 = getelementptr inbounds float* %tmp24376, i64 1
+ %tmp24378 = getelementptr inbounds float* %tmp24377, i64 1
+ %tmp24379 = getelementptr inbounds float* %tmp24378, i64 1
+ %tmp24380 = getelementptr inbounds float* %tmp24379, i64 1
+ %tmp24381 = getelementptr inbounds float* %tmp24380, i64 1
+ %tmp24382 = getelementptr inbounds float* %tmp24381, i64 1
+ %tmp24383 = getelementptr inbounds float* %tmp24382, i64 1
+ %tmp24384 = getelementptr inbounds float* %tmp24383, i64 1
+ %tmp24385 = getelementptr inbounds float* %tmp24384, i64 1
+ %tmp24386 = getelementptr inbounds float* %tmp24385, i64 1
+ %tmp24387 = getelementptr inbounds float* %tmp24386, i64 1
+ %tmp24388 = getelementptr inbounds float* %tmp24387, i64 1
+ %tmp24389 = getelementptr inbounds float* %tmp24388, i64 1
+ %tmp24390 = getelementptr inbounds float* %tmp24389, i64 1
+ %tmp24391 = getelementptr inbounds float* %tmp24390, i64 1
+ %tmp24392 = getelementptr inbounds float* %tmp24391, i64 1
+ %tmp24393 = getelementptr inbounds float* %tmp24392, i64 1
+ %tmp24394 = getelementptr inbounds float* %tmp24393, i64 1
+ %tmp24395 = getelementptr inbounds float* %tmp24394, i64 1
+ %tmp24396 = getelementptr inbounds float* %tmp24395, i64 1
+ %tmp24397 = getelementptr inbounds float* %tmp24396, i64 1
+ %tmp24398 = getelementptr inbounds float* %tmp24397, i64 1
+ %tmp24399 = getelementptr inbounds float* %tmp24398, i64 1
+ %tmp24400 = getelementptr inbounds float* %tmp24399, i64 1
+ %tmp24401 = getelementptr inbounds float* %tmp24400, i64 1
+ %tmp24402 = getelementptr inbounds float* %tmp24401, i64 1
+ %tmp24403 = getelementptr inbounds float* %tmp24402, i64 1
+ %tmp24404 = getelementptr inbounds float* %tmp24403, i64 1
+ %tmp24405 = getelementptr inbounds float* %tmp24404, i64 1
+ %tmp24406 = getelementptr inbounds float* %tmp24405, i64 1
+ %tmp24407 = getelementptr inbounds float* %tmp24406, i64 1
+ %tmp24408 = getelementptr inbounds float* %tmp24407, i64 1
+ %tmp24409 = getelementptr inbounds float* %tmp24408, i64 1
+ %tmp24410 = getelementptr inbounds float* %tmp24409, i64 1
+ %tmp24411 = getelementptr inbounds float* %tmp24410, i64 1
+ %tmp24412 = getelementptr inbounds float* %tmp24411, i64 1
+ %tmp24413 = getelementptr inbounds float* %tmp24412, i64 1
+ %tmp24414 = getelementptr inbounds float* %tmp24413, i64 1
+ %tmp24415 = getelementptr inbounds float* %tmp24414, i64 1
+ %tmp24416 = getelementptr inbounds float* %tmp24415, i64 1
+ %tmp24417 = getelementptr inbounds float* %tmp24416, i64 1
+ %tmp24418 = getelementptr inbounds float* %tmp24417, i64 1
+ %tmp24419 = getelementptr inbounds float* %tmp24418, i64 1
+ %tmp24420 = getelementptr inbounds float* %tmp24419, i64 1
+ %tmp24421 = getelementptr inbounds float* %tmp24420, i64 1
+ %tmp24422 = getelementptr inbounds float* %tmp24421, i64 1
+ %tmp24423 = getelementptr inbounds float* %tmp24422, i64 1
+ %tmp24424 = getelementptr inbounds float* %tmp24423, i64 1
+ %tmp24425 = getelementptr inbounds float* %tmp24424, i64 1
+ %tmp24426 = getelementptr inbounds float* %tmp24425, i64 1
+ %tmp24427 = getelementptr inbounds float* %tmp24426, i64 1
+ %tmp24428 = getelementptr inbounds float* %tmp24427, i64 1
+ %tmp24429 = getelementptr inbounds float* %tmp24428, i64 1
+ %tmp24430 = getelementptr inbounds float* %tmp24429, i64 1
+ %tmp24431 = getelementptr inbounds float* %tmp24430, i64 1
+ %tmp24432 = getelementptr inbounds float* %tmp24431, i64 1
+ %tmp24433 = getelementptr inbounds float* %tmp24432, i64 1
+ %tmp24434 = getelementptr inbounds float* %tmp24433, i64 1
+ %tmp24435 = getelementptr inbounds float* %tmp24434, i64 1
+ %tmp24436 = getelementptr inbounds float* %tmp24435, i64 1
+ %tmp24437 = getelementptr inbounds float* %tmp24436, i64 1
+ %tmp24438 = getelementptr inbounds float* %tmp24437, i64 1
+ %tmp24439 = getelementptr inbounds float* %tmp24438, i64 1
+ %tmp24440 = getelementptr inbounds float* %tmp24439, i64 1
+ %tmp24441 = getelementptr inbounds float* %tmp24440, i64 1
+ %tmp24442 = getelementptr inbounds float* %tmp24441, i64 1
+ %tmp24443 = getelementptr inbounds float* %tmp24442, i64 1
+ %tmp24444 = getelementptr inbounds float* %tmp24443, i64 1
+ %tmp24445 = getelementptr inbounds float* %tmp24444, i64 1
+ %tmp24446 = getelementptr inbounds float* %tmp24445, i64 1
+ %tmp24447 = getelementptr inbounds float* %tmp24446, i64 1
+ %tmp24448 = getelementptr inbounds float* %tmp24447, i64 1
+ %tmp24449 = getelementptr inbounds float* %tmp24448, i64 1
+ %tmp24450 = getelementptr inbounds float* %tmp24449, i64 1
+ %tmp24451 = getelementptr inbounds float* %tmp24450, i64 1
+ %tmp24452 = getelementptr inbounds float* %tmp24451, i64 1
+ %tmp24453 = getelementptr inbounds float* %tmp24452, i64 1
+ %tmp24454 = getelementptr inbounds float* %tmp24453, i64 1
+ %tmp24455 = getelementptr inbounds float* %tmp24454, i64 1
+ %tmp24456 = getelementptr inbounds float* %tmp24455, i64 1
+ %tmp24457 = getelementptr inbounds float* %tmp24456, i64 1
+ %tmp24458 = getelementptr inbounds float* %tmp24457, i64 1
+ %tmp24459 = getelementptr inbounds float* %tmp24458, i64 1
+ %tmp24460 = getelementptr inbounds float* %tmp24459, i64 1
+ %tmp24461 = getelementptr inbounds float* %tmp24460, i64 1
+ %tmp24462 = getelementptr inbounds float* %tmp24461, i64 1
+ %tmp24463 = getelementptr inbounds float* %tmp24462, i64 1
+ %tmp24464 = getelementptr inbounds float* %tmp24463, i64 1
+ %tmp24465 = getelementptr inbounds float* %tmp24464, i64 1
+ %tmp24466 = getelementptr inbounds float* %tmp24465, i64 1
+ %tmp24467 = getelementptr inbounds float* %tmp24466, i64 1
+ %tmp24468 = getelementptr inbounds float* %tmp24467, i64 1
+ %tmp24469 = getelementptr inbounds float* %tmp24468, i64 1
+ %tmp24470 = getelementptr inbounds float* %tmp24469, i64 1
+ %tmp24471 = getelementptr inbounds float* %tmp24470, i64 1
+ %tmp24472 = getelementptr inbounds float* %tmp24471, i64 1
+ %tmp24473 = getelementptr inbounds float* %tmp24472, i64 1
+ %tmp24474 = getelementptr inbounds float* %tmp24473, i64 1
+ %tmp24475 = getelementptr inbounds float* %tmp24474, i64 1
+ %tmp24476 = getelementptr inbounds float* %tmp24475, i64 1
+ %tmp24477 = getelementptr inbounds float* %tmp24476, i64 1
+ %tmp24478 = getelementptr inbounds float* %tmp24477, i64 1
+ %tmp24479 = getelementptr inbounds float* %tmp24478, i64 1
+ %tmp24480 = getelementptr inbounds float* %tmp24479, i64 1
+ %tmp24481 = getelementptr inbounds float* %tmp24480, i64 1
+ %tmp24482 = getelementptr inbounds float* %tmp24481, i64 1
+ %tmp24483 = getelementptr inbounds float* %tmp24482, i64 1
+ %tmp24484 = getelementptr inbounds float* %tmp24483, i64 1
+ %tmp24485 = getelementptr inbounds float* %tmp24484, i64 1
+ %tmp24486 = getelementptr inbounds float* %tmp24485, i64 1
+ %tmp24487 = getelementptr inbounds float* %tmp24486, i64 1
+ %tmp24488 = getelementptr inbounds float* %tmp24487, i64 1
+ %tmp24489 = getelementptr inbounds float* %tmp24488, i64 1
+ %tmp24490 = getelementptr inbounds float* %tmp24489, i64 1
+ %tmp24491 = getelementptr inbounds float* %tmp24490, i64 1
+ %tmp24492 = getelementptr inbounds float* %tmp24491, i64 1
+ %tmp24493 = getelementptr inbounds float* %tmp24492, i64 1
+ %tmp24494 = getelementptr inbounds float* %tmp24493, i64 1
+ %tmp24495 = getelementptr inbounds float* %tmp24494, i64 1
+ %tmp24496 = getelementptr inbounds float* %tmp24495, i64 1
+ %tmp24497 = getelementptr inbounds float* %tmp24496, i64 1
+ %tmp24498 = getelementptr inbounds float* %tmp24497, i64 1
+ %tmp24499 = getelementptr inbounds float* %tmp24498, i64 1
+ %tmp24500 = getelementptr inbounds float* %tmp24499, i64 1
+ %tmp24501 = getelementptr inbounds float* %tmp24500, i64 1
+ %tmp24502 = getelementptr inbounds float* %tmp24501, i64 1
+ %tmp24503 = getelementptr inbounds float* %tmp24502, i64 1
+ %tmp24504 = getelementptr inbounds float* %tmp24503, i64 1
+ %tmp24505 = getelementptr inbounds float* %tmp24504, i64 1
+ %tmp24506 = getelementptr inbounds float* %tmp24505, i64 1
+ %tmp24507 = getelementptr inbounds float* %tmp24506, i64 1
+ %tmp24508 = getelementptr inbounds float* %tmp24507, i64 1
+ %tmp24509 = getelementptr inbounds float* %tmp24508, i64 1
+ %tmp24510 = getelementptr inbounds float* %tmp24509, i64 1
+ %tmp24511 = getelementptr inbounds float* %tmp24510, i64 1
+ %tmp24512 = getelementptr inbounds float* %tmp24511, i64 1
+ %tmp24513 = getelementptr inbounds float* %tmp24512, i64 1
+ %tmp24514 = getelementptr inbounds float* %tmp24513, i64 1
+ %tmp24515 = getelementptr inbounds float* %tmp24514, i64 1
+ %tmp24516 = getelementptr inbounds float* %tmp24515, i64 1
+ %tmp24517 = getelementptr inbounds float* %tmp24516, i64 1
+ %tmp24518 = getelementptr inbounds float* %tmp24517, i64 1
+ %tmp24519 = getelementptr inbounds float* %tmp24518, i64 1
+ %tmp24520 = getelementptr inbounds float* %tmp24519, i64 1
+ %tmp24521 = getelementptr inbounds float* %tmp24520, i64 1
+ %tmp24522 = getelementptr inbounds float* %tmp24521, i64 1
+ %tmp24523 = getelementptr inbounds float* %tmp24522, i64 1
+ %tmp24524 = getelementptr inbounds float* %tmp24523, i64 1
+ %tmp24525 = getelementptr inbounds float* %tmp24524, i64 1
+ %tmp24526 = getelementptr inbounds float* %tmp24525, i64 1
+ %tmp24527 = getelementptr inbounds float* %tmp24526, i64 1
+ %tmp24528 = getelementptr inbounds float* %tmp24527, i64 1
+ %tmp24529 = getelementptr inbounds float* %tmp24528, i64 1
+ %tmp24530 = getelementptr inbounds float* %tmp24529, i64 1
+ %tmp24531 = getelementptr inbounds float* %tmp24530, i64 1
+ %tmp24532 = getelementptr inbounds float* %tmp24531, i64 1
+ %tmp24533 = getelementptr inbounds float* %tmp24532, i64 1
+ %tmp24534 = getelementptr inbounds float* %tmp24533, i64 1
+ %tmp24535 = getelementptr inbounds float* %tmp24534, i64 1
+ %tmp24536 = getelementptr inbounds float* %tmp24535, i64 1
+ %tmp24537 = getelementptr inbounds float* %tmp24536, i64 1
+ %tmp24538 = getelementptr inbounds float* %tmp24537, i64 1
+ %tmp24539 = getelementptr inbounds float* %tmp24538, i64 1
+ %tmp24540 = getelementptr inbounds float* %tmp24539, i64 1
+ %tmp24541 = getelementptr inbounds float* %tmp24540, i64 1
+ %tmp24542 = getelementptr inbounds float* %tmp24541, i64 1
+ %tmp24543 = getelementptr inbounds float* %tmp24542, i64 1
+ %tmp24544 = getelementptr inbounds float* %tmp24543, i64 1
+ %tmp24545 = getelementptr inbounds float* %tmp24544, i64 1
+ %tmp24546 = getelementptr inbounds float* %tmp24545, i64 1
+ %tmp24547 = getelementptr inbounds float* %tmp24546, i64 1
+ %tmp24548 = getelementptr inbounds float* %tmp24547, i64 1
+ %tmp24549 = getelementptr inbounds float* %tmp24548, i64 1
+ %tmp24550 = getelementptr inbounds float* %tmp24549, i64 1
+ %tmp24551 = getelementptr inbounds float* %tmp24550, i64 1
+ %tmp24552 = getelementptr inbounds float* %tmp24551, i64 1
+ %tmp24553 = getelementptr inbounds float* %tmp24552, i64 1
+ %tmp24554 = getelementptr inbounds float* %tmp24553, i64 1
+ %tmp24555 = getelementptr inbounds float* %tmp24554, i64 1
+ %tmp24556 = getelementptr inbounds float* %tmp24555, i64 1
+ %tmp24557 = getelementptr inbounds float* %tmp24556, i64 1
+ %tmp24558 = getelementptr inbounds float* %tmp24557, i64 1
+ %tmp24559 = getelementptr inbounds float* %tmp24558, i64 1
+ %tmp24560 = getelementptr inbounds float* %tmp24559, i64 1
+ %tmp24561 = getelementptr inbounds float* %tmp24560, i64 1
+ %tmp24562 = getelementptr inbounds float* %tmp24561, i64 1
+ %tmp24563 = getelementptr inbounds float* %tmp24562, i64 1
+ %tmp24564 = getelementptr inbounds float* %tmp24563, i64 1
+ %tmp24565 = getelementptr inbounds float* %tmp24564, i64 1
+ %tmp24566 = getelementptr inbounds float* %tmp24565, i64 1
+ %tmp24567 = getelementptr inbounds float* %tmp24566, i64 1
+ %tmp24568 = getelementptr inbounds float* %tmp24567, i64 1
+ %tmp24569 = getelementptr inbounds float* %tmp24568, i64 1
+ %tmp24570 = getelementptr inbounds float* %tmp24569, i64 1
+ %tmp24571 = getelementptr inbounds float* %tmp24570, i64 1
+ %tmp24572 = getelementptr inbounds float* %tmp24571, i64 1
+ %tmp24573 = getelementptr inbounds float* %tmp24572, i64 1
+ %tmp24574 = getelementptr inbounds float* %tmp24573, i64 1
+ %tmp24575 = getelementptr inbounds float* %tmp24574, i64 1
+ %tmp24576 = getelementptr inbounds float* %tmp24575, i64 1
+ %tmp24577 = getelementptr inbounds float* %tmp24576, i64 1
+ %tmp24578 = getelementptr inbounds float* %tmp24577, i64 1
+ %tmp24579 = getelementptr inbounds float* %tmp24578, i64 1
+ %tmp24580 = getelementptr inbounds float* %tmp24579, i64 1
+ %tmp24581 = getelementptr inbounds float* %tmp24580, i64 1
+ %tmp24582 = getelementptr inbounds float* %tmp24581, i64 1
+ %tmp24583 = getelementptr inbounds float* %tmp24582, i64 1
+ %tmp24584 = getelementptr inbounds float* %tmp24583, i64 1
+ %tmp24585 = getelementptr inbounds float* %tmp24584, i64 1
+ %tmp24586 = getelementptr inbounds float* %tmp24585, i64 1
+ %tmp24587 = getelementptr inbounds float* %tmp24586, i64 1
+ %tmp24588 = getelementptr inbounds float* %tmp24587, i64 1
+ %tmp24589 = getelementptr inbounds float* %tmp24588, i64 1
+ %tmp24590 = getelementptr inbounds float* %tmp24589, i64 1
+ %tmp24591 = getelementptr inbounds float* %tmp24590, i64 1
+ %tmp24592 = getelementptr inbounds float* %tmp24591, i64 1
+ %tmp24593 = getelementptr inbounds float* %tmp24592, i64 1
+ %tmp24594 = getelementptr inbounds float* %tmp24593, i64 1
+ %tmp24595 = getelementptr inbounds float* %tmp24594, i64 1
+ %tmp24596 = getelementptr inbounds float* %tmp24595, i64 1
+ %tmp24597 = getelementptr inbounds float* %tmp24596, i64 1
+ %tmp24598 = getelementptr inbounds float* %tmp24597, i64 1
+ %tmp24599 = getelementptr inbounds float* %tmp24598, i64 1
+ %tmp24600 = getelementptr inbounds float* %tmp24599, i64 1
+ %tmp24601 = getelementptr inbounds float* %tmp24600, i64 1
+ %tmp24602 = getelementptr inbounds float* %tmp24601, i64 1
+ %tmp24603 = getelementptr inbounds float* %tmp24602, i64 1
+ %tmp24604 = getelementptr inbounds float* %tmp24603, i64 1
+ %tmp24605 = getelementptr inbounds float* %tmp24604, i64 1
+ %tmp24606 = getelementptr inbounds float* %tmp24605, i64 1
+ %tmp24607 = getelementptr inbounds float* %tmp24606, i64 1
+ %tmp24608 = getelementptr inbounds float* %tmp24607, i64 1
+ %tmp24609 = getelementptr inbounds float* %tmp24608, i64 1
+ %tmp24610 = getelementptr inbounds float* %tmp24609, i64 1
+ %tmp24611 = getelementptr inbounds float* %tmp24610, i64 1
+ %tmp24612 = getelementptr inbounds float* %tmp24611, i64 1
+ %tmp24613 = getelementptr inbounds float* %tmp24612, i64 1
+ %tmp24614 = getelementptr inbounds float* %tmp24613, i64 1
+ %tmp24615 = getelementptr inbounds float* %tmp24614, i64 1
+ %tmp24616 = getelementptr inbounds float* %tmp24615, i64 1
+ %tmp24617 = getelementptr inbounds float* %tmp24616, i64 1
+ %tmp24618 = getelementptr inbounds float* %tmp24617, i64 1
+ %tmp24619 = getelementptr inbounds float* %tmp24618, i64 1
+ %tmp24620 = getelementptr inbounds float* %tmp24619, i64 1
+ %tmp24621 = getelementptr inbounds float* %tmp24620, i64 1
+ %tmp24622 = getelementptr inbounds float* %tmp24621, i64 1
+ %tmp24623 = getelementptr inbounds float* %tmp24622, i64 1
+ %tmp24624 = getelementptr inbounds float* %tmp24623, i64 1
+ %tmp24625 = getelementptr inbounds float* %tmp24624, i64 1
+ %tmp24626 = getelementptr inbounds float* %tmp24625, i64 1
+ %tmp24627 = getelementptr inbounds float* %tmp24626, i64 1
+ %tmp24628 = getelementptr inbounds float* %tmp24627, i64 1
+ %tmp24629 = getelementptr inbounds float* %tmp24628, i64 1
+ %tmp24630 = getelementptr inbounds float* %tmp24629, i64 1
+ %tmp24631 = getelementptr inbounds float* %tmp24630, i64 1
+ %tmp24632 = getelementptr inbounds float* %tmp24631, i64 1
+ %tmp24633 = getelementptr inbounds float* %tmp24632, i64 1
+ %tmp24634 = getelementptr inbounds float* %tmp24633, i64 1
+ %tmp24635 = getelementptr inbounds float* %tmp24634, i64 1
+ %tmp24636 = getelementptr inbounds float* %tmp24635, i64 1
+ %tmp24637 = getelementptr inbounds float* %tmp24636, i64 1
+ %tmp24638 = getelementptr inbounds float* %tmp24637, i64 1
+ %tmp24639 = getelementptr inbounds float* %tmp24638, i64 1
+ %tmp24640 = getelementptr inbounds float* %tmp24639, i64 1
+ %tmp24641 = getelementptr inbounds float* %tmp24640, i64 1
+ %tmp24642 = getelementptr inbounds float* %tmp24641, i64 1
+ %tmp24643 = getelementptr inbounds float* %tmp24642, i64 1
+ %tmp24644 = getelementptr inbounds float* %tmp24643, i64 1
+ %tmp24645 = getelementptr inbounds float* %tmp24644, i64 1
+ %tmp24646 = getelementptr inbounds float* %tmp24645, i64 1
+ %tmp24647 = getelementptr inbounds float* %tmp24646, i64 1
+ %tmp24648 = getelementptr inbounds float* %tmp24647, i64 1
+ %tmp24649 = getelementptr inbounds float* %tmp24648, i64 1
+ %tmp24650 = getelementptr inbounds float* %tmp24649, i64 1
+ %tmp24651 = getelementptr inbounds float* %tmp24650, i64 1
+ %tmp24652 = getelementptr inbounds float* %tmp24651, i64 1
+ %tmp24653 = getelementptr inbounds float* %tmp24652, i64 1
+ %tmp24654 = getelementptr inbounds float* %tmp24653, i64 1
+ %tmp24655 = getelementptr inbounds float* %tmp24654, i64 1
+ %tmp24656 = getelementptr inbounds float* %tmp24655, i64 1
+ %tmp24657 = getelementptr inbounds float* %tmp24656, i64 1
+ %tmp24658 = getelementptr inbounds float* %tmp24657, i64 1
+ %tmp24659 = getelementptr inbounds float* %tmp24658, i64 1
+ %tmp24660 = getelementptr inbounds float* %tmp24659, i64 1
+ %tmp24661 = getelementptr inbounds float* %tmp24660, i64 1
+ %tmp24662 = getelementptr inbounds float* %tmp24661, i64 1
+ %tmp24663 = getelementptr inbounds float* %tmp24662, i64 1
+ %tmp24664 = getelementptr inbounds float* %tmp24663, i64 1
+ %tmp24665 = getelementptr inbounds float* %tmp24664, i64 1
+ %tmp24666 = getelementptr inbounds float* %tmp24665, i64 1
+ %tmp24667 = getelementptr inbounds float* %tmp24666, i64 1
+ %tmp24668 = getelementptr inbounds float* %tmp24667, i64 1
+ %tmp24669 = getelementptr inbounds float* %tmp24668, i64 1
+ %tmp24670 = getelementptr inbounds float* %tmp24669, i64 1
+ %tmp24671 = getelementptr inbounds float* %tmp24670, i64 1
+ %tmp24672 = getelementptr inbounds float* %tmp24671, i64 1
+ %tmp24673 = getelementptr inbounds float* %tmp24672, i64 1
+ %tmp24674 = getelementptr inbounds float* %tmp24673, i64 1
+ %tmp24675 = getelementptr inbounds float* %tmp24674, i64 1
+ %tmp24676 = getelementptr inbounds float* %tmp24675, i64 1
+ %tmp24677 = getelementptr inbounds float* %tmp24676, i64 1
+ %tmp24678 = getelementptr inbounds float* %tmp24677, i64 1
+ %tmp24679 = getelementptr inbounds float* %tmp24678, i64 1
+ %tmp24680 = getelementptr inbounds float* %tmp24679, i64 1
+ %tmp24681 = getelementptr inbounds float* %tmp24680, i64 1
+ %tmp24682 = getelementptr inbounds float* %tmp24681, i64 1
+ %tmp24683 = getelementptr inbounds float* %tmp24682, i64 1
+ %tmp24684 = getelementptr inbounds float* %tmp24683, i64 1
+ %tmp24685 = getelementptr inbounds float* %tmp24684, i64 1
+ %tmp24686 = getelementptr inbounds float* %tmp24685, i64 1
+ %tmp24687 = getelementptr inbounds float* %tmp24686, i64 1
+ %tmp24688 = getelementptr inbounds float* %tmp24687, i64 1
+ %tmp24689 = getelementptr inbounds float* %tmp24688, i64 1
+ %tmp24690 = getelementptr inbounds float* %tmp24689, i64 1
+ %tmp24691 = getelementptr inbounds float* %tmp24690, i64 1
+ %tmp24692 = getelementptr inbounds float* %tmp24691, i64 1
+ %tmp24693 = getelementptr inbounds float* %tmp24692, i64 1
+ %tmp24694 = getelementptr inbounds float* %tmp24693, i64 1
+ %tmp24695 = getelementptr inbounds float* %tmp24694, i64 1
+ %tmp24696 = getelementptr inbounds float* %tmp24695, i64 1
+ %tmp24697 = getelementptr inbounds float* %tmp24696, i64 1
+ %tmp24698 = getelementptr inbounds float* %tmp24697, i64 1
+ %tmp24699 = getelementptr inbounds float* %tmp24698, i64 1
+ %tmp24700 = getelementptr inbounds float* %tmp24699, i64 1
+ %tmp24701 = getelementptr inbounds float* %tmp24700, i64 1
+ %tmp24702 = getelementptr inbounds float* %tmp24701, i64 1
+ %tmp24703 = getelementptr inbounds float* %tmp24702, i64 1
+ %tmp24704 = getelementptr inbounds float* %tmp24703, i64 1
+ %tmp24705 = getelementptr inbounds float* %tmp24704, i64 1
+ %tmp24706 = getelementptr inbounds float* %tmp24705, i64 1
+ %tmp24707 = getelementptr inbounds float* %tmp24706, i64 1
+ %tmp24708 = getelementptr inbounds float* %tmp24707, i64 1
+ %tmp24709 = getelementptr inbounds float* %tmp24708, i64 1
+ %tmp24710 = getelementptr inbounds float* %tmp24709, i64 1
+ %tmp24711 = getelementptr inbounds float* %tmp24710, i64 1
+ %tmp24712 = getelementptr inbounds float* %tmp24711, i64 1
+ %tmp24713 = getelementptr inbounds float* %tmp24712, i64 1
+ %tmp24714 = getelementptr inbounds float* %tmp24713, i64 1
+ %tmp24715 = getelementptr inbounds float* %tmp24714, i64 1
+ %tmp24716 = getelementptr inbounds float* %tmp24715, i64 1
+ %tmp24717 = getelementptr inbounds float* %tmp24716, i64 1
+ %tmp24718 = getelementptr inbounds float* %tmp24717, i64 1
+ %tmp24719 = getelementptr inbounds float* %tmp24718, i64 1
+ %tmp24720 = getelementptr inbounds float* %tmp24719, i64 1
+ %tmp24721 = getelementptr inbounds float* %tmp24720, i64 1
+ %tmp24722 = getelementptr inbounds float* %tmp24721, i64 1
+ %tmp24723 = getelementptr inbounds float* %tmp24722, i64 1
+ %tmp24724 = getelementptr inbounds float* %tmp24723, i64 1
+ %tmp24725 = getelementptr inbounds float* %tmp24724, i64 1
+ %tmp24726 = getelementptr inbounds float* %tmp24725, i64 1
+ %tmp24727 = getelementptr inbounds float* %tmp24726, i64 1
+ %tmp24728 = getelementptr inbounds float* %tmp24727, i64 1
+ %tmp24729 = getelementptr inbounds float* %tmp24728, i64 1
+ %tmp24730 = getelementptr inbounds float* %tmp24729, i64 1
+ %tmp24731 = getelementptr inbounds float* %tmp24730, i64 1
+ %tmp24732 = getelementptr inbounds float* %tmp24731, i64 1
+ %tmp24733 = getelementptr inbounds float* %tmp24732, i64 1
+ %tmp24734 = getelementptr inbounds float* %tmp24733, i64 1
+ %tmp24735 = getelementptr inbounds float* %tmp24734, i64 1
+ %tmp24736 = getelementptr inbounds float* %tmp24735, i64 1
+ %tmp24737 = getelementptr inbounds float* %tmp24736, i64 1
+ %tmp24738 = getelementptr inbounds float* %tmp24737, i64 1
+ %tmp24739 = getelementptr inbounds float* %tmp24738, i64 1
+ %tmp24740 = getelementptr inbounds float* %tmp24739, i64 1
+ %tmp24741 = getelementptr inbounds float* %tmp24740, i64 1
+ %tmp24742 = getelementptr inbounds float* %tmp24741, i64 1
+ %tmp24743 = getelementptr inbounds float* %tmp24742, i64 1
+ %tmp24744 = getelementptr inbounds float* %tmp24743, i64 1
+ %tmp24745 = getelementptr inbounds float* %tmp24744, i64 1
+ %tmp24746 = getelementptr inbounds float* %tmp24745, i64 1
+ %tmp24747 = getelementptr inbounds float* %tmp24746, i64 1
+ %tmp24748 = getelementptr inbounds float* %tmp24747, i64 1
+ %tmp24749 = getelementptr inbounds float* %tmp24748, i64 1
+ %tmp24750 = getelementptr inbounds float* %tmp24749, i64 1
+ %tmp24751 = getelementptr inbounds float* %tmp24750, i64 1
+ %tmp24752 = getelementptr inbounds float* %tmp24751, i64 1
+ %tmp24753 = getelementptr inbounds float* %tmp24752, i64 1
+ %tmp24754 = getelementptr inbounds float* %tmp24753, i64 1
+ %tmp24755 = getelementptr inbounds float* %tmp24754, i64 1
+ %tmp24756 = getelementptr inbounds float* %tmp24755, i64 1
+ %tmp24757 = getelementptr inbounds float* %tmp24756, i64 1
+ %tmp24758 = getelementptr inbounds float* %tmp24757, i64 1
+ %tmp24759 = getelementptr inbounds float* %tmp24758, i64 1
+ %tmp24760 = getelementptr inbounds float* %tmp24759, i64 1
+ %tmp24761 = getelementptr inbounds float* %tmp24760, i64 1
+ %tmp24762 = getelementptr inbounds float* %tmp24761, i64 1
+ %tmp24763 = getelementptr inbounds float* %tmp24762, i64 1
+ %tmp24764 = getelementptr inbounds float* %tmp24763, i64 1
+ %tmp24765 = getelementptr inbounds float* %tmp24764, i64 1
+ %tmp24766 = getelementptr inbounds float* %tmp24765, i64 1
+ %tmp24767 = getelementptr inbounds float* %tmp24766, i64 1
+ %tmp24768 = getelementptr inbounds float* %tmp24767, i64 1
+ %tmp24769 = getelementptr inbounds float* %tmp24768, i64 1
+ %tmp24770 = getelementptr inbounds float* %tmp24769, i64 1
+ %tmp24771 = getelementptr inbounds float* %tmp24770, i64 1
+ %tmp24772 = getelementptr inbounds float* %tmp24771, i64 1
+ %tmp24773 = getelementptr inbounds float* %tmp24772, i64 1
+ %tmp24774 = getelementptr inbounds float* %tmp24773, i64 1
+ %tmp24775 = getelementptr inbounds float* %tmp24774, i64 1
+ %tmp24776 = getelementptr inbounds float* %tmp24775, i64 1
+ %tmp24777 = getelementptr inbounds float* %tmp24776, i64 1
+ %tmp24778 = getelementptr inbounds float* %tmp24777, i64 1
+ %tmp24779 = getelementptr inbounds float* %tmp24778, i64 1
+ %tmp24780 = getelementptr inbounds float* %tmp24779, i64 1
+ %tmp24781 = getelementptr inbounds float* %tmp24780, i64 1
+ %tmp24782 = getelementptr inbounds float* %tmp24781, i64 1
+ %tmp24783 = getelementptr inbounds float* %tmp24782, i64 1
+ %tmp24784 = getelementptr inbounds float* %tmp24783, i64 1
+ %tmp24785 = getelementptr inbounds float* %tmp24784, i64 1
+ %tmp24786 = getelementptr inbounds float* %tmp24785, i64 1
+ %tmp24787 = getelementptr inbounds float* %tmp24786, i64 1
+ %tmp24788 = getelementptr inbounds float* %tmp24787, i64 1
+ %tmp24789 = getelementptr inbounds float* %tmp24788, i64 1
+ %tmp24790 = getelementptr inbounds float* %tmp24789, i64 1
+ %tmp24791 = getelementptr inbounds float* %tmp24790, i64 1
+ %tmp24792 = getelementptr inbounds float* %tmp24791, i64 1
+ %tmp24793 = getelementptr inbounds float* %tmp24792, i64 1
+ %tmp24794 = getelementptr inbounds float* %tmp24793, i64 1
+ %tmp24795 = getelementptr inbounds float* %tmp24794, i64 1
+ %tmp24796 = getelementptr inbounds float* %tmp24795, i64 1
+ %tmp24797 = getelementptr inbounds float* %tmp24796, i64 1
+ %tmp24798 = getelementptr inbounds float* %tmp24797, i64 1
+ %tmp24799 = getelementptr inbounds float* %tmp24798, i64 1
+ %tmp24800 = getelementptr inbounds float* %tmp24799, i64 1
+ %tmp24801 = getelementptr inbounds float* %tmp24800, i64 1
+ %tmp24802 = getelementptr inbounds float* %tmp24801, i64 1
+ %tmp24803 = getelementptr inbounds float* %tmp24802, i64 1
+ %tmp24804 = getelementptr inbounds float* %tmp24803, i64 1
+ %tmp24805 = getelementptr inbounds float* %tmp24804, i64 1
+ %tmp24806 = getelementptr inbounds float* %tmp24805, i64 1
+ %tmp24807 = getelementptr inbounds float* %tmp24806, i64 1
+ %tmp24808 = getelementptr inbounds float* %tmp24807, i64 1
+ %tmp24809 = getelementptr inbounds float* %tmp24808, i64 1
+ %tmp24810 = getelementptr inbounds float* %tmp24809, i64 1
+ %tmp24811 = getelementptr inbounds float* %tmp24810, i64 1
+ %tmp24812 = getelementptr inbounds float* %tmp24811, i64 1
+ %tmp24813 = getelementptr inbounds float* %tmp24812, i64 1
+ %tmp24814 = getelementptr inbounds float* %tmp24813, i64 1
+ %tmp24815 = getelementptr inbounds float* %tmp24814, i64 1
+ %tmp24816 = getelementptr inbounds float* %tmp24815, i64 1
+ %tmp24817 = getelementptr inbounds float* %tmp24816, i64 1
+ %tmp24818 = getelementptr inbounds float* %tmp24817, i64 1
+ %tmp24819 = getelementptr inbounds float* %tmp24818, i64 1
+ %tmp24820 = getelementptr inbounds float* %tmp24819, i64 1
+ %tmp24821 = getelementptr inbounds float* %tmp24820, i64 1
+ %tmp24822 = getelementptr inbounds float* %tmp24821, i64 1
+ %tmp24823 = getelementptr inbounds float* %tmp24822, i64 1
+ %tmp24824 = getelementptr inbounds float* %tmp24823, i64 1
+ %tmp24825 = getelementptr inbounds float* %tmp24824, i64 1
+ %tmp24826 = getelementptr inbounds float* %tmp24825, i64 1
+ %tmp24827 = getelementptr inbounds float* %tmp24826, i64 1
+ %tmp24828 = getelementptr inbounds float* %tmp24827, i64 1
+ %tmp24829 = getelementptr inbounds float* %tmp24828, i64 1
+ %tmp24830 = getelementptr inbounds float* %tmp24829, i64 1
+ %tmp24831 = getelementptr inbounds float* %tmp24830, i64 1
+ %tmp24832 = getelementptr inbounds float* %tmp24831, i64 1
+ %tmp24833 = getelementptr inbounds float* %tmp24832, i64 1
+ %tmp24834 = getelementptr inbounds float* %tmp24833, i64 1
+ %tmp24835 = getelementptr inbounds float* %tmp24834, i64 1
+ %tmp24836 = getelementptr inbounds float* %tmp24835, i64 1
+ %tmp24837 = getelementptr inbounds float* %tmp24836, i64 1
+ %tmp24838 = getelementptr inbounds float* %tmp24837, i64 1
+ %tmp24839 = getelementptr inbounds float* %tmp24838, i64 1
+ %tmp24840 = getelementptr inbounds float* %tmp24839, i64 1
+ %tmp24841 = getelementptr inbounds float* %tmp24840, i64 1
+ %tmp24842 = getelementptr inbounds float* %tmp24841, i64 1
+ %tmp24843 = getelementptr inbounds float* %tmp24842, i64 1
+ %tmp24844 = getelementptr inbounds float* %tmp24843, i64 1
+ %tmp24845 = getelementptr inbounds float* %tmp24844, i64 1
+ %tmp24846 = getelementptr inbounds float* %tmp24845, i64 1
+ %tmp24847 = getelementptr inbounds float* %tmp24846, i64 1
+ %tmp24848 = getelementptr inbounds float* %tmp24847, i64 1
+ %tmp24849 = getelementptr inbounds float* %tmp24848, i64 1
+ %tmp24850 = getelementptr inbounds float* %tmp24849, i64 1
+ %tmp24851 = getelementptr inbounds float* %tmp24850, i64 1
+ %tmp24852 = getelementptr inbounds float* %tmp24851, i64 1
+ %tmp24853 = getelementptr inbounds float* %tmp24852, i64 1
+ %tmp24854 = getelementptr inbounds float* %tmp24853, i64 1
+ %tmp24855 = getelementptr inbounds float* %tmp24854, i64 1
+ %tmp24856 = getelementptr inbounds float* %tmp24855, i64 1
+ %tmp24857 = getelementptr inbounds float* %tmp24856, i64 1
+ %tmp24858 = getelementptr inbounds float* %tmp24857, i64 1
+ %tmp24859 = getelementptr inbounds float* %tmp24858, i64 1
+ %tmp24860 = getelementptr inbounds float* %tmp24859, i64 1
+ %tmp24861 = getelementptr inbounds float* %tmp24860, i64 1
+ %tmp24862 = getelementptr inbounds float* %tmp24861, i64 1
+ %tmp24863 = getelementptr inbounds float* %tmp24862, i64 1
+ %tmp24864 = getelementptr inbounds float* %tmp24863, i64 1
+ %tmp24865 = getelementptr inbounds float* %tmp24864, i64 1
+ %tmp24866 = getelementptr inbounds float* %tmp24865, i64 1
+ %tmp24867 = getelementptr inbounds float* %tmp24866, i64 1
+ %tmp24868 = getelementptr inbounds float* %tmp24867, i64 1
+ %tmp24869 = getelementptr inbounds float* %tmp24868, i64 1
+ %tmp24870 = getelementptr inbounds float* %tmp24869, i64 1
+ %tmp24871 = getelementptr inbounds float* %tmp24870, i64 1
+ %tmp24872 = getelementptr inbounds float* %tmp24871, i64 1
+ %tmp24873 = getelementptr inbounds float* %tmp24872, i64 1
+ %tmp24874 = getelementptr inbounds float* %tmp24873, i64 1
+ %tmp24875 = getelementptr inbounds float* %tmp24874, i64 1
+ %tmp24876 = getelementptr inbounds float* %tmp24875, i64 1
+ %tmp24877 = getelementptr inbounds float* %tmp24876, i64 1
+ %tmp24878 = getelementptr inbounds float* %tmp24877, i64 1
+ %tmp24879 = getelementptr inbounds float* %tmp24878, i64 1
+ %tmp24880 = getelementptr inbounds float* %tmp24879, i64 1
+ %tmp24881 = getelementptr inbounds float* %tmp24880, i64 1
+ %tmp24882 = getelementptr inbounds float* %tmp24881, i64 1
+ %tmp24883 = getelementptr inbounds float* %tmp24882, i64 1
+ %tmp24884 = getelementptr inbounds float* %tmp24883, i64 1
+ %tmp24885 = getelementptr inbounds float* %tmp24884, i64 1
+ %tmp24886 = getelementptr inbounds float* %tmp24885, i64 1
+ %tmp24887 = getelementptr inbounds float* %tmp24886, i64 1
+ %tmp24888 = getelementptr inbounds float* %tmp24887, i64 1
+ %tmp24889 = getelementptr inbounds float* %tmp24888, i64 1
+ %tmp24890 = getelementptr inbounds float* %tmp24889, i64 1
+ %tmp24891 = getelementptr inbounds float* %tmp24890, i64 1
+ %tmp24892 = getelementptr inbounds float* %tmp24891, i64 1
+ %tmp24893 = getelementptr inbounds float* %tmp24892, i64 1
+ %tmp24894 = getelementptr inbounds float* %tmp24893, i64 1
+ %tmp24895 = getelementptr inbounds float* %tmp24894, i64 1
+ %tmp24896 = getelementptr inbounds float* %tmp24895, i64 1
+ %tmp24897 = getelementptr inbounds float* %tmp24896, i64 1
+ %tmp24898 = getelementptr inbounds float* %tmp24897, i64 1
+ %tmp24899 = getelementptr inbounds float* %tmp24898, i64 1
+ %tmp24900 = getelementptr inbounds float* %tmp24899, i64 1
+ %tmp24901 = getelementptr inbounds float* %tmp24900, i64 1
+ %tmp24902 = getelementptr inbounds float* %tmp24901, i64 1
+ %tmp24903 = getelementptr inbounds float* %tmp24902, i64 1
+ %tmp24904 = getelementptr inbounds float* %tmp24903, i64 1
+ %tmp24905 = getelementptr inbounds float* %tmp24904, i64 1
+ %tmp24906 = getelementptr inbounds float* %tmp24905, i64 1
+ %tmp24907 = getelementptr inbounds float* %tmp24906, i64 1
+ %tmp24908 = getelementptr inbounds float* %tmp24907, i64 1
+ %tmp24909 = getelementptr inbounds float* %tmp24908, i64 1
+ %tmp24910 = getelementptr inbounds float* %tmp24909, i64 1
+ %tmp24911 = getelementptr inbounds float* %tmp24910, i64 1
+ %tmp24912 = getelementptr inbounds float* %tmp24911, i64 1
+ %tmp24913 = getelementptr inbounds float* %tmp24912, i64 1
+ %tmp24914 = getelementptr inbounds float* %tmp24913, i64 1
+ %tmp24915 = getelementptr inbounds float* %tmp24914, i64 1
+ %tmp24916 = getelementptr inbounds float* %tmp24915, i64 1
+ %tmp24917 = getelementptr inbounds float* %tmp24916, i64 1
+ %tmp24918 = getelementptr inbounds float* %tmp24917, i64 1
+ %tmp24919 = getelementptr inbounds float* %tmp24918, i64 1
+ %tmp24920 = getelementptr inbounds float* %tmp24919, i64 1
+ %tmp24921 = getelementptr inbounds float* %tmp24920, i64 1
+ %tmp24922 = getelementptr inbounds float* %tmp24921, i64 1
+ %tmp24923 = getelementptr inbounds float* %tmp24922, i64 1
+ %tmp24924 = getelementptr inbounds float* %tmp24923, i64 1
+ %tmp24925 = getelementptr inbounds float* %tmp24924, i64 1
+ %tmp24926 = getelementptr inbounds float* %tmp24925, i64 1
+ %tmp24927 = getelementptr inbounds float* %tmp24926, i64 1
+ %tmp24928 = getelementptr inbounds float* %tmp24927, i64 1
+ %tmp24929 = getelementptr inbounds float* %tmp24928, i64 1
+ %tmp24930 = getelementptr inbounds float* %tmp24929, i64 1
+ %tmp24931 = getelementptr inbounds float* %tmp24930, i64 1
+ %tmp24932 = getelementptr inbounds float* %tmp24931, i64 1
+ %tmp24933 = getelementptr inbounds float* %tmp24932, i64 1
+ %tmp24934 = getelementptr inbounds float* %tmp24933, i64 1
+ %tmp24935 = getelementptr inbounds float* %tmp24934, i64 1
+ %tmp24936 = getelementptr inbounds float* %tmp24935, i64 1
+ %tmp24937 = getelementptr inbounds float* %tmp24936, i64 1
+ %tmp24938 = getelementptr inbounds float* %tmp24937, i64 1
+ %tmp24939 = getelementptr inbounds float* %tmp24938, i64 1
+ %tmp24940 = getelementptr inbounds float* %tmp24939, i64 1
+ %tmp24941 = getelementptr inbounds float* %tmp24940, i64 1
+ %tmp24942 = getelementptr inbounds float* %tmp24941, i64 1
+ %tmp24943 = getelementptr inbounds float* %tmp24942, i64 1
+ %tmp24944 = getelementptr inbounds float* %tmp24943, i64 1
+ %tmp24945 = getelementptr inbounds float* %tmp24944, i64 1
+ %tmp24946 = getelementptr inbounds float* %tmp24945, i64 1
+ store float 0x3F43FD0D00000000, float* %tmp24946
+ %tmp24947 = getelementptr inbounds float* undef, i64 1
+ %tmp24948 = getelementptr inbounds float* undef, i64 1
+ %tmp24949 = getelementptr inbounds float* undef, i64 1
+ %tmp24950 = getelementptr inbounds float* undef, i64 1
+ %tmp24951 = getelementptr inbounds float* %tmp24950, i64 1
+ %tmp24952 = getelementptr inbounds float* undef, i64 1
+ %tmp24953 = getelementptr inbounds float* undef, i64 1
+ %tmp24954 = getelementptr inbounds float* undef, i64 1
+ %tmp24955 = getelementptr inbounds float* undef, i64 1
+ %tmp24956 = getelementptr inbounds float* undef, i64 1
+ %tmp24957 = getelementptr inbounds float* undef, i64 1
+ %tmp24958 = getelementptr inbounds float* %tmp24957, i64 1
+ %tmp24959 = getelementptr inbounds float* undef, i64 1
+ %tmp24960 = getelementptr inbounds float* undef, i64 1
+ %tmp24961 = getelementptr inbounds float* undef, i64 1
+ %tmp24962 = getelementptr inbounds float* undef, i64 1
+ %tmp24963 = getelementptr inbounds float* undef, i64 1
+ %tmp24964 = getelementptr inbounds float* undef, i64 1
+ %tmp24965 = getelementptr inbounds float* undef, i64 1
+ %tmp24966 = getelementptr inbounds float* %tmp24965, i64 1
+ %tmp24967 = getelementptr inbounds float* undef, i64 1
+ %tmp24968 = getelementptr inbounds float* undef, i64 1
+ %tmp24969 = getelementptr inbounds float* undef, i64 1
+ %tmp24970 = getelementptr inbounds float* undef, i64 1
+ %tmp24971 = getelementptr inbounds float* %tmp24970, i64 1
+ %tmp24972 = getelementptr inbounds float* %tmp24971, i64 1
+ %tmp24973 = getelementptr inbounds float* %tmp24972, i64 1
+ %tmp24974 = getelementptr inbounds float* undef, i64 1
+ %tmp24975 = getelementptr inbounds float* undef, i64 1
+ %tmp24976 = getelementptr inbounds float* %tmp24975, i64 1
+ %tmp24977 = getelementptr inbounds float* undef, i64 1
+ %tmp24978 = getelementptr inbounds float* undef, i64 1
+ %tmp24979 = getelementptr inbounds float* undef, i64 1
+ %tmp24980 = getelementptr inbounds float* undef, i64 1
+ %tmp24981 = getelementptr inbounds float* undef, i64 1
+ %tmp24982 = getelementptr inbounds float* undef, i64 1
+ %tmp24983 = getelementptr inbounds float* %tmp24982, i64 1
+ %tmp24984 = getelementptr inbounds float* undef, i64 1
+ %tmp24985 = getelementptr inbounds float* %tmp24984, i64 1
+ %tmp24986 = getelementptr inbounds float* undef, i64 1
+ %tmp24987 = getelementptr inbounds float* %tmp24986, i64 1
+ %tmp24988 = getelementptr inbounds float* %tmp24987, i64 1
+ %tmp24989 = getelementptr inbounds float* undef, i64 1
+ %tmp24990 = getelementptr inbounds float* undef, i64 1
+ %tmp24991 = getelementptr inbounds float* %tmp24990, i64 1
+ %tmp24992 = getelementptr inbounds float* undef, i64 1
+ %tmp24993 = getelementptr inbounds float* %tmp24992, i64 1
+ %tmp24994 = getelementptr inbounds float* %tmp24993, i64 1
+ %tmp24995 = getelementptr inbounds float* undef, i64 1
+ %tmp24996 = getelementptr inbounds float* undef, i64 1
+ %tmp24997 = getelementptr inbounds float* undef, i64 1
+ %tmp24998 = getelementptr inbounds float* undef, i64 1
+ %tmp24999 = getelementptr inbounds float* undef, i64 1
+ %tmp25000 = getelementptr inbounds float* undef, i64 1
+ %tmp25001 = getelementptr inbounds float* undef, i64 1
+ %tmp25002 = getelementptr inbounds float* undef, i64 1
+ %tmp25003 = getelementptr inbounds float* undef, i64 1
+ %tmp25004 = getelementptr inbounds float* undef, i64 1
+ %tmp25005 = getelementptr inbounds float* undef, i64 1
+ %tmp25006 = getelementptr inbounds float* undef, i64 1
+ %tmp25007 = getelementptr inbounds float* undef, i64 1
+ %tmp25008 = getelementptr inbounds float* undef, i64 1
+ %tmp25009 = getelementptr inbounds float* undef, i64 1
+ %tmp25010 = getelementptr inbounds float* undef, i64 1
+ %tmp25011 = getelementptr inbounds float* undef, i64 1
+ %tmp25012 = getelementptr inbounds float* %tmp25011, i64 1
+ %tmp25013 = getelementptr inbounds float* undef, i64 1
+ %tmp25014 = getelementptr inbounds float* undef, i64 1
+ %tmp25015 = getelementptr inbounds float* undef, i64 1
+ %tmp25016 = getelementptr inbounds float* undef, i64 1
+ %tmp25017 = getelementptr inbounds float* %tmp25016, i64 1
+ %tmp25018 = getelementptr inbounds float* undef, i64 1
+ %tmp25019 = getelementptr inbounds float* undef, i64 1
+ %tmp25020 = getelementptr inbounds float* undef, i64 1
+ %tmp25021 = getelementptr inbounds float* undef, i64 1
+ %tmp25022 = getelementptr inbounds float* undef, i64 1
+ %tmp25023 = getelementptr inbounds float* %tmp25022, i64 1
+ %tmp25024 = getelementptr inbounds float* %tmp25023, i64 1
+ %tmp25025 = getelementptr inbounds float* undef, i64 1
+ %tmp25026 = getelementptr inbounds float* undef, i64 1
+ %tmp25027 = getelementptr inbounds float* undef, i64 1
+ %tmp25028 = getelementptr inbounds float* undef, i64 1
+ %tmp25029 = getelementptr inbounds float* undef, i64 1
+ %tmp25030 = getelementptr inbounds float* undef, i64 1
+ %tmp25031 = getelementptr inbounds float* undef, i64 1
+ %tmp25032 = getelementptr inbounds float* undef, i64 1
+ %tmp25033 = getelementptr inbounds float* undef, i64 1
+ %tmp25034 = getelementptr inbounds float* undef, i64 1
+ %tmp25035 = getelementptr inbounds float* %tmp25034, i64 1
+ %tmp25036 = getelementptr inbounds float* undef, i64 1
+ %tmp25037 = getelementptr inbounds float* undef, i64 1
+ %tmp25038 = getelementptr inbounds float* %tmp25037, i64 1
+ %tmp25039 = getelementptr inbounds float* undef, i64 1
+ %tmp25040 = getelementptr inbounds float* undef, i64 1
+ %tmp25041 = getelementptr inbounds float* undef, i64 1
+ %tmp25042 = getelementptr inbounds float* undef, i64 1
+ %tmp25043 = getelementptr inbounds float* undef, i64 1
+ %tmp25044 = getelementptr inbounds float* undef, i64 1
+ %tmp25045 = getelementptr inbounds float* %tmp25044, i64 1
+ %tmp25046 = getelementptr inbounds float* undef, i64 1
+ %tmp25047 = getelementptr inbounds float* %tmp25046, i64 1
+ %tmp25048 = getelementptr inbounds float* undef, i64 1
+ %tmp25049 = getelementptr inbounds float* %tmp25048, i64 1
+ %tmp25050 = getelementptr inbounds float* %tmp25049, i64 1
+ %tmp25051 = getelementptr inbounds float* undef, i64 1
+ %tmp25052 = getelementptr inbounds float* undef, i64 1
+ %tmp25053 = getelementptr inbounds float* undef, i64 1
+ %tmp25054 = getelementptr inbounds float* undef, i64 1
+ %tmp25055 = getelementptr inbounds float* undef, i64 1
+ %tmp25056 = getelementptr inbounds float* undef, i64 1
+ %tmp25057 = getelementptr inbounds float* undef, i64 1
+ %tmp25058 = getelementptr inbounds float* undef, i64 1
+ %tmp25059 = getelementptr inbounds float* undef, i64 1
+ %tmp25060 = getelementptr inbounds float* undef, i64 1
+ %tmp25061 = getelementptr inbounds float* undef, i64 1
+ %tmp25062 = getelementptr inbounds float* undef, i64 1
+ %tmp25063 = getelementptr inbounds float* undef, i64 1
+ %tmp25064 = getelementptr inbounds float* undef, i64 1
+ %tmp25065 = getelementptr inbounds float* undef, i64 1
+ %tmp25066 = getelementptr inbounds float* undef, i64 1
+ %tmp25067 = getelementptr inbounds float* %tmp25066, i64 1
+ %tmp25068 = getelementptr inbounds float* undef, i64 1
+ %tmp25069 = getelementptr inbounds float* %tmp25068, i64 1
+ %tmp25070 = getelementptr inbounds float* undef, i64 1
+ %tmp25071 = getelementptr inbounds float* undef, i64 1
+ %tmp25072 = getelementptr inbounds float* undef, i64 1
+ %tmp25073 = getelementptr inbounds float* undef, i64 1
+ %tmp25074 = getelementptr inbounds float* undef, i64 1
+ %tmp25075 = getelementptr inbounds float* %tmp25074, i64 1
+ %tmp25076 = getelementptr inbounds float* undef, i64 1
+ %tmp25077 = getelementptr inbounds float* undef, i64 1
+ %tmp25078 = getelementptr inbounds float* undef, i64 1
+ %tmp25079 = getelementptr inbounds float* undef, i64 1
+ %tmp25080 = getelementptr inbounds float* undef, i64 1
+ %tmp25081 = getelementptr inbounds float* undef, i64 1
+ %tmp25082 = getelementptr inbounds float* undef, i64 1
+ %tmp25083 = getelementptr inbounds float* undef, i64 1
+ %tmp25084 = getelementptr inbounds float* undef, i64 1
+ %tmp25085 = getelementptr inbounds float* undef, i64 1
+ %tmp25086 = getelementptr inbounds float* undef, i64 1
+ %tmp25087 = getelementptr inbounds float* undef, i64 1
+ %tmp25088 = getelementptr inbounds float* undef, i64 1
+ %tmp25089 = getelementptr inbounds float* undef, i64 1
+ %tmp25090 = getelementptr inbounds float* undef, i64 1
+ %tmp25091 = getelementptr inbounds float* undef, i64 1
+ %tmp25092 = getelementptr inbounds float* undef, i64 1
+ %tmp25093 = getelementptr inbounds float* undef, i64 1
+ %tmp25094 = getelementptr inbounds float* undef, i64 1
+ %tmp25095 = getelementptr inbounds float* %tmp25094, i64 1
+ %tmp25096 = getelementptr inbounds float* undef, i64 1
+ %tmp25097 = getelementptr inbounds float* %tmp25096, i64 1
+ %tmp25098 = getelementptr inbounds float* %tmp25097, i64 1
+ %tmp25099 = getelementptr inbounds float* undef, i64 1
+ %tmp25100 = getelementptr inbounds float* undef, i64 1
+ %tmp25101 = getelementptr inbounds float* undef, i64 1
+ %tmp25102 = getelementptr inbounds float* undef, i64 1
+ %tmp25103 = getelementptr inbounds float* undef, i64 1
+ %tmp25104 = getelementptr inbounds float* undef, i64 1
+ %tmp25105 = getelementptr inbounds float* undef, i64 1
+ %tmp25106 = getelementptr inbounds float* undef, i64 1
+ %tmp25107 = getelementptr inbounds float* %tmp25106, i64 1
+ %tmp25108 = getelementptr inbounds float* undef, i64 1
+ %tmp25109 = getelementptr inbounds float* undef, i64 1
+ %tmp25110 = getelementptr inbounds float* undef, i64 1
+ %tmp25111 = getelementptr inbounds float* undef, i64 1
+ %tmp25112 = getelementptr inbounds float* undef, i64 1
+ %tmp25113 = getelementptr inbounds float* undef, i64 1
+ %tmp25114 = getelementptr inbounds float* undef, i64 1
+ %tmp25115 = getelementptr inbounds float* undef, i64 1
+ %tmp25116 = getelementptr inbounds float* undef, i64 1
+ %tmp25117 = getelementptr inbounds float* undef, i64 1
+ %tmp25118 = getelementptr inbounds float* undef, i64 1
+ %tmp25119 = getelementptr inbounds float* undef, i64 1
+ %tmp25120 = getelementptr inbounds float* undef, i64 1
+ %tmp25121 = getelementptr inbounds float* undef, i64 1
+ %tmp25122 = getelementptr inbounds float* %tmp25121, i64 1
+ %tmp25123 = getelementptr inbounds float* undef, i64 1
+ %tmp25124 = getelementptr inbounds float* undef, i64 1
+ %tmp25125 = getelementptr inbounds float* undef, i64 1
+ %tmp25126 = getelementptr inbounds float* undef, i64 1
+ %tmp25127 = getelementptr inbounds float* undef, i64 1
+ %tmp25128 = getelementptr inbounds float* undef, i64 1
+ %tmp25129 = getelementptr inbounds float* undef, i64 1
+ %tmp25130 = getelementptr inbounds float* undef, i64 1
+ %tmp25131 = getelementptr inbounds float* undef, i64 1
+ %tmp25132 = getelementptr inbounds float* undef, i64 1
+ %tmp25133 = getelementptr inbounds float* undef, i64 1
+ %tmp25134 = getelementptr inbounds float* undef, i64 1
+ %tmp25135 = getelementptr inbounds float* undef, i64 1
+ %tmp25136 = getelementptr inbounds float* undef, i64 1
+ %tmp25137 = getelementptr inbounds float* undef, i64 1
+ %tmp25138 = getelementptr inbounds float* undef, i64 1
+ %tmp25139 = getelementptr inbounds float* undef, i64 1
+ %tmp25140 = getelementptr inbounds float* undef, i64 1
+ %tmp25141 = getelementptr inbounds float* undef, i64 1
+ %tmp25142 = getelementptr inbounds float* undef, i64 1
+ %tmp25143 = getelementptr inbounds float* undef, i64 1
+ %tmp25144 = getelementptr inbounds float* undef, i64 1
+ %tmp25145 = getelementptr inbounds float* undef, i64 1
+ %tmp25146 = getelementptr inbounds float* %tmp25145, i64 1
+ %tmp25147 = getelementptr inbounds float* undef, i64 1
+ %tmp25148 = getelementptr inbounds float* %tmp25147, i64 1
+ %tmp25149 = getelementptr inbounds float* undef, i64 1
+ %tmp25150 = getelementptr inbounds float* undef, i64 1
+ %tmp25151 = getelementptr inbounds float* undef, i64 1
+ %tmp25152 = getelementptr inbounds float* undef, i64 1
+ %tmp25153 = getelementptr inbounds float* %tmp25152, i64 1
+ %tmp25154 = getelementptr inbounds float* undef, i64 1
+ %tmp25155 = getelementptr inbounds float* undef, i64 1
+ %tmp25156 = getelementptr inbounds float* undef, i64 1
+ %tmp25157 = getelementptr inbounds float* undef, i64 1
+ %tmp25158 = getelementptr inbounds float* undef, i64 1
+ %tmp25159 = getelementptr inbounds float* undef, i64 1
+ %tmp25160 = getelementptr inbounds float* undef, i64 1
+ %tmp25161 = getelementptr inbounds float* undef, i64 1
+ %tmp25162 = getelementptr inbounds float* %tmp25161, i64 1
+ %tmp25163 = getelementptr inbounds float* undef, i64 1
+ %tmp25164 = getelementptr inbounds float* undef, i64 1
+ %tmp25165 = getelementptr inbounds float* undef, i64 1
+ %tmp25166 = getelementptr inbounds float* undef, i64 1
+ %tmp25167 = getelementptr inbounds float* undef, i64 1
+ %tmp25168 = getelementptr inbounds float* undef, i64 1
+ %tmp25169 = getelementptr inbounds float* undef, i64 1
+ %tmp25170 = getelementptr inbounds float* %tmp25169, i64 1
+ %tmp25171 = getelementptr inbounds float* undef, i64 1
+ %tmp25172 = getelementptr inbounds float* undef, i64 1
+ %tmp25173 = getelementptr inbounds float* undef, i64 1
+ %tmp25174 = getelementptr inbounds float* undef, i64 1
+ %tmp25175 = getelementptr inbounds float* %tmp25174, i64 1
+ %tmp25176 = getelementptr inbounds float* undef, i64 1
+ %tmp25177 = getelementptr inbounds float* undef, i64 1
+ %tmp25178 = getelementptr inbounds float* %tmp25177, i64 1
+ %tmp25179 = getelementptr inbounds float* undef, i64 1
+ %tmp25180 = getelementptr inbounds float* undef, i64 1
+ %tmp25181 = getelementptr inbounds float* undef, i64 1
+ %tmp25182 = getelementptr inbounds float* undef, i64 1
+ %tmp25183 = getelementptr inbounds float* undef, i64 1
+ %tmp25184 = getelementptr inbounds float* undef, i64 1
+ %tmp25185 = getelementptr inbounds float* undef, i64 1
+ %tmp25186 = getelementptr inbounds float* undef, i64 1
+ %tmp25187 = getelementptr inbounds float* %tmp25186, i64 1
+ %tmp25188 = getelementptr inbounds float* %tmp25187, i64 1
+ %tmp25189 = getelementptr inbounds float* undef, i64 1
+ %tmp25190 = getelementptr inbounds float* undef, i64 1
+ %tmp25191 = getelementptr inbounds float* undef, i64 1
+ %tmp25192 = getelementptr inbounds float* %tmp25191, i64 1
+ %tmp25193 = getelementptr inbounds float* undef, i64 1
+ %tmp25194 = getelementptr inbounds float* undef, i64 1
+ %tmp25195 = getelementptr inbounds float* undef, i64 1
+ %tmp25196 = getelementptr inbounds float* undef, i64 1
+ %tmp25197 = getelementptr inbounds float* undef, i64 1
+ %tmp25198 = getelementptr inbounds float* undef, i64 1
+ %tmp25199 = getelementptr inbounds float* undef, i64 1
+ %tmp25200 = getelementptr inbounds float* undef, i64 1
+ %tmp25201 = getelementptr inbounds float* %tmp25200, i64 1
+ %tmp25202 = getelementptr inbounds float* undef, i64 1
+ %tmp25203 = getelementptr inbounds float* undef, i64 1
+ %tmp25204 = getelementptr inbounds float* undef, i64 1
+ %tmp25205 = getelementptr inbounds float* undef, i64 1
+ %tmp25206 = getelementptr inbounds float* undef, i64 1
+ %tmp25207 = getelementptr inbounds float* undef, i64 1
+ %tmp25208 = getelementptr inbounds float* undef, i64 1
+ %tmp25209 = getelementptr inbounds float* undef, i64 1
+ %tmp25210 = getelementptr inbounds float* undef, i64 1
+ %tmp25211 = getelementptr inbounds float* undef, i64 1
+ %tmp25212 = getelementptr inbounds float* undef, i64 1
+ %tmp25213 = getelementptr inbounds float* undef, i64 1
+ %tmp25214 = getelementptr inbounds float* undef, i64 1
+ %tmp25215 = getelementptr inbounds float* undef, i64 1
+ %tmp25216 = getelementptr inbounds float* undef, i64 1
+ %tmp25217 = getelementptr inbounds float* undef, i64 1
+ %tmp25218 = getelementptr inbounds float* undef, i64 1
+ %tmp25219 = getelementptr inbounds float* undef, i64 1
+ %tmp25220 = getelementptr inbounds float* undef, i64 1
+ %tmp25221 = getelementptr inbounds float* undef, i64 1
+ %tmp25222 = getelementptr inbounds float* undef, i64 1
+ %tmp25223 = getelementptr inbounds float* undef, i64 1
+ %tmp25224 = getelementptr inbounds float* undef, i64 1
+ %tmp25225 = getelementptr inbounds float* undef, i64 1
+ %tmp25226 = getelementptr inbounds float* undef, i64 1
+ %tmp25227 = getelementptr inbounds float* undef, i64 1
+ %tmp25228 = getelementptr inbounds float* undef, i64 1
+ %tmp25229 = getelementptr inbounds float* undef, i64 1
+ %tmp25230 = getelementptr inbounds float* %tmp25229, i64 1
+ %tmp25231 = getelementptr inbounds float* undef, i64 1
+ %tmp25232 = getelementptr inbounds float* undef, i64 1
+ %tmp25233 = getelementptr inbounds float* undef, i64 1
+ %tmp25234 = getelementptr inbounds float* undef, i64 1
+ %tmp25235 = getelementptr inbounds float* %tmp25234, i64 1
+ %tmp25236 = getelementptr inbounds float* undef, i64 1
+ %tmp25237 = getelementptr inbounds float* %tmp25236, i64 1
+ %tmp25238 = getelementptr inbounds float* undef, i64 1
+ %tmp25239 = getelementptr inbounds float* undef, i64 1
+ %tmp25240 = getelementptr inbounds float* undef, i64 1
+ %tmp25241 = getelementptr inbounds float* undef, i64 1
+ %tmp25242 = getelementptr inbounds float* undef, i64 1
+ %tmp25243 = getelementptr inbounds float* undef, i64 1
+ %tmp25244 = getelementptr inbounds float* undef, i64 1
+ %tmp25245 = getelementptr inbounds float* undef, i64 1
+ %tmp25246 = getelementptr inbounds float* undef, i64 1
+ %tmp25247 = getelementptr inbounds float* undef, i64 1
+ %tmp25248 = getelementptr inbounds float* %tmp25247, i64 1
+ %tmp25249 = getelementptr inbounds float* undef, i64 1
+ %tmp25250 = getelementptr inbounds float* undef, i64 1
+ %tmp25251 = getelementptr inbounds float* undef, i64 1
+ %tmp25252 = getelementptr inbounds float* undef, i64 1
+ %tmp25253 = getelementptr inbounds float* undef, i64 1
+ %tmp25254 = getelementptr inbounds float* undef, i64 1
+ %tmp25255 = getelementptr inbounds float* undef, i64 1
+ %tmp25256 = getelementptr inbounds float* undef, i64 1
+ %tmp25257 = getelementptr inbounds float* undef, i64 1
+ %tmp25258 = getelementptr inbounds float* undef, i64 1
+ %tmp25259 = getelementptr inbounds float* undef, i64 1
+ %tmp25260 = getelementptr inbounds float* undef, i64 1
+ %tmp25261 = getelementptr inbounds float* undef, i64 1
+ %tmp25262 = getelementptr inbounds float* undef, i64 1
+ %tmp25263 = getelementptr inbounds float* undef, i64 1
+ %tmp25264 = getelementptr inbounds float* undef, i64 1
+ %tmp25265 = getelementptr inbounds float* undef, i64 1
+ %tmp25266 = getelementptr inbounds float* undef, i64 1
+ %tmp25267 = getelementptr inbounds float* undef, i64 1
+ %tmp25268 = getelementptr inbounds float* undef, i64 1
+ %tmp25269 = getelementptr inbounds float* undef, i64 1
+ br i1 undef, label %bb25270, label %bb25271
+
+bb25270: ; preds = %bb2
+ br label %bb25362
+
+bb25271: ; preds = %bb2
+ br label %bb25272
+
+bb25272: ; preds = %bb25275, %bb25271
+ br i1 false, label %bb25273, label %bb25278
+
+bb25273: ; preds = %bb25272
+ invoke void @foo()
+ to label %bb25274 unwind label %bb25276
+
+bb25274: ; preds = %bb25273
+ invoke void @bar()
+ to label %bb25275 unwind label %bb25276
+
+bb25275: ; preds = %bb25274
+ br label %bb25272
+
+bb25276: ; preds = %bb25283, %bb25274, %bb25273
+ %tmp25277 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ br label %bb25361
+
+bb25278: ; preds = %bb25272
+ br label %bb25279
+
+bb25279: ; preds = %bb25284, %bb25278
+ br i1 undef, label %bb25280, label %bb25285
+
+bb25280: ; preds = %bb25279
+ br label %bb25281
+
+bb25281: ; preds = %bb25282, %bb25280
+ br i1 undef, label %bb25282, label %bb25283
+
+bb25282: ; preds = %bb25281
+ br label %bb25281
+
+bb25283: ; preds = %bb25281
+ invoke void @bar()
+ to label %bb25284 unwind label %bb25276
+
+bb25284: ; preds = %bb25283
+ br label %bb25279
+
+bb25285: ; preds = %bb25279
+ br label %bb25286
+
+bb25286: ; preds = %bb25303, %bb25285
+ br i1 undef, label %bb25287, label %bb25304
+
+bb25287: ; preds = %bb25286
+ invoke void @bar()
+ to label %bb25288 unwind label %bb25298
+
+bb25288: ; preds = %bb25287
+ br i1 undef, label %bb25289, label %bb25300
+
+bb25289: ; preds = %bb25288
+ br i1 undef, label %bb25290, label %bb25300
+
+bb25290: ; preds = %bb25289
+ invoke void @bar()
+ to label %bb25291 unwind label %bb25298
+
+bb25291: ; preds = %bb25290
+ br i1 undef, label %bb25292, label %bb25295
+
+bb25292: ; preds = %bb25291
+ br i1 undef, label %bb25294, label %bb25293
+
+bb25293: ; preds = %bb25292
+ br label %bb25294
+
+bb25294: ; preds = %bb25293, %bb25292
+ br label %bb25296
+
+bb25295: ; preds = %bb25291
+ invoke void @quuuux()
+ to label %bb25296 unwind label %bb25298
+
+bb25296: ; preds = %bb25295, %bb25294
+ invoke void @baz()
+ to label %bb25297 unwind label %bb25298
+
+bb25297: ; preds = %bb25296
+ br label %bb25300
+
+bb25298: ; preds = %bb25296, %bb25295, %bb25290, %bb25287
+ %tmp25299 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ br label %bb25360
+
+bb25300: ; preds = %bb25297, %bb25289, %bb25288
+ br i1 undef, label %bb25301, label %bb25302
+
+bb25301: ; preds = %bb25300
+ br label %bb25303
+
+bb25302: ; preds = %bb25300
+ br label %bb25303
+
+bb25303: ; preds = %bb25302, %bb25301
+ br label %bb25286
+
+bb25304: ; preds = %bb25286
+ br label %bb25305
+
+bb25305: ; preds = %bb25331, %bb25304
+ br i1 undef, label %bb25306, label %bb25332
+
+bb25306: ; preds = %bb25305
+ invoke void @quuux()
+ to label %bb25307 unwind label %bb25324
+
+bb25307: ; preds = %bb25306
+ invoke void @quux()
+ to label %bb25308 unwind label %bb25324
+
+bb25308: ; preds = %bb25307
+ br i1 undef, label %bb25309, label %bb25330
+
+bb25309: ; preds = %bb25308
+ br i1 undef, label %bb25310, label %bb25330
+
+bb25310: ; preds = %bb25309
+ br i1 undef, label %bb25311, label %bb25317
+
+bb25311: ; preds = %bb25310
+ br label %bb25312
+
+bb25312: ; preds = %bb25316, %bb25315, %bb25311
+ br i1 undef, label %bb25313, label %bb25317
+
+bb25313: ; preds = %bb25312
+ %tmp25314 = invoke zeroext i1 undef(%0* undef, %0* undef)
+ to label %bb25315 unwind label %bb25324
+
+bb25315: ; preds = %bb25313
+ br i1 %tmp25314, label %bb25316, label %bb25312
+
+bb25316: ; preds = %bb25315
+ br label %bb25312
+
+bb25317: ; preds = %bb25312, %bb25310
+ br i1 undef, label %bb25318, label %bb25326
+
+bb25318: ; preds = %bb25317
+ br i1 undef, label %bb25319, label %bb25326
+
+bb25319: ; preds = %bb25318
+ br i1 undef, label %bb25320, label %bb25323
+
+bb25320: ; preds = %bb25319
+ br i1 undef, label %bb25322, label %bb25321
+
+bb25321: ; preds = %bb25320
+ br label %bb25322
+
+bb25322: ; preds = %bb25321, %bb25320
+ br label %bb25326
+
+bb25323: ; preds = %bb25319
+ invoke void @qux()
+ to label %bb25326 unwind label %bb25324
+
+bb25324: ; preds = %bb25357, %bb25344, %bb25343, %bb25342, %bb25337, %bb25334, %bb25333, %bb25323, %bb25313, %bb25307, %bb25306
+ %tmp25325 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ br label %bb25359
+
+bb25326: ; preds = %bb25323, %bb25322, %bb25318, %bb25317
+ br label %bb25327
+
+bb25327: ; preds = %bb25328, %bb25326
+ br i1 undef, label %bb25328, label %bb25329
+
+bb25328: ; preds = %bb25327
+ br label %bb25327
+
+bb25329: ; preds = %bb25327
+ br label %bb25330
+
+bb25330: ; preds = %bb25329, %bb25309, %bb25308
+ br i1 undef, label %bb25332, label %bb25331
+
+bb25331: ; preds = %bb25330
+ br label %bb25305
+
+bb25332: ; preds = %bb25330, %bb25305
+ br i1 undef, label %bb25333, label %bb25357
+
+bb25333: ; preds = %bb25332
+ invoke void (...)* @printf()
+ to label %bb25334 unwind label %bb25324
+
+bb25334: ; preds = %bb25333
+ invoke void (...)* @printf(i32 undef)
+ to label %bb25335 unwind label %bb25324
+
+bb25335: ; preds = %bb25334
+ br label %bb25336
+
+bb25336: ; preds = %bb25338, %bb25335
+ br i1 undef, label %bb25337, label %bb25339
+
+bb25337: ; preds = %bb25336
+ invoke void (...)* @printf(i32 undef, double undef)
+ to label %bb25338 unwind label %bb25324
+
+bb25338: ; preds = %bb25337
+ br label %bb25336
+
+bb25339: ; preds = %bb25336
+ br label %bb25340
+
+bb25340: ; preds = %bb25341, %bb25339
+ br i1 undef, label %bb25341, label %bb25342
+
+bb25341: ; preds = %bb25340
+ br label %bb25340
+
+bb25342: ; preds = %bb25340
+ invoke void (...)* @printf()
+ to label %bb25343 unwind label %bb25324
+
+bb25343: ; preds = %bb25342
+ invoke void (...)* @printf(double undef, double undef)
+ to label %bb25344 unwind label %bb25324
+
+bb25344: ; preds = %bb25343
+ invoke void @mux()
+ to label %bb25345 unwind label %bb25324
+
+bb25345: ; preds = %bb25344
+ br label %bb25346
+
+bb25346: ; preds = %bb25347, %bb25345
+ br i1 undef, label %bb25347, label %bb25348
+
+bb25347: ; preds = %bb25346
+ br label %bb25346
+
+bb25348: ; preds = %bb25346
+ br label %bb25349
+
+bb25349: ; preds = %bb25350, %bb25348
+ br i1 undef, label %bb25350, label %bb25351
+
+bb25350: ; preds = %bb25349
+ br label %bb25349
+
+bb25351: ; preds = %bb25349
+ invoke void (...)* @printf()
+ to label %bb25352 unwind label %bb25355
+
+bb25352: ; preds = %bb25351
+ invoke void (...)* @printf(double undef)
+ to label %bb25353 unwind label %bb25355
+
+bb25353: ; preds = %bb25352
+ invoke void (...)* @printf()
+ to label %bb25354 unwind label %bb25355
+
+bb25354: ; preds = %bb25353
+ br label %bb25358
+
+bb25355: ; preds = %bb25353, %bb25352, %bb25351
+ %tmp25356 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ br label %bb25359
+
+bb25357: ; preds = %bb25332
+ invoke void (...)* @printf()
+ to label %bb25358 unwind label %bb25324
+
+bb25358: ; preds = %bb25357, %bb25354
+ br label %bb25362
+
+bb25359: ; preds = %bb25355, %bb25324
+ br label %bb25360
+
+bb25360: ; preds = %bb25359, %bb25298
+ br label %bb25361
+
+bb25361: ; preds = %bb25360, %bb25276
+ resume { i8*, i32 } undef
+
+bb25362: ; preds = %bb25358, %bb25270, %bb1
+ ret void
+}
+
+declare void @foo()
+
+declare i32 @__gxx_personality_v0(...)
+
+declare void @bar() uwtable ssp align 2
+
+declare hidden void @baz() uwtable ssp align 2
+
+declare void @printf(...)
+
+declare void @mux() unnamed_addr uwtable ssp align 2
+
+declare hidden void @qux() uwtable ssp align 2
+
+declare void @quux() uwtable ssp
+
+declare void @quuux() uwtable ssp
+
+declare hidden void @quuuux() uwtable ssp align 2
diff --git a/test/CodeGen/X86/lea-recursion.ll b/test/CodeGen/X86/lea-recursion.ll
index 3f32fd2..9480600 100644
--- a/test/CodeGen/X86/lea-recursion.ll
+++ b/test/CodeGen/X86/lea-recursion.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep lea | count 12
+; RUN: llc < %s -march=x86-64 | grep lea | count 13
; This testcase was written to demonstrate an instruction-selection problem,
; however it also happens to expose a limitation in the DAGCombiner's
@@ -44,4 +44,3 @@ entry:
store i32 %tmp10.6, i32* getelementptr ([1000 x i32]* @g0, i32 0, i32 7)
ret void
}
-
diff --git a/test/CodeGen/X86/lea.ll b/test/CodeGen/X86/lea.ll
index affd6bf..93cfe46 100644
--- a/test/CodeGen/X86/lea.ll
+++ b/test/CodeGen/X86/lea.ll
@@ -28,8 +28,7 @@ bb.nph:
bb2:
ret i32 %x_offs
; CHECK-LABEL: test2:
-; CHECK: movl %e[[A0]], %eax
-; CHECK: addl $-5, %eax
+; CHECK: leal -5(%r[[A0:..]]), %eax
; CHECK: andl $-4, %eax
; CHECK: negl %eax
; CHECK: leal -4(%r[[A0]],%rax), %eax
diff --git a/test/CodeGen/X86/leaf-fp-elim.ll b/test/CodeGen/X86/leaf-fp-elim.ll
index 7eebf8d..1bb3c75 100644
--- a/test/CodeGen/X86/leaf-fp-elim.ll
+++ b/test/CodeGen/X86/leaf-fp-elim.ll
@@ -6,7 +6,7 @@ target triple = "x86_64-apple-darwin11.0"
@msg = internal global i8* null ; <i8**> [#uses=1]
@.str = private constant [2 x i8] c"x\00", align 1 ; <[2 x i8]*> [#uses=1]
-define void @test(i8* %p) "no-frame-pointer-elim-non-leaf"="true" nounwind optsize ssp {
+define void @test(i8* %p) "no-frame-pointer-elim-non-leaf" nounwind optsize ssp {
; No stack frame, please.
; CHECK: _test
diff --git a/test/CodeGen/X86/legalize-shift-64.ll b/test/CodeGen/X86/legalize-shift-64.ll
index 7736468..64460bb 100644
--- a/test/CodeGen/X86/legalize-shift-64.ll
+++ b/test/CodeGen/X86/legalize-shift-64.ll
@@ -64,3 +64,31 @@ define <2 x i64> @test5(<2 x i64> %A, <2 x i64> %B) {
; CHECK: shl
; CHECK: shldl
}
+
+; PR16108
+define i32 @test6() {
+ %x = alloca i32, align 4
+ %t = alloca i64, align 8
+ store i32 1, i32* %x, align 4
+ store i64 1, i64* %t, align 8 ;; DEAD
+ %load = load i32* %x, align 4
+ %shl = shl i32 %load, 8
+ %add = add i32 %shl, -224
+ %sh_prom = zext i32 %add to i64
+ %shl1 = shl i64 1, %sh_prom
+ %cmp = icmp ne i64 %shl1, 4294967296
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ ret i32 1
+
+if.end: ; preds = %entry
+ ret i32 0
+
+; CHECK-LABEL: test6:
+; CHECK-NOT: andb $31
+; CHECK: sete
+; CHECK: movzbl
+; CHECK: xorl $1
+; CHECK: orl
+}
diff --git a/test/CodeGen/X86/lit.local.cfg b/test/CodeGen/X86/lit.local.cfg
index 9d285bf..1637fa4 100644
--- a/test/CodeGen/X86/lit.local.cfg
+++ b/test/CodeGen/X86/lit.local.cfg
@@ -1,4 +1,10 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.test']
+# FIXME: For now, override suffixes to exclude any .s tests, because some of the
+# buildbots have a stray misched-copy.s output file lying around that causes
+# failures. See misched-copy.s, where we try to clean up that file.
+#
+# It should be possible to remove this override once all the bots have cycled
+# cleanly.
+config.suffixes = ['.ll', '.c', '.cpp', '.test', '.txt']
targets = set(config.root.targets_to_build.split())
if not 'X86' in targets:
diff --git a/test/CodeGen/X86/load-slice.ll b/test/CodeGen/X86/load-slice.ll
new file mode 100644
index 0000000..85fd7f0
--- /dev/null
+++ b/test/CodeGen/X86/load-slice.ll
@@ -0,0 +1,139 @@
+; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx -combiner-stress-load-slicing < %s -o - | FileCheck %s --check-prefix=STRESS
+; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx < %s -o - | FileCheck %s --check-prefix=REGULAR
+;
+; <rdar://problem/14477220>
+
+%class.Complex = type { float, float }
+
+
+; Check that independent slices lead to independent loads, and that the slices
+; then go to different register files.
+;
+; The layout is:
+; LSB 0 1 2 3 | 4 5 6 7 MSB
+; Low High
+; The base address points to 0 and is 8-bytes aligned.
+; Low slice starts at 0 (base) and is 8-bytes aligned.
+; High slice starts at 4 (base + 4-bytes) and is 4-bytes aligned.
+;
+; STRESS-LABEL: t1:
+; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
+; STRESS: vmovss 64([[BASE:[^(]+]]), [[OUT_Real:%xmm[0-9]+]]
+; Add low slice: out[out_start].real, this is base + 0.
+; STRESS-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
+; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
+; STRESS-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
+; Add high slice: out[out_start].imm, this is base + 4.
+; STRESS-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
+; Swap Imm and Real.
+; STRESS-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
+; Put the results back into out[out_start].
+; STRESS-NEXT: vmovq [[RES_Vec]], ([[BASE]])
+;
+; Same for REGULAR; we eliminate a register bank copy with each slice.
+; REGULAR-LABEL: t1:
+; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
+; REGULAR: vmovss 64([[BASE:[^)]+]]), [[OUT_Real:%xmm[0-9]+]]
+; Add low slice: out[out_start].real, this is base + 0.
+; REGULAR-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
+; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
+; REGULAR-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
+; Add high slice: out[out_start].imm, this is base + 4.
+; REGULAR-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
+; Swap Imm and Real.
+; REGULAR-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
+; Put the results back into out[out_start].
+; REGULAR-NEXT: vmovq [[RES_Vec]], ([[BASE]])
+define void @t1(%class.Complex* nocapture %out, i64 %out_start) {
+entry:
+ %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
+ %tmp = bitcast %class.Complex* %arrayidx to i64*
+ %tmp1 = load i64* %tmp, align 8
+ %t0.sroa.0.0.extract.trunc = trunc i64 %tmp1 to i32
+ %tmp2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float
+ %t0.sroa.2.0.extract.shift = lshr i64 %tmp1, 32
+ %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32
+ %tmp3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float
+ %add = add i64 %out_start, 8
+ %arrayidx2 = getelementptr inbounds %class.Complex* %out, i64 %add
+ %i.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 0
+ %tmp4 = load float* %i.i, align 4
+ %add.i = fadd float %tmp4, %tmp2
+ %retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0
+ %r.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 1
+ %tmp5 = load float* %r.i, align 4
+ %add5.i = fadd float %tmp5, %tmp3
+ %retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1
+ %ref.tmp.sroa.0.0.cast = bitcast %class.Complex* %arrayidx to <2 x float>*
+ store <2 x float> %retval.sroa.0.4.vec.insert.i, <2 x float>* %ref.tmp.sroa.0.0.cast, align 4
+ ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+; Check that we do not read outside of the chunk of bits of the original loads.
+;
+; The 64-bit load should have been split into one 32-bit and one 16-bit slice.
+; The 16-bit slice should be zero-extended to match the final type.
+;
+; The memory layout is:
+; LSB 0 1 2 3 | 4 5 | 6 7 MSB
+; Low High
+; The base address points to 0 and is 8-bytes aligned.
+; Low slice starts at 0 (base) and is 8-bytes aligned.
+; High slice starts at 6 (base + 6-bytes) and is 2-bytes aligned.
+;
+; STRESS-LABEL: t2:
+; STRESS: movzwl 6([[BASE:[^)]+]]), %eax
+; STRESS-NEXT: addl ([[BASE]]), %eax
+; STRESS-NEXT: ret
+;
+; For the REGULAR heuristic, it is not profitable to slice things that are not
+; next to each other in memory. Here we have a hole at bytes #4-5.
+; REGULAR-LABEL: t2:
+; REGULAR: shrq $48
+define i32 @t2(%class.Complex* nocapture %out, i64 %out_start) {
+ %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
+ %bitcast = bitcast %class.Complex* %arrayidx to i64*
+ %chunk64 = load i64* %bitcast, align 8
+ %slice32_low = trunc i64 %chunk64 to i32
+ %shift48 = lshr i64 %chunk64, 48
+ %slice32_high = trunc i64 %shift48 to i32
+ %res = add i32 %slice32_high, %slice32_low
+ ret i32 %res
+}
+
+; Check that we do not optimize overlapping slices.
+;
+; The 64-bit load should NOT have been split, as the slices are overlapping.
+; First slice uses bytes numbered 0 to 3.
+; Second slice uses bytes numbered 6 and 7.
+; Third slice uses bytes numbered 4 to 7.
+;
+; STRESS-LABEL: t3:
+; STRESS: shrq $48
+; STRESS: shrq $32
+;
+; REGULAR-LABEL: t3:
+; REGULAR: shrq $48
+; REGULAR: shrq $32
+define i32 @t3(%class.Complex* nocapture %out, i64 %out_start) {
+ %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
+ %bitcast = bitcast %class.Complex* %arrayidx to i64*
+ %chunk64 = load i64* %bitcast, align 8
+ %slice32_low = trunc i64 %chunk64 to i32
+ %shift48 = lshr i64 %chunk64, 48
+ %slice32_high = trunc i64 %shift48 to i32
+ %shift32 = lshr i64 %chunk64, 32
+ %slice32_lowhigh = trunc i64 %shift32 to i32
+ %tmpres = add i32 %slice32_high, %slice32_low
+ %res = add i32 %slice32_lowhigh, %tmpres
+ ret i32 %res
+}
diff --git a/test/CodeGen/X86/long-extend.ll b/test/CodeGen/X86/long-extend.ll
new file mode 100644
index 0000000..5bbd41d
--- /dev/null
+++ b/test/CodeGen/X86/long-extend.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -mcpu=core-avx-i -mtriple=x86_64-linux -asm-verbose=0 | FileCheck %s
+define void @test_long_extend(<16 x i8> %a, <16 x i32>* %p) nounwind {
+; CHECK-LABEL: test_long_extend
+; CHECK: vpunpcklbw %xmm1, %xmm0, [[REG1:%xmm[0-9]+]]
+; CHECK: vpunpckhwd %xmm1, [[REG1]], [[REG2:%xmm[0-9]+]]
+; CHECK: vpunpcklwd %xmm1, [[REG1]], %x[[REG3:mm[0-9]+]]
+; CHECK: vinsertf128 $1, [[REG2]], %y[[REG3]], [[REG_result0:%ymm[0-9]+]]
+; CHECK: vpunpckhbw %xmm1, %xmm0, [[REG4:%xmm[0-9]+]]
+; CHECK: vpunpckhwd %xmm1, [[REG4]], [[REG5:%xmm[0-9]+]]
+; CHECK: vpunpcklwd %xmm1, [[REG4]], %x[[REG6:mm[0-9]+]]
+; CHECK: vinsertf128 $1, [[REG5]], %y[[REG6]], [[REG_result1:%ymm[0-9]+]]
+; CHECK: vmovaps [[REG_result1]], 32(%rdi)
+; CHECK: vmovaps [[REG_result0]], (%rdi)
+
+ %tmp = zext <16 x i8> %a to <16 x i32>
+ store <16 x i32> %tmp, <16 x i32>*%p
+ ret void
+}
diff --git a/test/CodeGen/X86/lsr-loop-exit-cond.ll b/test/CodeGen/X86/lsr-loop-exit-cond.ll
index c7a3186..e7d74a9 100644
--- a/test/CodeGen/X86/lsr-loop-exit-cond.ll
+++ b/test/CodeGen/X86/lsr-loop-exit-cond.ll
@@ -2,12 +2,12 @@
; RUN: llc -mtriple=x86_64-darwin -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s
; CHECK-LABEL: t:
-; CHECK: decq
-; CHECK-NEXT: movl (%r9,%rax,4), %eax
+; CHECK: movl (%r9,%rax,4), %e{{..}}
+; CHECK-NEXT: decq
; CHECK-NEXT: jne
; ATOM-LABEL: t:
-; ATOM: movl (%r9,%r{{.+}},4), %eax
+; ATOM: movl (%r9,%r{{.+}},4), %e{{..}}
; ATOM-NEXT: decq
; ATOM-NEXT: jne
diff --git a/test/CodeGen/X86/masked-iv-safe.ll b/test/CodeGen/X86/masked-iv-safe.ll
index c33cac2..4a4d178 100644
--- a/test/CodeGen/X86/masked-iv-safe.ll
+++ b/test/CodeGen/X86/masked-iv-safe.ll
@@ -1,15 +1,13 @@
-; RUN: llc < %s -mcpu=generic -march=x86-64 > %t
-; RUN: not grep and %t
-; RUN: not grep movz %t
-; RUN: not grep sar %t
-; RUN: not grep shl %t
-; RUN: grep add %t | count 5
-; RUN: grep inc %t | count 2
-; RUN: grep lea %t | count 3
+; RUN: llc < %s -mcpu=generic -march=x86-64 | FileCheck %s
; Optimize away zext-inreg and sext-inreg on the loop induction
; variable using trip-count information.
+; CHECK-LABEL: count_up
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: inc
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
define void @count_up(double* %d, i64 %n) nounwind {
entry:
br label %loop
@@ -38,6 +36,11 @@ return:
ret void
}
+; CHECK-LABEL: count_down
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: addq
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
define void @count_down(double* %d, i64 %n) nounwind {
entry:
br label %loop
@@ -66,6 +69,11 @@ return:
ret void
}
+; CHECK-LABEL: count_up_signed
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: inc
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
define void @count_up_signed(double* %d, i64 %n) nounwind {
entry:
br label %loop
@@ -96,6 +104,11 @@ return:
ret void
}
+; CHECK-LABEL: count_down_signed
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: addq
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
define void @count_down_signed(double* %d, i64 %n) nounwind {
entry:
br label %loop
@@ -126,6 +139,11 @@ return:
ret void
}
+; CHECK-LABEL: another_count_up
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: addq
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
define void @another_count_up(double* %d, i64 %n) nounwind {
entry:
br label %loop
@@ -154,6 +172,11 @@ return:
ret void
}
+; CHECK-LABEL: another_count_down
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: decq
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
define void @another_count_down(double* %d, i64 %n) nounwind {
entry:
br label %loop
@@ -182,6 +205,11 @@ return:
ret void
}
+; CHECK-LABEL: another_count_up_signed
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: addq
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
define void @another_count_up_signed(double* %d, i64 %n) nounwind {
entry:
br label %loop
@@ -212,6 +240,11 @@ return:
ret void
}
+; CHECK-LABEL: another_count_down_signed
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: decq
+; CHECK-NOT: {{and|movz|sar|shl}}
+; CHECK: jne
define void @another_count_down_signed(double* %d, i64 %n) nounwind {
entry:
br label %loop
diff --git a/test/CodeGen/X86/maskmovdqu.ll b/test/CodeGen/X86/maskmovdqu.ll
index 7796f0e..0b3334d 100644
--- a/test/CodeGen/X86/maskmovdqu.ll
+++ b/test/CodeGen/X86/maskmovdqu.ll
@@ -1,5 +1,7 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep -i EDI
-; RUN: llc < %s -march=x86-64 -mattr=+sse2 | grep -i RDI
+; RUN: llc < %s -march=x86 -mattr=+sse2,-avx | grep -i EDI
+; RUN: llc < %s -march=x86-64 -mattr=+sse2,-avx | grep -i RDI
+; RUN: llc < %s -march=x86 -mattr=+avx | grep -i EDI
+; RUN: llc < %s -march=x86-64 -mattr=+avx | grep -i RDI
; rdar://6573467
define void @test(<16 x i8> %a, <16 x i8> %b, i32 %dummy, i8* %c) nounwind {
diff --git a/test/CodeGen/X86/mcinst-avx-lowering.ll b/test/CodeGen/X86/mcinst-avx-lowering.ll
index 41f96e8..db72e08 100644
--- a/test/CodeGen/X86/mcinst-avx-lowering.ll
+++ b/test/CodeGen/X86/mcinst-avx-lowering.ll
@@ -4,7 +4,7 @@ define i64 @t1(double %d_ivar) nounwind uwtable ssp {
entry:
; CHECK: t1
%0 = bitcast double %d_ivar to i64
-; CHECK: vmovd
+; CHECK: vmovq
; CHECK: encoding: [0xc4,0xe1,0xf9,0x7e,0xc0]
ret i64 %0
}
@@ -13,7 +13,7 @@ define double @t2(i64 %d_ivar) nounwind uwtable ssp {
entry:
; CHECK: t2
%0 = bitcast i64 %d_ivar to double
-; CHECK: vmovd
+; CHECK: vmovq
; CHECK: encoding: [0xc4,0xe1,0xf9,0x6e,0xc7]
ret double %0
}
diff --git a/test/CodeGen/X86/memcpy-2.ll b/test/CodeGen/X86/memcpy-2.ll
index c17cc7f..6ae7807 100644
--- a/test/CodeGen/X86/memcpy-2.ll
+++ b/test/CodeGen/X86/memcpy-2.ll
@@ -56,15 +56,15 @@ entry:
define void @t2(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp {
entry:
; SSE2-Darwin-LABEL: t2:
-; SSE2-Darwin: movaps (%eax), %xmm0
+; SSE2-Darwin: movaps (%ecx), %xmm0
; SSE2-Darwin: movaps %xmm0, (%eax)
; SSE2-Mingw32-LABEL: t2:
-; SSE2-Mingw32: movaps (%eax), %xmm0
+; SSE2-Mingw32: movaps (%ecx), %xmm0
; SSE2-Mingw32: movaps %xmm0, (%eax)
; SSE1-LABEL: t2:
-; SSE1: movaps (%eax), %xmm0
+; SSE1: movaps (%ecx), %xmm0
; SSE1: movaps %xmm0, (%eax)
; NOSSE-LABEL: t2:
@@ -91,14 +91,14 @@ entry:
define void @t3(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp {
entry:
; SSE2-Darwin-LABEL: t3:
-; SSE2-Darwin: movsd (%eax), %xmm0
-; SSE2-Darwin: movsd 8(%eax), %xmm1
+; SSE2-Darwin: movsd (%ecx), %xmm0
+; SSE2-Darwin: movsd 8(%ecx), %xmm1
; SSE2-Darwin: movsd %xmm1, 8(%eax)
; SSE2-Darwin: movsd %xmm0, (%eax)
; SSE2-Mingw32-LABEL: t3:
-; SSE2-Mingw32: movsd (%eax), %xmm0
-; SSE2-Mingw32: movsd 8(%eax), %xmm1
+; SSE2-Mingw32: movsd (%ecx), %xmm0
+; SSE2-Mingw32: movsd 8(%ecx), %xmm1
; SSE2-Mingw32: movsd %xmm1, 8(%eax)
; SSE2-Mingw32: movsd %xmm0, (%eax)
diff --git a/test/CodeGen/X86/merge_store.ll b/test/CodeGen/X86/merge_store.ll
new file mode 100644
index 0000000..940688c
--- /dev/null
+++ b/test/CodeGen/X86/merge_store.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s
+
+define void @merge_store(i32* nocapture %a) {
+; CHECK-LABEL: merge_store:
+; CHECK: movq
+; CHECK: movq
+entry:
+ br label %for.body
+
+ for.body:
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+ store i32 1, i32* %arrayidx, align 4
+ %0 = or i64 %indvars.iv, 1
+ %arrayidx2 = getelementptr inbounds i32* %a, i64 %0
+ store i32 1, i32* %arrayidx2, align 4
+ %1 = or i64 %indvars.iv, 2
+ %arrayidx5 = getelementptr inbounds i32* %a, i64 %1
+ store i32 1, i32* %arrayidx5, align 4
+ %2 = or i64 %indvars.iv, 3
+ %arrayidx8 = getelementptr inbounds i32* %a, i64 %2
+ store i32 1, i32* %arrayidx8, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
+ %3 = trunc i64 %indvars.iv.next to i32
+ %cmp = icmp slt i32 %3, 1000
+ br i1 %cmp, label %for.body, label %for.end
+
+ for.end:
+ ret void
+}
diff --git a/test/CodeGen/X86/mingw-alloca.ll b/test/CodeGen/X86/mingw-alloca.ll
index ded4b73..72b6940 100644
--- a/test/CodeGen/X86/mingw-alloca.ll
+++ b/test/CodeGen/X86/mingw-alloca.ll
@@ -1,12 +1,14 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc < %s -mtriple=i386-pc-mingw32 | FileCheck %s -check-prefix=COFF
+; RUN: llc < %s -mtriple=i386-pc-mingw32-elf | FileCheck %s -check-prefix=ELF
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
-target triple = "i386-pc-mingw32"
define void @foo1(i32 %N) nounwind {
entry:
-; CHECK: _foo1:
-; CHECK: calll __alloca
+; COFF: _foo1:
+; COFF: calll __alloca
+; ELF: foo1:
+; ELF: calll _alloca
%tmp14 = alloca i32, i32 %N ; <i32*> [#uses=1]
call void @bar1( i32* %tmp14 )
ret void
@@ -16,11 +18,16 @@ declare void @bar1(i32*)
define void @foo2(i32 inreg %N) nounwind {
entry:
-; CHECK: _foo2:
-; CHECK: andl $-16, %esp
-; CHECK: pushl %eax
-; CHECK: calll __alloca
-; CHECK: movl 8028(%esp), %eax
+; COFF: _foo2:
+; COFF: andl $-16, %esp
+; COFF: pushl %eax
+; COFF: calll __alloca
+; COFF: movl 8028(%esp), %eax
+; ELF: foo2:
+; ELF: andl $-16, %esp
+; ELF: pushl %eax
+; ELF: calll _alloca
+; ELF: movl 8028(%esp), %eax
%A2 = alloca [2000 x i32], align 16 ; <[2000 x i32]*> [#uses=1]
%A2.sub = getelementptr [2000 x i32]* %A2, i32 0, i32 0 ; <i32*> [#uses=1]
call void @bar2( i32* %A2.sub, i32 %N )
diff --git a/test/CodeGen/X86/misched-balance.ll b/test/CodeGen/X86/misched-balance.ll
index 5f6c501..1900802 100644
--- a/test/CodeGen/X86/misched-balance.ll
+++ b/test/CodeGen/X86/misched-balance.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 -pre-RA-sched=source -enable-misched -verify-machineinstrs | FileCheck %s
;
; Verify that misched resource/latency balancing heuristics are sane.
@@ -15,7 +15,7 @@ entry:
; Since mmult1 IR is already in good order, this effectively ensures
; the scheduler maintains source order.
;
-; CHECK: %for.body
+; CHECK-LABEL: %for.body
; CHECK-NOT: %rsp
; CHECK: imull 4
; CHECK-NOT: {{imull|rsp}}
@@ -45,7 +45,7 @@ entry:
; CHECK-NOT: {{imull|rsp}}
; CHECK: addl
; CHECK-NOT: {{imull|rsp}}
-; CHECK: %end
+; CHECK-LABEL: %end
for.body:
%indvars.iv42.i = phi i64 [ %indvars.iv.next43.i, %for.body ], [ 0, %entry ]
%tmp57 = load i32* %tmp56, align 4
@@ -120,7 +120,7 @@ end:
; Unlike the above loop, this IR starts out bad and must be
; rescheduled.
;
-; CHECK: %for.body
+; CHECK-LABEL: %for.body
; CHECK-NOT: %rsp
; CHECK: imull 4
; CHECK-NOT: {{imull|rsp}}
@@ -150,7 +150,7 @@ end:
; CHECK-NOT: {{imull|rsp}}
; CHECK: addl
; CHECK-NOT: {{imull|rsp}}
-; CHECK: %end
+; CHECK-LABEL: %end
define void @unrolled_mmult2(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94,
i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99,
i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104)
@@ -232,8 +232,8 @@ end:
; balanced heuristics are interesting here because we have resource,
; latency, and register limits all at once. For now, simply check that
; we don't use any callee-saves.
-; CHECK: @encpc1
-; CHECK: %entry
+; CHECK-LABEL: @encpc1
+; CHECK-LABEL: %entry
; CHECK-NOT: push
; CHECK-NOT: pop
; CHECK: ret
diff --git a/test/CodeGen/X86/misched-copy.ll b/test/CodeGen/X86/misched-copy.ll
index 0450cfb..4485b8a 100644
--- a/test/CodeGen/X86/misched-copy.ll
+++ b/test/CodeGen/X86/misched-copy.ll
@@ -8,11 +8,11 @@
; MUL_HiLo PhysReg use copies should be just above the mul.
; MUL_HiLo PhysReg def copies should be just below the mul.
;
-; CHECK: *** Final schedule for BB#1 ***
-; CHECK-NEXT: %EAX<def> = COPY
-; CHECK: MUL32r %vreg{{[0-9]+}}, %EAX<imp-def>, %EDX<imp-def>, %EFLAGS<imp-def,dead>, %EAX<imp-use>;
-; CHECK-NEXT: COPY %E{{[AD]}}X;
-; CHECK-NEXT: COPY %E{{[AD]}}X;
+; CHECK: *** Final schedule for BB#1 ***
+; CHECK: %EAX<def> = COPY
+; CHECK-NEXT: MUL32r %vreg{{[0-9]+}}, %EAX<imp-def>, %EDX<imp-def>, %EFLAGS<imp-def,dead>, %EAX<imp-use>;
+; CHECK-NEXT: COPY %E{{[AD]}}X
+; CHECK-NEXT: COPY %E{{[AD]}}X
; CHECK: DIVSSrm
define i64 @mulhoist(i32 %a, i32 %b) #0 {
entry:
@@ -42,7 +42,7 @@ end:
ret i64 %add
}
-attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
!0 = metadata !{metadata !"float", metadata !1}
!1 = metadata !{metadata !"omnipotent char", metadata !2}
diff --git a/test/CodeGen/X86/misched-matmul.ll b/test/CodeGen/X86/misched-matmul.ll
index 6b67607..5454b7c 100644
--- a/test/CodeGen/X86/misched-matmul.ll
+++ b/test/CodeGen/X86/misched-matmul.ll
@@ -3,11 +3,14 @@
;
; Verify that register pressure heuristics are working in MachineScheduler.
;
-; When we enable subtree scheduling heuristics on X86, we may need a
-; flag to disable it for this test case.
+; We can further reduce spills in this case with a global register
+; pressure heuristic, like sethi-ullman numbers or biasing toward
+; scheduled subtrees. However, these heuristics are marginally
+; beneficial on x86_64 and exacerbate register pressure in other
+; more complex cases.
;
; CHECK: @wrap_mul4
-; CHECK: 22 regalloc - Number of spills inserted
+; CHECK: 23 regalloc - Number of spills inserted
define void @wrap_mul4(double* nocapture %Out, [4 x double]* nocapture %A, [4 x double]* nocapture %B) #0 {
entry:
@@ -221,4 +224,4 @@ entry:
ret void
}
-attributes #0 = { noinline nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/X86/misched-matrix.ll b/test/CodeGen/X86/misched-matrix.ll
index 4dc95c5..23b561f 100644
--- a/test/CodeGen/X86/misched-matrix.ll
+++ b/test/CodeGen/X86/misched-matrix.ll
@@ -15,19 +15,19 @@
; been reordered with the stores. This tests the scheduler's cheap
; alias analysis ability (that doesn't require any AliasAnalysis pass).
;
-; TOPDOWN: %for.body
+; TOPDOWN-LABEL: %for.body
; TOPDOWN: movl %{{.*}}, (
; TOPDOWN: imull {{[0-9]*}}(
; TOPDOWN: movl %{{.*}}, 4(
; TOPDOWN: imull {{[0-9]*}}(
; TOPDOWN: movl %{{.*}}, 8(
; TOPDOWN: movl %{{.*}}, 12(
-; TOPDOWN: %for.end
+; TOPDOWN-LABEL: %for.end
;
; For -misched=ilpmin, verify that each expression subtree is
; scheduled independently, and that the imull/adds are interleaved.
;
-; ILPMIN: %for.body
+; ILPMIN-LABEL: %for.body
; ILPMIN: movl %{{.*}}, (
; ILPMIN: imull
; ILPMIN: imull
@@ -53,12 +53,12 @@
; ILPMIN: imull
; ILPMIN: addl
; ILPMIN: movl %{{.*}}, 12(
-; ILPMIN: %for.end
+; ILPMIN-LABEL: %for.end
;
; For -misched=ilpmax, verify that each expression subtree is
; scheduled independently, and that the imull/adds are clustered.
;
-; ILPMAX: %for.body
+; ILPMAX-LABEL: %for.body
; ILPMAX: movl %{{.*}}, (
; ILPMAX: imull
; ILPMAX: imull
@@ -84,7 +84,7 @@
; ILPMAX: addl
; ILPMAX: addl
; ILPMAX: movl %{{.*}}, 12(
-; ILPMAX: %for.end
+; ILPMAX-LABEL: %for.end
define void @mmult([4 x i32]* noalias nocapture %m1, [4 x i32]* noalias nocapture %m2,
[4 x i32]* noalias nocapture %m3) nounwind uwtable ssp {
diff --git a/test/CodeGen/X86/mmx-builtins.ll b/test/CodeGen/X86/mmx-builtins.ll
index f5b3f76..aabdd53 100644
--- a/test/CodeGen/X86/mmx-builtins.ll
+++ b/test/CodeGen/X86/mmx-builtins.ll
@@ -1,5 +1,7 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx,+ssse3 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+mmx,+ssse3,-avx | FileCheck %s
; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+mmx,+ssse3,-avx | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s
declare x86_mmx @llvm.x86.ssse3.phadd.w(x86_mmx, x86_mmx) nounwind readnone
diff --git a/test/CodeGen/X86/mmx-punpckhdq.ll b/test/CodeGen/X86/mmx-punpckhdq.ll
index 206cb33..9e8f5bf 100644
--- a/test/CodeGen/X86/mmx-punpckhdq.ll
+++ b/test/CodeGen/X86/mmx-punpckhdq.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx,+sse42 -mtriple=x86_64-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+mmx,+sse4.2 -mtriple=x86_64-apple-darwin10 | FileCheck %s
; There are no MMX operations in bork; promoted to XMM.
define void @bork(<1 x i64>* %x) {
diff --git a/test/CodeGen/X86/movbe.ll b/test/CodeGen/X86/movbe.ll
index aa58c10..3f459be 100644
--- a/test/CodeGen/X86/movbe.ll
+++ b/test/CodeGen/X86/movbe.ll
@@ -1,4 +1,5 @@
; RUN: llc -mtriple=x86_64-linux -mcpu=atom < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-linux -mcpu=slm < %s | FileCheck %s -check-prefix=SLM
declare i32 @llvm.bswap.i32(i32) nounwind readnone
declare i64 @llvm.bswap.i64(i64) nounwind readnone
@@ -9,6 +10,8 @@ define void @test1(i32* nocapture %x, i32 %y) nounwind {
ret void
; CHECK-LABEL: test1:
; CHECK: movbel %esi, (%rdi)
+; SLM-LABEL: test1:
+; SLM: movbel %esi, (%rdi)
}
define i32 @test2(i32* %x) nounwind {
@@ -17,6 +20,8 @@ define i32 @test2(i32* %x) nounwind {
ret i32 %bswap
; CHECK-LABEL: test2:
; CHECK: movbel (%rdi), %eax
+; SLM-LABEL: test2:
+; SLM: movbel (%rdi), %eax
}
define void @test3(i64* %x, i64 %y) nounwind {
@@ -25,6 +30,8 @@ define void @test3(i64* %x, i64 %y) nounwind {
ret void
; CHECK-LABEL: test3:
; CHECK: movbeq %rsi, (%rdi)
+; SLM-LABEL: test3:
+; SLM: movbeq %rsi, (%rdi)
}
define i64 @test4(i64* %x) nounwind {
@@ -33,4 +40,6 @@ define i64 @test4(i64* %x) nounwind {
ret i64 %bswap
; CHECK-LABEL: test4:
; CHECK: movbeq (%rdi), %rax
+; SLM-LABEL: test4:
+; SLM: movbeq (%rdi), %rax
}
diff --git a/test/CodeGen/X86/movgs.ll b/test/CodeGen/X86/movgs.ll
index d3930fa..71b0723 100644
--- a/test/CodeGen/X86/movgs.ll
+++ b/test/CodeGen/X86/movgs.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mcpu=penryn -mattr=sse41 | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn -mattr=sse41 | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=penryn -mattr=sse41 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mcpu=penryn -mattr=sse4.1 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=penryn -mattr=sse4.1 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=penryn -mattr=sse4.1 | FileCheck %s --check-prefix=X64
define i32 @test1() nounwind readonly {
entry:
diff --git a/test/CodeGen/X86/neg_fp.ll b/test/CodeGen/X86/neg_fp.ll
index 57164f2..efb02f8 100644
--- a/test/CodeGen/X86/neg_fp.ll
+++ b/test/CodeGen/X86/neg_fp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse41 -o %t
+; RUN: llc < %s -march=x86 -mattr=+sse4.1 -o %t
; RUN: grep xorps %t | count 1
; Test that when we don't pass -enable-unsafe-fp-math, we don't do the optimization
diff --git a/test/CodeGen/X86/newline-and-quote.ll b/test/CodeGen/X86/newline-and-quote.ll
new file mode 100644
index 0000000..9206e9f
--- /dev/null
+++ b/test/CodeGen/X86/newline-and-quote.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu | FileCheck %s
+@"foo\22bar" = global i32 42
+; CHECK: .globl "foo\"bar"
+
+@"foo\0abar" = global i32 42
+; CHECK: .globl "foo\nbar"
diff --git a/test/CodeGen/X86/no-compact-unwind.ll b/test/CodeGen/X86/no-compact-unwind.ll
index 627f7da..991cd4e 100644
--- a/test/CodeGen/X86/no-compact-unwind.ll
+++ b/test/CodeGen/X86/no-compact-unwind.ll
@@ -1,4 +1,10 @@
-; RUN: llc < %s -mtriple x86_64-apple-macosx10.8.0 -disable-cfi | FileCheck %s
+; RUN: llc < %s -mtriple x86_64-apple-macosx10.8.0 -mcpu corei7 -filetype=obj -o - \
+; RUN: | llvm-objdump -triple x86_64-apple-macosx10.8.0 -s - \
+; RUN: | FileCheck -check-prefix=CU %s
+; RUN: llc < %s -mtriple x86_64-apple-darwin11 -mcpu corei7 \
+; RUN: | llvm-mc -triple x86_64-apple-darwin11 -filetype=obj -o - \
+; RUN: | llvm-objdump -triple x86_64-apple-darwin11 -s - \
+; RUN: | FileCheck -check-prefix=FROM-ASM %s
%"struct.dyld::MappedRanges" = type { [400 x %struct.anon], %"struct.dyld::MappedRanges"* }
%struct.anon = type { %class.ImageLoader*, i64, i64 }
@@ -12,13 +18,15 @@ declare void @OSMemoryBarrier() optsize
; This compact unwind encoding indicates that we could not generate correct
; compact unwind encodings for this function. This then defaults to using the
; DWARF EH frame.
-;
-; CHECK: .section __LD,__compact_unwind,regular,debug
-; CHECK: .quad _func
-; CHECK: .long 67108864 ## Compact Unwind Encoding: 0x4000000
-; CHECK: .quad 0 ## Personality Function
-; CHECK: .quad 0 ## LSDA
-;
+
+; CU: Contents of section __compact_unwind:
+; CU-NEXT: 0048 00000000 00000000 42000000 00000004
+; CU-NEXT: 0058 00000000 00000000 00000000 00000000
+
+; FROM-ASM: Contents of section __compact_unwind:
+; FROM-ASM-NEXT: 0048 00000000 00000000 42000000 00000004
+; FROM-ASM-NEXT: 0058 00000000 00000000 00000000 00000000
+
define void @func(%class.ImageLoader* %image) optsize ssp uwtable {
entry:
br label %for.cond1.preheader
diff --git a/test/CodeGen/X86/no-elf-compact-unwind.ll b/test/CodeGen/X86/no-elf-compact-unwind.ll
new file mode 100644
index 0000000..8a15817
--- /dev/null
+++ b/test/CodeGen/X86/no-elf-compact-unwind.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -mtriple x86_64-apple-macosx10.8.0 -disable-cfi | FileCheck -check-prefix=MACHO %s
+; RUN: llc < %s -mtriple x86_64-unknown-linux -disable-cfi | FileCheck -check-prefix=ELF %s
+
+; Make sure we don't generate a compact unwind for ELF.
+
+; MACHO-LABEL: _Z3barv:
+; MACHO: __compact_unwind
+
+; ELF-LABEL: _Z3barv:
+; ELF-NOT: __compact_unwind
+
+@_ZTIi = external constant i8*
+
+define void @_Z3barv() uwtable {
+entry:
+ invoke void @_Z3foov()
+ to label %try.cont unwind label %lpad
+
+lpad: ; preds = %entry
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* bitcast (i8** @_ZTIi to i8*)
+ %1 = extractvalue { i8*, i32 } %0, 1
+ %2 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
+ %matches = icmp eq i32 %1, %2
+ br i1 %matches, label %catch, label %eh.resume
+
+catch: ; preds = %lpad
+ %3 = extractvalue { i8*, i32 } %0, 0
+ %4 = tail call i8* @__cxa_begin_catch(i8* %3)
+ tail call void @__cxa_end_catch()
+ br label %try.cont
+
+try.cont: ; preds = %entry, %catch
+ ret void
+
+eh.resume: ; preds = %lpad
+ resume { i8*, i32 } %0
+}
+
+declare void @_Z3foov()
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i32 @llvm.eh.typeid.for(i8*)
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
diff --git a/test/CodeGen/X86/nocx16.ll b/test/CodeGen/X86/nocx16.ll
new file mode 100644
index 0000000..cceaac4
--- /dev/null
+++ b/test/CodeGen/X86/nocx16.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=-cx16 | FileCheck %s
+define void @test(i128* %a) nounwind {
+entry:
+; CHECK: __sync_val_compare_and_swap_16
+ %0 = cmpxchg i128* %a, i128 1, i128 1 seq_cst
+; CHECK: __sync_lock_test_and_set_16
+ %1 = atomicrmw xchg i128* %a, i128 1 seq_cst
+; CHECK: __sync_fetch_and_add_16
+ %2 = atomicrmw add i128* %a, i128 1 seq_cst
+; CHECK: __sync_fetch_and_sub_16
+ %3 = atomicrmw sub i128* %a, i128 1 seq_cst
+; CHECK: __sync_fetch_and_and_16
+ %4 = atomicrmw and i128* %a, i128 1 seq_cst
+; CHECK: __sync_fetch_and_nand_16
+ %5 = atomicrmw nand i128* %a, i128 1 seq_cst
+; CHECK: __sync_fetch_and_or_16
+ %6 = atomicrmw or i128* %a, i128 1 seq_cst
+; CHECK: __sync_fetch_and_xor_16
+ %7 = atomicrmw xor i128* %a, i128 1 seq_cst
+ ret void
+}
diff --git a/test/CodeGen/X86/object-size.ll b/test/CodeGen/X86/object-size.ll
index 8f1eabd..ec35d29 100644
--- a/test/CodeGen/X86/object-size.ll
+++ b/test/CodeGen/X86/object-size.ll
@@ -10,7 +10,7 @@ target triple = "x86_64-apple-darwin10.0"
define void @bar() nounwind ssp {
entry:
%tmp = load i8** @p ; <i8*> [#uses=1]
- %0 = call i64 @llvm.objectsize.i64(i8* %tmp, i1 0) ; <i64> [#uses=1]
+ %0 = call i64 @llvm.objectsize.i64.p0i8(i8* %tmp, i1 0) ; <i64> [#uses=1]
%cmp = icmp ne i64 %0, -1 ; <i1> [#uses=1]
; X64: movabsq $-1, [[RAX:%r..]]
; X64: cmpq $-1, [[RAX]]
@@ -19,7 +19,7 @@ entry:
cond.true: ; preds = %entry
%tmp1 = load i8** @p ; <i8*> [#uses=1]
%tmp2 = load i8** @p ; <i8*> [#uses=1]
- %1 = call i64 @llvm.objectsize.i64(i8* %tmp2, i1 1) ; <i64> [#uses=1]
+ %1 = call i64 @llvm.objectsize.i64.p0i8(i8* %tmp2, i1 1) ; <i64> [#uses=1]
%call = call i8* @__strcpy_chk(i8* %tmp1, i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i64 %1) ssp ; <i8*> [#uses=1]
br label %cond.end
@@ -33,7 +33,7 @@ cond.end: ; preds = %cond.false, %cond.t
ret void
}
-declare i64 @llvm.objectsize.i64(i8*, i1) nounwind readonly
+declare i64 @llvm.objectsize.i64.p0i8(i8*, i1) nounwind readonly
declare i8* @__strcpy_chk(i8*, i8*, i64) ssp
@@ -47,7 +47,7 @@ entry:
%tmp = load i8** %__dest.addr ; <i8*> [#uses=1]
%tmp1 = load i8** %__src.addr ; <i8*> [#uses=1]
%tmp2 = load i8** %__dest.addr ; <i8*> [#uses=1]
- %0 = call i64 @llvm.objectsize.i64(i8* %tmp2, i1 1) ; <i64> [#uses=1]
+ %0 = call i64 @llvm.objectsize.i64.p0i8(i8* %tmp2, i1 1) ; <i64> [#uses=1]
%call = call i8* @__strcpy_chk(i8* %tmp, i8* %tmp1, i64 %0) ssp ; <i8*> [#uses=1]
store i8* %call, i8** %retval
%1 = load i8** %retval ; <i8*> [#uses=1]
diff --git a/test/CodeGen/X86/opt-shuff-tstore.ll b/test/CodeGen/X86/opt-shuff-tstore.ll
index 3e72084..fc43e81 100644
--- a/test/CodeGen/X86/opt-shuff-tstore.ll
+++ b/test/CodeGen/X86/opt-shuff-tstore.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s -mattr=+sse2,+sse41 | FileCheck %s
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s -mattr=+sse2,+sse4.1 | FileCheck %s
; CHECK: func_4_8
; A single memory write
diff --git a/test/CodeGen/X86/palignr.ll b/test/CodeGen/X86/palignr.ll
index c76cbbe..ec6564d 100644
--- a/test/CodeGen/X86/palignr.ll
+++ b/test/CodeGen/X86/palignr.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=x86 -mcpu=core2 -mattr=+ssse3 | FileCheck %s
-; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck --check-prefix=YONAH %s
+; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck --check-prefix=CHECK-YONAH %s
define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind {
; CHECK-LABEL: test1:
diff --git a/test/CodeGen/X86/patchpoint.ll b/test/CodeGen/X86/patchpoint.ll
new file mode 100644
index 0000000..d534639
--- /dev/null
+++ b/test/CodeGen/X86/patchpoint.ll
@@ -0,0 +1,100 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s
+
+; Trivial patchpoint codegen
+;
+define i64 @trivial_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+; CHECK-LABEL: trivial_patchpoint_codegen:
+; CHECK: movabsq $-559038736, %r11
+; CHECK-NEXT: callq *%r11
+; CHECK-NEXT: nop
+; CHECK: movq %rax, %[[REG:r.+]]
+; CHECK: callq *%r11
+; CHECK-NEXT: nop
+; CHECK: movq %[[REG]], %rax
+; CHECK: ret
+ %resolveCall2 = inttoptr i64 -559038736 to i8*
+ %result = tail call i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 2, i32 15, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+ %resolveCall3 = inttoptr i64 -559038737 to i8*
+ tail call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 3, i32 15, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
+ ret i64 %result
+}
+
+; Caller frame metadata with stackmaps. This should not be optimized
+; as a leaf function.
+;
+; CHECK-LABEL: caller_meta_leaf
+; CHECK: subq $32, %rsp
+; CHECK: Ltmp
+; CHECK: addq $32, %rsp
+; CHECK: ret
+define void @caller_meta_leaf() {
+entry:
+ %metadata = alloca i64, i32 3, align 8
+ store i64 11, i64* %metadata
+ store i64 12, i64* %metadata
+ store i64 13, i64* %metadata
+ call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 4, i32 0, i64* %metadata)
+ ret void
+}
+
+; Test the webkit_jscc calling convention.
+; Two arguments will be pushed on the stack.
+; Return value in $rax.
+define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen:
+; CHECK: Ltmp
+; CHECK: movq %r{{.+}}, 8(%rsp)
+; CHECK: movq %r{{.+}}, (%rsp)
+; CHECK: Ltmp
+; CHECK-NEXT: movabsq $-559038736, %r11
+; CHECK-NEXT: callq *%r11
+; CHECK: movq %rax, 8(%rsp)
+; CHECK: callq
+ %resolveCall2 = inttoptr i64 -559038736 to i8*
+ %result = tail call webkit_jscc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 5, i32 15, i8* %resolveCall2, i32 2, i64 %p1, i64 %p2)
+ %resolveCall3 = inttoptr i64 -559038737 to i8*
+ tail call webkit_jscc void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 6, i32 15, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
+ ret void
+}
+
+; Test patchpoints reusing the same TargetConstant.
+; <rdar:15390785> Assertion failed: (CI.getNumArgOperands() >= NumArgs + 4)
+; There is no way to verify this, since it depends on memory allocation.
+; But I think it's useful to include as a working example.
+define i64 @testLowerConstant(i64 %arg, i64 %tmp2, i64 %tmp10, i64* %tmp33, i64 %tmp79) {
+entry:
+ %tmp80 = add i64 %tmp79, -16
+ %tmp81 = inttoptr i64 %tmp80 to i64*
+ %tmp82 = load i64* %tmp81, align 8
+ tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 14, i32 5, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
+ tail call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 15, i32 30, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
+ %tmp83 = load i64* %tmp33, align 8
+ %tmp84 = add i64 %tmp83, -24
+ %tmp85 = inttoptr i64 %tmp84 to i64*
+ %tmp86 = load i64* %tmp85, align 8
+ tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 17, i32 5, i64 %arg, i64 %tmp10, i64 %tmp86)
+ tail call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 18, i32 30, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
+ ret i64 10
+}
+
+; Test small patchpoints that don't emit calls.
+define void @small_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
+entry:
+; CHECK-LABEL: small_patchpoint_codegen:
+; CHECK: Ltmp
+; CHECK: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: nop
+; CHECK-NEXT: popq
+; CHECK-NEXT: ret
+ %result = tail call i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 5, i32 5, i8* null, i32 2, i64 %p1, i64 %p2)
+ ret void
+}
+
+declare void @llvm.experimental.stackmap(i32, i32, ...)
+declare void @llvm.experimental.patchpoint.void(i32, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i32, i32, i8*, i32, ...)
diff --git a/test/CodeGen/X86/peep-vector-extract-concat.ll b/test/CodeGen/X86/peep-vector-extract-concat.ll
index 606a9be..f73ebb9 100644
--- a/test/CodeGen/X86/peep-vector-extract-concat.ll
+++ b/test/CodeGen/X86/peep-vector-extract-concat.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2,-sse41 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse2,-sse4.1 | FileCheck %s
; CHECK: pshufd $3, %xmm0, %xmm0
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2,-sse41 | FileCheck %s -check-prefix=WIN64
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse2,-sse4.1 | FileCheck %s -check-prefix=WIN64
; %a is passed indirectly on Win64.
; WIN64: movss 12(%rcx), %xmm0
diff --git a/test/CodeGen/X86/pmovext.ll b/test/CodeGen/X86/pmovext.ll
index b85b4c3..f0e468f 100644
--- a/test/CodeGen/X86/pmovext.ll
+++ b/test/CodeGen/X86/pmovext.ll
@@ -18,5 +18,28 @@ define void @intrin_pmov(i16* noalias %dest, i8* noalias %src) nounwind uwtable
}
declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
-
declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
+
+; rdar://15245794
+
+define <4 x i32> @foo0(double %v.coerce) nounwind ssp {
+; CHECK-LABEL: foo0
+; CHECK: pmovzxwd %xmm0, %xmm0
+; CHECK-NEXT: ret
+ %tmp = bitcast double %v.coerce to <4 x i16>
+ %tmp1 = shufflevector <4 x i16> %tmp, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ %tmp2 = tail call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %tmp1) nounwind
+ ret <4 x i32> %tmp2
+}
+
+define <8 x i16> @foo1(double %v.coerce) nounwind ssp {
+; CHECK-LABEL: foo1
+; CHECK: pmovzxbw %xmm0, %xmm0
+; CHECK-NEXT: ret
+ %tmp = bitcast double %v.coerce to <8 x i8>
+ %tmp1 = shufflevector <8 x i8> %tmp, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %tmp2 = tail call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %tmp1)
+ ret <8 x i16> %tmp2
+}
+
+declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
diff --git a/test/CodeGen/X86/pmovsx-inreg.ll b/test/CodeGen/X86/pmovsx-inreg.ll
index d30d7d0..07979f6 100644
--- a/test/CodeGen/X86/pmovsx-inreg.ll
+++ b/test/CodeGen/X86/pmovsx-inreg.ll
@@ -86,8 +86,7 @@ define void @test6(<16 x i8>* %in, <16 x i16>* %out) nounwind {
ret void
; AVX2-LABEL: test6:
-; FIXME: v16i8 -> v16i16 is scalarized.
-; AVX2-NOT: pmovsx
+; AVX2: vpmovsxbw
}
define void @test7(<2 x i16>* %in, <2 x i64>* %out) nounwind {
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index da4af81..7bf8a61 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -mcpu=nehalem -stack-alignment=16 > %t
+; RUN: llc < %s -march=x86 -mattr=sse4.1 -mcpu=nehalem -stack-alignment=16 > %t
; RUN: grep pmul %t | count 12
-; RUN: grep mov %t | count 11
+; RUN: grep mov %t | count 14
define <4 x i32> @a(<4 x i32> %i) nounwind {
%A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
diff --git a/test/CodeGen/X86/pmulld.ll b/test/CodeGen/X86/pmulld.ll
index 4103eab..3db0f73 100644
--- a/test/CodeGen/X86/pmulld.ll
+++ b/test/CodeGen/X86/pmulld.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse41 -asm-verbose=0 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse41 -asm-verbose=0 | FileCheck %s -check-prefix=WIN64
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+sse4.1 -asm-verbose=0 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse4.1 -asm-verbose=0 | FileCheck %s -check-prefix=WIN64
define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind {
; CHECK-LABEL: test1:
diff --git a/test/CodeGen/X86/pr10523.ll b/test/CodeGen/X86/pr10523.ll
index 7191d69..0ec22a0 100644
--- a/test/CodeGen/X86/pr10523.ll
+++ b/test/CodeGen/X86/pr10523.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse41
+; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse4.1
; No check in a crash test
diff --git a/test/CodeGen/X86/pr10524.ll b/test/CodeGen/X86/pr10524.ll
index ed3e7c5..12bdba9 100644
--- a/test/CodeGen/X86/pr10524.ll
+++ b/test/CodeGen/X86/pr10524.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse41
+; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse4.1
; No check in a crash test
diff --git a/test/CodeGen/X86/pr10525.ll b/test/CodeGen/X86/pr10525.ll
index 342c1d6..30ce297 100644
--- a/test/CodeGen/X86/pr10525.ll
+++ b/test/CodeGen/X86/pr10525.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse41
+; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse4.1
; No check in a crash test
diff --git a/test/CodeGen/X86/pr10526.ll b/test/CodeGen/X86/pr10526.ll
index 6963fe5..9fa83ce 100644
--- a/test/CodeGen/X86/pr10526.ll
+++ b/test/CodeGen/X86/pr10526.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse41
+; RUN: llc < %s -march=x86-64 -mattr=+sse2,+sse4.1
; No check in a crash test
diff --git a/test/CodeGen/X86/pr12312.ll b/test/CodeGen/X86/pr12312.ll
index 087b8d7..81aaf91 100644
--- a/test/CodeGen/X86/pr12312.ll
+++ b/test/CodeGen/X86/pr12312.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse41,-avx < %s | FileCheck %s --check-prefix SSE41
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1,-avx < %s | FileCheck %s --check-prefix SSE41
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx,-avx2 < %s | FileCheck %s --check-prefix AVX
define i32 @veccond128(<4 x i32> %input) {
diff --git a/test/CodeGen/X86/pr14088.ll b/test/CodeGen/X86/pr14088.ll
index 505e3b5..16f20d0 100644
--- a/test/CodeGen/X86/pr14088.ll
+++ b/test/CodeGen/X86/pr14088.ll
@@ -19,7 +19,14 @@ return:
ret i32 %retval.0
}
-; We were miscompiling this and using %ax instead of %cx in the movw.
-; CHECK: movswl %cx, %ecx
-; CHECK: movw %cx, (%rsi)
-; CHECK: movslq %ecx, %rcx
+; We were miscompiling this and using %ax instead of %cx in the movw
+; in the following sequence:
+; movswl %cx, %ecx
+; movw %cx, (%rsi)
+; movslq %ecx, %rcx
+;
+; We can't produce the above sequence without special SD-level
+; heuristics. Now we produce this:
+; CHECK: movw %ax, (%rsi)
+; CHECK: cwtl
+; CHECK: cltq
diff --git a/test/CodeGen/X86/pr14090.ll b/test/CodeGen/X86/pr14090.ll
index d76b912..2f7c720 100644
--- a/test/CodeGen/X86/pr14090.ll
+++ b/test/CodeGen/X86/pr14090.ll
@@ -48,11 +48,11 @@ entry:
%fifteen = bitcast i64* %retval.i.i to i32**
%sixteen = bitcast i64* %retval.i.i to i8*
call void @llvm.lifetime.start(i64 8, i8* %sixteen)
- store i32* %.ph.i80, i32** %fifteen, align 8, !tbaa !0
+ store i32* %.ph.i80, i32** %fifteen, align 8
%sunkaddr = ptrtoint i64* %retval.i.i to i32
%sunkaddr86 = add i32 %sunkaddr, 4
%sunkaddr87 = inttoptr i32 %sunkaddr86 to i32*
- store i32 %fourteen, i32* %sunkaddr87, align 4, !tbaa !3
+ store i32 %fourteen, i32* %sunkaddr87, align 4
%seventeen = load i64* %retval.i.i, align 8
call void @llvm.lifetime.end(i64 8, i8* %sixteen)
%eighteen = lshr i64 %seventeen, 32
@@ -68,9 +68,3 @@ entry:
declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"any pointer", metadata !1}
-!4 = metadata !{metadata !"vtable pointer", metadata !2}
diff --git a/test/CodeGen/X86/pr1505b.ll b/test/CodeGen/X86/pr1505b.ll
index 9b0ef83..c348fec 100644
--- a/test/CodeGen/X86/pr1505b.ll
+++ b/test/CodeGen/X86/pr1505b.ll
@@ -57,11 +57,10 @@ entry:
%tmp22 = tail call %"struct.std::basic_ostream<char,std::char_traits<char> >"* @_ZNSolsEd( %"struct.std::basic_ostream<char,std::char_traits<char> >"* %tmp16, double %tmp1920 ) ; <%"struct.std::basic_ostream<char,std::char_traits<char> >"*> [#uses=1]
%tmp30 = tail call %"struct.std::basic_ostream<char,std::char_traits<char> >"* @_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_( %"struct.std::basic_ostream<char,std::char_traits<char> >"* %tmp22 ) ; <%"struct.std::basic_ostream<char,std::char_traits<char> >"*> [#uses=0]
; reload:
-; CHECK: fld
-; CHECK: fstps
; CHECK: ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
%tmp34 = tail call %"struct.std::basic_ostream<char,std::char_traits<char> >"* @_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc( %"struct.std::basic_ostream<char,std::char_traits<char> >"* @_ZSt4cout, i8* getelementptr ([13 x i8]* @.str1, i32 0, i32 0) ) ; <%"struct.std::basic_ostream<char,std::char_traits<char> >"*> [#uses=1]
%tmp3940 = fpext float %tmp1314 to double ; <double> [#uses=1]
+; CHECK: fld
; CHECK: fstpl
; CHECK: ZNSolsEd
%tmp42 = tail call %"struct.std::basic_ostream<char,std::char_traits<char> >"* @_ZNSolsEd( %"struct.std::basic_ostream<char,std::char_traits<char> >"* %tmp34, double %tmp3940 ) ; <%"struct.std::basic_ostream<char,std::char_traits<char> >"*> [#uses=1]
diff --git a/test/CodeGen/X86/pr16031.ll b/test/CodeGen/X86/pr16031.ll
index ab0b5ef..ecf6218 100644
--- a/test/CodeGen/X86/pr16031.ll
+++ b/test/CodeGen/X86/pr16031.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=corei7-avx -enable-misched=false | FileCheck %s
; CHECK-LABEL: main:
; CHECK: pushl %esi
diff --git a/test/CodeGen/X86/pr16807.ll b/test/CodeGen/X86/pr16807.ll
new file mode 100644
index 0000000..6d55d99
--- /dev/null
+++ b/test/CodeGen/X86/pr16807.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mcpu=core-avx-i | FileCheck %s
+
+define <16 x i16> @f_fu(<16 x i16> %bf) {
+allocas:
+ %avg.i.i = sdiv <16 x i16> %bf, <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>
+ ret <16 x i16> %avg.i.i
+}
+
+; CHECK: f_fu
+; CHECK: psraw
+; CHECK: psrlw
+; CHECK: paddw
+; CHECK: psraw
+; CHECK: psraw
+; CHECK: psrlw
+; CHECK: paddw
+; CHECK: psraw
+; CHECK: ret
diff --git a/test/CodeGen/X86/pr17546.ll b/test/CodeGen/X86/pr17546.ll
new file mode 100644
index 0000000..174fa5c
--- /dev/null
+++ b/test/CodeGen/X86/pr17546.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mcpu=core-avx2 | FileCheck %s
+
+define i32 @f_f___un_3C_unf_3E_un_3C_unf_3E_(<8 x i32> %__mask, i64 %BBBB) {
+ %QQQ = trunc i64 %BBBB to i32
+ %1 = extractelement <8 x i32> %__mask, i32 %QQQ
+ ret i32 %1
+}
+
+; CHECK: f_f___un_3C_unf_3E_un_3C_unf_3E_
+; CHECK: ret
diff --git a/test/CodeGen/X86/pr17631.ll b/test/CodeGen/X86/pr17631.ll
new file mode 100644
index 0000000..98f951f
--- /dev/null
+++ b/test/CodeGen/X86/pr17631.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s
+
+%struct_type = type { [64 x <8 x float>], <8 x float> }
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>)
+
+; Function Attrs: nounwind
+define i32 @equal(<8 x i32> %A) {
+allocas:
+ %first_alloc = alloca [64 x <8 x i32>]
+ %second_alloc = alloca %struct_type
+
+ %A1 = bitcast <8 x i32> %A to <8 x float>
+ %A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1)
+ ret i32 %A2
+}
+
+; CHECK: equal
+; CHECK-NOT: vzeroupper
+; CHECK: _chkstk
+; CHECK: ret
+
+define <8 x float> @foo(<8 x float> %y, i64* %p, double %x) {
+ %i = fptoui double %x to i64
+ store i64 %i, i64* %p
+ %ret = fadd <8 x float> %y, %y
+ ret <8 x float> %ret
+}
+
+; CHECK: foo
+; CHECK-NOT: vzeroupper
+; CHECK: _ftol2
+; CHECK: ret
diff --git a/test/CodeGen/X86/pr17764.ll b/test/CodeGen/X86/pr17764.ll
new file mode 100644
index 0000000..7a3fd6d
--- /dev/null
+++ b/test/CodeGen/X86/pr17764.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core-avx2 | FileCheck %s
+
+define <16 x i16> @foo(<16 x i1> %mask, <16 x i16> %x, <16 x i16> %y) {
+ %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> %y
+ ret <16 x i16> %ret
+}
+
+; CHECK: foo
+; CHECK: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
+; CHECK: ret
diff --git a/test/CodeGen/X86/pr18014.ll b/test/CodeGen/X86/pr18014.ll
new file mode 100644
index 0000000..e3860b8
--- /dev/null
+++ b/test/CodeGen/X86/pr18014.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=x86_64-linux-pc -mcpu=penryn | FileCheck %s
+
+; Ensure PSRAD is generated as the condition is consumed by both PADD and
+; BLENDVPS. PAND requires all bits to be set properly.
+
+define <4 x i32> @foo(<4 x i32>* %p, <4 x i1> %cond, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
+ %sext_cond = sext <4 x i1> %cond to <4 x i32>
+ %t1 = add <4 x i32> %v1, %sext_cond
+ %t2 = select <4 x i1> %cond, <4 x i32> %v1, <4 x i32> %v2
+ store <4 x i32> %t2, <4 x i32>* %p
+ ret <4 x i32> %t1
+; CHECK: foo
+; CHECK: pslld
+; CHECK: psrad
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/pr18023.ll b/test/CodeGen/X86/pr18023.ll
new file mode 100644
index 0000000..4c6f8cf
--- /dev/null
+++ b/test/CodeGen/X86/pr18023.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple x86_64-apple-macosx10.9.0 | FileCheck %s
+; PR18023
+
+; CHECK: movabsq $4294967296, %rcx
+; CHECK: movq %rcx, (%rax)
+; CHECK: movl $1, 4(%rax)
+; CHECK: movl $0, 4(%rax)
+; CHECK: movq $1, 4(%rax)
+
+@c = common global i32 0, align 4
+@a = common global [3 x i32] zeroinitializer, align 4
+@b = common global i32 0, align 4
+@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
+
+define void @func() {
+ store i32 1, i32* getelementptr inbounds ([3 x i32]* @a, i64 0, i64 1), align 4
+ store i32 0, i32* getelementptr inbounds ([3 x i32]* @a, i64 0, i64 0), align 4
+ %1 = load volatile i32* @b, align 4
+ store i32 1, i32* getelementptr inbounds ([3 x i32]* @a, i64 0, i64 1), align 4
+ store i32 0, i32* getelementptr inbounds ([3 x i32]* @a, i64 0, i64 1), align 4
+ %2 = load volatile i32* @b, align 4
+ store i32 1, i32* getelementptr inbounds ([3 x i32]* @a, i64 0, i64 1), align 4
+ store i32 0, i32* getelementptr inbounds ([3 x i32]* @a, i64 0, i64 2), align 4
+ %3 = load volatile i32* @b, align 4
+ store i32 3, i32* @c, align 4
+ %4 = load i32* getelementptr inbounds ([3 x i32]* @a, i64 0, i64 1), align 4
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32 %4)
+ ret void
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/X86/pr18054.ll b/test/CodeGen/X86/pr18054.ll
new file mode 100644
index 0000000..b7af516
--- /dev/null
+++ b/test/CodeGen/X86/pr18054.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=penryn | FileCheck %s
+
+define void @foo(<16 x i32>* %p, <16 x i1> %x) {
+ %ret = sext <16 x i1> %x to <16 x i32>
+ store <16 x i32> %ret, <16 x i32>* %p
+ ret void
+; CHECK: foo
+; CHECK-NOT: pmovsxbd
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/pr18162.ll b/test/CodeGen/X86/pr18162.ll
new file mode 100644
index 0000000..523e47d
--- /dev/null
+++ b/test/CodeGen/X86/pr18162.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s
+
+; Make sure we are not crashing on this one.
+
+target triple = "x86_64-unknown-linux-gnu"
+
+%"Iterator" = type { i32* }
+
+declare { i64, <2 x float> } @Call()
+declare { i64, <2 x float> }* @CallPtr()
+
+define { i64, <2 x float> } @Foo(%"Iterator"* %this) {
+entry:
+ %retval = alloca i32
+ %this.addr = alloca %"Iterator"*
+ %this1 = load %"Iterator"** %this.addr
+ %bundle_ = getelementptr inbounds %"Iterator"* %this1, i32 0, i32 0
+ %0 = load i32** %bundle_
+ %1 = call { i64, <2 x float> } @Call()
+ %2 = call { i64, <2 x float> }* @CallPtr()
+ %3 = getelementptr { i64, <2 x float> }* %2, i32 0, i32 1
+ %4 = extractvalue { i64, <2 x float> } %1, 1
+ store <2 x float> %4, <2 x float>* %3
+ %5 = load { i64, <2 x float> }* %2
+ ret { i64, <2 x float> } %5
+}
+
diff --git a/test/CodeGen/X86/pre-ra-sched.ll b/test/CodeGen/X86/pre-ra-sched.ll
index b792ffa..70135d4 100644
--- a/test/CodeGen/X86/pre-ra-sched.ll
+++ b/test/CodeGen/X86/pre-ra-sched.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-apple-macosx -debug-only=pre-RA-sched \
-; RUN: 2>&1 | FileCheck %s
+; RUN-disabled: llc < %s -mtriple=x86_64-apple-macosx -pre-RA-sched=ilp -debug-only=pre-RA-sched \
+; RUN-disabled: 2>&1 | FileCheck %s
+; RUN: true
; REQUIRES: asserts
;
; rdar:13279013: pre-RA-sched should not check all interferences and
diff --git a/test/CodeGen/X86/prefetch.ll b/test/CodeGen/X86/prefetch.ll
index efb5191..d6571ac 100644
--- a/test/CodeGen/X86/prefetch.ll
+++ b/test/CodeGen/X86/prefetch.ll
@@ -1,6 +1,9 @@
; RUN: llc < %s -march=x86 -mattr=+sse | FileCheck %s
; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck %s
; RUN: llc < %s -march=x86 -mattr=+sse -mattr=+prfchw | FileCheck %s -check-prefix=PRFCHW
+; RUN: llc < %s -march=x86 -mcpu=slm | FileCheck %s -check-prefix=SLM
+; RUN: llc < %s -march=x86 -mcpu=btver2 | FileCheck %s -check-prefix=PRFCHW
+; RUN: llc < %s -march=x86 -mcpu=btver2 -mattr=-prfchw | FileCheck %s -check-prefix=NOPRFCHW
; rdar://10538297
@@ -11,6 +14,8 @@ entry:
; CHECK: prefetcht0
; CHECK: prefetchnta
; PRFCHW: prefetchw
+; NOPRFCHW-NOT: prefetchw
+; SLM: prefetchw
tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 1, i32 1 )
tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 2, i32 1 )
tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3, i32 1 )
diff --git a/test/CodeGen/X86/prefixdata.ll b/test/CodeGen/X86/prefixdata.ll
new file mode 100644
index 0000000..2ec1892
--- /dev/null
+++ b/test/CodeGen/X86/prefixdata.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+@i = linkonce_odr global i32 1
+
+; CHECK: f:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: .long 1
+define void @f() prefix i32 1 {
+ ret void
+}
+
+; CHECK: g:
+; CHECK-NEXT: .cfi_startproc
+; CHECK-NEXT: .quad i
+define void @g() prefix i32* @i {
+ ret void
+}
diff --git a/test/CodeGen/X86/rdrand.ll b/test/CodeGen/X86/rdrand.ll
index 1b16a2d..48182d0 100644
--- a/test/CodeGen/X86/rdrand.ll
+++ b/test/CodeGen/X86/rdrand.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i -mattr=+rdrand | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i -mattr=+rdrnd | FileCheck %s
declare {i16, i32} @llvm.x86.rdrand.16()
declare {i32, i32} @llvm.x86.rdrand.32()
declare {i64, i32} @llvm.x86.rdrand.64()
@@ -11,10 +11,10 @@ define i32 @_rdrand16_step(i16* %random_val) {
ret i32 %isvalid
; CHECK-LABEL: _rdrand16_step:
; CHECK: rdrandw %ax
-; CHECK: movw %ax, (%r[[A0:di|cx]])
; CHECK: movzwl %ax, %ecx
; CHECK: movl $1, %eax
; CHECK: cmovael %ecx, %eax
+; CHECK: movw %cx, (%r[[A0:di|cx]])
; CHECK: ret
}
@@ -26,9 +26,9 @@ define i32 @_rdrand32_step(i32* %random_val) {
ret i32 %isvalid
; CHECK-LABEL: _rdrand32_step:
; CHECK: rdrandl %e[[T0:[a-z]+]]
-; CHECK: movl %e[[T0]], (%r[[A0]])
; CHECK: movl $1, %eax
; CHECK: cmovael %e[[T0]], %eax
+; CHECK: movl %e[[T0]], (%r[[A0]])
; CHECK: ret
}
@@ -40,9 +40,9 @@ define i32 @_rdrand64_step(i64* %random_val) {
ret i32 %isvalid
; CHECK-LABEL: _rdrand64_step:
; CHECK: rdrandq %r[[T1:[a-z]+]]
-; CHECK: movq %r[[T1]], (%r[[A0]])
; CHECK: movl $1, %eax
; CHECK: cmovael %e[[T1]], %eax
+; CHECK: movq %r[[T1]], (%r[[A0]])
; CHECK: ret
}
diff --git a/test/CodeGen/X86/rdseed.ll b/test/CodeGen/X86/rdseed.ll
index edc5069..c219b4a 100644
--- a/test/CodeGen/X86/rdseed.ll
+++ b/test/CodeGen/X86/rdseed.ll
@@ -12,10 +12,10 @@ define i32 @_rdseed16_step(i16* %random_val) {
ret i32 %isvalid
; CHECK-LABEL: _rdseed16_step:
; CHECK: rdseedw %ax
-; CHECK: movw %ax, (%r[[A0:di|cx]])
; CHECK: movzwl %ax, %ecx
; CHECK: movl $1, %eax
; CHECK: cmovael %ecx, %eax
+; CHECK: movw %cx, (%r[[A0:di|cx]])
; CHECK: ret
}
@@ -27,9 +27,9 @@ define i32 @_rdseed32_step(i32* %random_val) {
ret i32 %isvalid
; CHECK-LABEL: _rdseed32_step:
; CHECK: rdseedl %e[[T0:[a-z]+]]
-; CHECK: movl %e[[T0]], (%r[[A0]])
; CHECK: movl $1, %eax
; CHECK: cmovael %e[[T0]], %eax
+; CHECK: movl %e[[T0]], (%r[[A0]])
; CHECK: ret
}
@@ -41,8 +41,8 @@ define i32 @_rdseed64_step(i64* %random_val) {
ret i32 %isvalid
; CHECK-LABEL: _rdseed64_step:
; CHECK: rdseedq %r[[T1:[a-z]+]]
-; CHECK: movq %r[[T1]], (%r[[A0]])
; CHECK: movl $1, %eax
; CHECK: cmovael %e[[T1]], %eax
+; CHECK: movq %r[[T1]], (%r[[A0]])
; CHECK: ret
}
diff --git a/test/CodeGen/X86/rem-2.ll b/test/CodeGen/X86/rem-2.ll
deleted file mode 100644
index 1b2af4b..0000000
--- a/test/CodeGen/X86/rem-2.ll
+++ /dev/null
@@ -1,7 +0,0 @@
-; RUN: llc < %s -march=x86 | not grep cltd
-
-define i32 @test(i32 %X) nounwind readnone {
-entry:
- %0 = srem i32 41, %X
- ret i32 %0
-}
diff --git a/test/CodeGen/X86/rem.ll b/test/CodeGen/X86/rem.ll
index 394070e..733b794 100644
--- a/test/CodeGen/X86/rem.ll
+++ b/test/CodeGen/X86/rem.ll
@@ -1,22 +1,37 @@
-; RUN: llc < %s -march=x86 | not grep div
+; RUN: llc < %s -march=x86 | FileCheck %s
+; CHECK-LABEL: test1:
+; CHECK-NOT: div
define i32 @test1(i32 %X) {
%tmp1 = srem i32 %X, 255 ; <i32> [#uses=1]
ret i32 %tmp1
}
+; CHECK-LABEL: test2:
+; CHECK-NOT: div
define i32 @test2(i32 %X) {
%tmp1 = srem i32 %X, 256 ; <i32> [#uses=1]
ret i32 %tmp1
}
+; CHECK-LABEL: test3:
+; CHECK-NOT: div
define i32 @test3(i32 %X) {
%tmp1 = urem i32 %X, 255 ; <i32> [#uses=1]
ret i32 %tmp1
}
+; CHECK-LABEL: test4:
+; CHECK-NOT: div
define i32 @test4(i32 %X) {
%tmp1 = urem i32 %X, 256 ; <i32> [#uses=1]
ret i32 %tmp1
}
+; CHECK-LABEL: test5:
+; CHECK-NOT: cltd
+define i32 @test5(i32 %X) nounwind readnone {
+entry:
+ %0 = srem i32 41, %X
+ ret i32 %0
+}
diff --git a/test/CodeGen/X86/rounding-ops.ll b/test/CodeGen/X86/rounding-ops.ll
index ace31cf..69f4bfb 100644
--- a/test/CodeGen/X86/rounding-ops.ll
+++ b/test/CodeGen/X86/rounding-ops.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse41 | FileCheck -check-prefix=CHECK-SSE %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse4.1 | FileCheck -check-prefix=CHECK-SSE %s
; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx | FileCheck -check-prefix=CHECK-AVX %s
define float @test1(float %x) nounwind {
diff --git a/test/CodeGen/X86/scalar_widen_div.ll b/test/CodeGen/X86/scalar_widen_div.ll
index e99ea93..5807d5b 100644
--- a/test/CodeGen/X86/scalar_widen_div.ll
+++ b/test/CodeGen/X86/scalar_widen_div.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse4.2 | FileCheck %s
; Verify that when widening a divide/remainder operation, we only generate a
; divide/rem per element since divide/remainder can trap.
diff --git a/test/CodeGen/X86/segmented-stacks-dynamic.ll b/test/CodeGen/X86/segmented-stacks-dynamic.ll
index c2aa617..e170762 100644
--- a/test/CodeGen/X86/segmented-stacks-dynamic.ll
+++ b/test/CodeGen/X86/segmented-stacks-dynamic.ll
@@ -31,7 +31,7 @@ false:
; X32-NEXT: ret
; X32: movl %esp, %eax
-; X32-NEXT: subl %ecx, %eax
+; X32: subl %ecx, %eax
; X32-NEXT: cmpl %eax, %gs:48
; X32: movl %eax, %esp
@@ -52,7 +52,7 @@ false:
; X64-NEXT: ret
; X64: movq %rsp, %[[RDI:rdi|rax]]
-; X64-NEXT: subq %{{.*}}, %[[RDI]]
+; X64: subq %{{.*}}, %[[RDI]]
; X64-NEXT: cmpq %[[RDI]], %fs:112
; X64: movq %[[RDI]], %rsp
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index 5fe2b70..cdd258d 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -34,12 +34,12 @@ bb90: ; preds = %bb84, %bb72
bb91: ; preds = %bb84
ret i32 0
; CHECK-LABEL: test2:
-; CHECK: movnew
-; CHECK: movswl
+; CHECK: cmovnew
+; CHECK: cwtl
; ATOM-LABEL: test2:
-; ATOM: movnew
-; ATOM: movswl
+; ATOM: cmovnew
+; ATOM: cwtl
}
declare i1 @return_false()
@@ -256,8 +256,8 @@ entry:
%call = tail call noalias i8* @_Znam(i64 %D) nounwind noredzone
ret i8* %call
; CHECK-LABEL: test12:
-; CHECK: movq $-1, %[[R:r..]]
; CHECK: mulq
+; CHECK: movq $-1, %[[R:r..]]
; CHECK: cmovnoq %rax, %[[R]]
; CHECK: jmp __Znam
diff --git a/test/CodeGen/X86/setcc-narrowing.ll b/test/CodeGen/X86/setcc-narrowing.ll
new file mode 100644
index 0000000..25cb2c8
--- /dev/null
+++ b/test/CodeGen/X86/setcc-narrowing.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -mtriple=i686-apple-darwin | FileCheck %s
+; PR17338
+
+@t1.global = internal global i64 -1, align 8
+
+define i32 @t1() nounwind ssp {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: cmpl $0, _t1.global
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: movzbl %al, %eax
+; CHECK-NEXT: ret
+ %0 = load i64* @t1.global, align 8
+ %and = and i64 4294967295, %0
+ %cmp = icmp sgt i64 %and, 0
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
diff --git a/test/CodeGen/X86/setcc-sentinals.ll b/test/CodeGen/X86/setcc-sentinals.ll
new file mode 100644
index 0000000..d36e678
--- /dev/null
+++ b/test/CodeGen/X86/setcc-sentinals.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -mcpu=generic -march=x86-64 -asm-verbose=false | FileCheck %s
+
+define zeroext i1 @test0(i64 %x) nounwind {
+; CHECK-LABEL: test0:
+; CHECK-NEXT: incq %[[X:rdi|rcx]]
+; CHECK-NEXT: cmpq $1, %[[X]]
+; CHECK-NEXT: seta %al
+; CHECK-NEXT: ret
+ %cmp1 = icmp ne i64 %x, -1
+ %not.cmp = icmp ne i64 %x, 0
+ %.cmp1 = and i1 %cmp1, %not.cmp
+ ret i1 %.cmp1
+}
diff --git a/test/CodeGen/X86/sha.ll b/test/CodeGen/X86/sha.ll
new file mode 100644
index 0000000..bf81e99
--- /dev/null
+++ b/test/CodeGen/X86/sha.ll
@@ -0,0 +1,139 @@
+; RUN: llc < %s -mattr=+sha -mtriple=x86_64-unknown-unknown | FileCheck %s
+; RUN: not llc < %s -mtriple=x86_64-unknown-unknown
+
+declare <4 x i32> @llvm.x86.sha1rnds4(<4 x i32>, <4 x i32>, i8) nounwind readnone
+
+define <4 x i32> @test_sha1rnds4rr(<4 x i32> %a, <4 x i32> %b) nounwind uwtable {
+entry:
+ %0 = tail call <4 x i32> @llvm.x86.sha1rnds4(<4 x i32> %a, <4 x i32> %b, i8 3)
+ ret <4 x i32> %0
+ ; CHECK: test_sha1rnds4rr
+ ; CHECK: sha1rnds4 $3, %xmm1, %xmm0
+}
+
+define <4 x i32> @test_sha1rnds4rm(<4 x i32> %a, <4 x i32>* %b) nounwind uwtable {
+entry:
+ %0 = load <4 x i32>* %b
+ %1 = tail call <4 x i32> @llvm.x86.sha1rnds4(<4 x i32> %a, <4 x i32> %0, i8 3)
+ ret <4 x i32> %1
+ ; CHECK: test_sha1rnds4rm
+ ; CHECK: sha1rnds4 $3, (%rdi), %xmm0
+}
+
+declare <4 x i32> @llvm.x86.sha1nexte(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @test_sha1nexterr(<4 x i32> %a, <4 x i32> %b) nounwind uwtable {
+entry:
+ %0 = tail call <4 x i32> @llvm.x86.sha1nexte(<4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %0
+ ; CHECK: test_sha1nexterr
+ ; CHECK: sha1nexte %xmm1, %xmm0
+}
+
+define <4 x i32> @test_sha1nexterm(<4 x i32> %a, <4 x i32>* %b) nounwind uwtable {
+entry:
+ %0 = load <4 x i32>* %b
+ %1 = tail call <4 x i32> @llvm.x86.sha1nexte(<4 x i32> %a, <4 x i32> %0)
+ ret <4 x i32> %1
+ ; CHECK: test_sha1nexterm
+ ; CHECK: sha1nexte (%rdi), %xmm0
+}
+
+declare <4 x i32> @llvm.x86.sha1msg1(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @test_sha1msg1rr(<4 x i32> %a, <4 x i32> %b) nounwind uwtable {
+entry:
+ %0 = tail call <4 x i32> @llvm.x86.sha1msg1(<4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %0
+ ; CHECK: test_sha1msg1rr
+ ; CHECK: sha1msg1 %xmm1, %xmm0
+}
+
+define <4 x i32> @test_sha1msg1rm(<4 x i32> %a, <4 x i32>* %b) nounwind uwtable {
+entry:
+ %0 = load <4 x i32>* %b
+ %1 = tail call <4 x i32> @llvm.x86.sha1msg1(<4 x i32> %a, <4 x i32> %0)
+ ret <4 x i32> %1
+ ; CHECK: test_sha1msg1rm
+ ; CHECK: sha1msg1 (%rdi), %xmm0
+}
+
+declare <4 x i32> @llvm.x86.sha1msg2(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @test_sha1msg2rr(<4 x i32> %a, <4 x i32> %b) nounwind uwtable {
+entry:
+ %0 = tail call <4 x i32> @llvm.x86.sha1msg2(<4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %0
+ ; CHECK: test_sha1msg2rr
+ ; CHECK: sha1msg2 %xmm1, %xmm0
+}
+
+define <4 x i32> @test_sha1msg2rm(<4 x i32> %a, <4 x i32>* %b) nounwind uwtable {
+entry:
+ %0 = load <4 x i32>* %b
+ %1 = tail call <4 x i32> @llvm.x86.sha1msg2(<4 x i32> %a, <4 x i32> %0)
+ ret <4 x i32> %1
+ ; CHECK: test_sha1msg2rm
+ ; CHECK: sha1msg2 (%rdi), %xmm0
+}
+
+declare <4 x i32> @llvm.x86.sha256rnds2(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @test_sha256rnds2rr(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind uwtable {
+entry:
+ %0 = tail call <4 x i32> @llvm.x86.sha256rnds2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
+ ret <4 x i32> %0
+ ; CHECK: test_sha256rnds2rr
+ ; CHECK: movaps %xmm0, [[XMM_TMP1:%xmm[1-9][0-9]?]]
+ ; CHECK: movaps %xmm2, %xmm0
+ ; CHECK: sha256rnds2 %xmm1, [[XMM_TMP1]]
+}
+
+define <4 x i32> @test_sha256rnds2rm(<4 x i32> %a, <4 x i32>* %b, <4 x i32> %c) nounwind uwtable {
+entry:
+ %0 = load <4 x i32>* %b
+ %1 = tail call <4 x i32> @llvm.x86.sha256rnds2(<4 x i32> %a, <4 x i32> %0, <4 x i32> %c)
+ ret <4 x i32> %1
+ ; CHECK: test_sha256rnds2rm
+ ; CHECK: movaps %xmm0, [[XMM_TMP2:%xmm[1-9][0-9]?]]
+ ; CHECK: movaps %xmm1, %xmm0
+ ; CHECK: sha256rnds2 (%rdi), [[XMM_TMP2]]
+}
+
+declare <4 x i32> @llvm.x86.sha256msg1(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @test_sha256msg1rr(<4 x i32> %a, <4 x i32> %b) nounwind uwtable {
+entry:
+ %0 = tail call <4 x i32> @llvm.x86.sha256msg1(<4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %0
+ ; CHECK: test_sha256msg1rr
+ ; CHECK: sha256msg1 %xmm1, %xmm0
+}
+
+define <4 x i32> @test_sha256msg1rm(<4 x i32> %a, <4 x i32>* %b) nounwind uwtable {
+entry:
+ %0 = load <4 x i32>* %b
+ %1 = tail call <4 x i32> @llvm.x86.sha256msg1(<4 x i32> %a, <4 x i32> %0)
+ ret <4 x i32> %1
+ ; CHECK: test_sha256msg1rm
+ ; CHECK: sha256msg1 (%rdi), %xmm0
+}
+
+declare <4 x i32> @llvm.x86.sha256msg2(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @test_sha256msg2rr(<4 x i32> %a, <4 x i32> %b) nounwind uwtable {
+entry:
+ %0 = tail call <4 x i32> @llvm.x86.sha256msg2(<4 x i32> %a, <4 x i32> %b)
+ ret <4 x i32> %0
+ ; CHECK: test_sha256msg2rr
+ ; CHECK: sha256msg2 %xmm1, %xmm0
+}
+
+define <4 x i32> @test_sha256msg2rm(<4 x i32> %a, <4 x i32>* %b) nounwind uwtable {
+entry:
+ %0 = load <4 x i32>* %b
+ %1 = tail call <4 x i32> @llvm.x86.sha256msg2(<4 x i32> %a, <4 x i32> %0)
+ ret <4 x i32> %1
+ ; CHECK: test_sha256msg2rm
+ ; CHECK: sha256msg2 (%rdi), %xmm0
+}
\ No newline at end of file
diff --git a/test/CodeGen/X86/shift-bmi2.ll b/test/CodeGen/X86/shift-bmi2.ll
index 0116789..7615754 100644
--- a/test/CodeGen/X86/shift-bmi2.ll
+++ b/test/CodeGen/X86/shift-bmi2.ll
@@ -30,10 +30,11 @@ entry:
%x = load i32* %p
%shl = shl i32 %x, %shamt
; BMI2: shl32p
-; BMI2: shlxl %{{.+}}, ({{.+}}), %{{.+}}
+; Source order scheduling prevents folding, rdar:14208996.
+; BMI2: shlxl %{{.+}}, %{{.+}}, %{{.+}}
; BMI2: ret
; BMI264: shl32p
-; BMI264: shlxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: shlxl %{{.+}}, %{{.+}}, %{{.+}}
; BMI264: ret
ret i32 %shl
}
@@ -74,7 +75,7 @@ entry:
%x = load i64* %p
%shl = shl i64 %x, %shamt
; BMI264: shl64p
-; BMI264: shlxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: shlxq %{{.+}}, %{{.+}}, %{{.+}}
; BMI264: ret
ret i64 %shl
}
@@ -106,10 +107,11 @@ entry:
%x = load i32* %p
%shl = lshr i32 %x, %shamt
; BMI2: lshr32p
-; BMI2: shrxl %{{.+}}, ({{.+}}), %{{.+}}
+; Source order scheduling prevents folding, rdar:14208996.
+; BMI2: shrxl %{{.+}}, %{{.+}}, %{{.+}}
; BMI2: ret
; BMI264: lshr32p
-; BMI264: shrxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: shrxl %{{.+}}, %{{.+}}, %{{.+}}
; BMI264: ret
ret i32 %shl
}
@@ -128,7 +130,7 @@ entry:
%x = load i64* %p
%shl = lshr i64 %x, %shamt
; BMI264: lshr64p
-; BMI264: shrxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: shrxq %{{.+}}, %{{.+}}, %{{.+}}
; BMI264: ret
ret i64 %shl
}
@@ -150,10 +152,11 @@ entry:
%x = load i32* %p
%shl = ashr i32 %x, %shamt
; BMI2: ashr32p
-; BMI2: sarxl %{{.+}}, ({{.+}}), %{{.+}}
+; Source order scheduling prevents folding, rdar:14208996.
+; BMI2: sarxl %{{.+}}, %{{.+}}, %{{.+}}
; BMI2: ret
; BMI264: ashr32p
-; BMI264: sarxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: sarxl %{{.+}}, %{{.+}}, %{{.+}}
; BMI264: ret
ret i32 %shl
}
@@ -172,7 +175,7 @@ entry:
%x = load i64* %p
%shl = ashr i64 %x, %shamt
; BMI264: ashr64p
-; BMI264: sarxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: sarxq %{{.+}}, %{{.+}}, %{{.+}}
; BMI264: ret
ret i64 %shl
}
diff --git a/test/CodeGen/X86/sibcall.ll b/test/CodeGen/X86/sibcall.ll
index 7b774f6..589e9ec 100644
--- a/test/CodeGen/X86/sibcall.ll
+++ b/test/CodeGen/X86/sibcall.ll
@@ -106,10 +106,10 @@ declare i32 @bar2(i32, i32, i32)
define signext i16 @t8() nounwind ssp {
entry:
; 32-LABEL: t8:
-; 32: calll {{_?}}bar3
+; 32: jmp {{_?}}bar3
; 64-LABEL: t8:
-; 64: callq {{_?}}bar3
+; 64: jmp {{_?}}bar3
%0 = tail call signext i16 @bar3() nounwind ; <i16> [#uses=1]
ret i16 %0
}
@@ -122,7 +122,7 @@ entry:
; 32: calll *
; 64-LABEL: t9:
-; 64: callq *
+; 64: jmpq *
%0 = bitcast i32 (i32)* %x to i16 (i32)*
%1 = tail call signext i16 %0(i32 0) nounwind
ret i16 %1
diff --git a/test/CodeGen/X86/sink-hoist.ll b/test/CodeGen/X86/sink-hoist.ll
index 0741635..64f5311 100644
--- a/test/CodeGen/X86/sink-hoist.ll
+++ b/test/CodeGen/X86/sink-hoist.ll
@@ -26,11 +26,10 @@ define double @foo(double %x, double %y, i1 %c) nounwind {
; CHECK-LABEL: split:
; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: jne
-; CHECK-NEXT: movaps
-; CHECK-NEXT: ret
+; CHECK-NEXT: je
; CHECK: divsd
-; CHECK-NEXT: ret
+; CHECK: movaps
+; CHECK: ret
define double @split(double %x, double %y, i1 %c) nounwind {
%a = fdiv double %x, 3.2
%z = select i1 %c, double %a, double %y
@@ -65,7 +64,7 @@ return:
; Sink instructions with dead EFLAGS defs.
; FIXME: Unfail the zzz test if we can correctly mark pregs with the kill flag.
-;
+;
; See <rdar://problem/8030636>. This test isn't valid after we made machine
; sinking more conservative about sinking instructions that define a preg into a
; block when we don't know if the preg is killed within the current block.
diff --git a/test/CodeGen/X86/sqrt-fastmath.ll b/test/CodeGen/X86/sqrt-fastmath.ll
index 9b5179e..fc79e31 100644
--- a/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/test/CodeGen/X86/sqrt-fastmath.ll
@@ -55,6 +55,6 @@ entry:
; Function Attrs: nounwind readnone
declare x86_fp80 @__sqrtl_finite(x86_fp80) #1
-attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #0 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" }
attributes #2 = { nounwind readnone }
diff --git a/test/CodeGen/X86/sse-intrinsics-x86.ll b/test/CodeGen/X86/sse-intrinsics-x86.ll
new file mode 100644
index 0000000..65d44bf
--- /dev/null
+++ b/test/CodeGen/X86/sse-intrinsics-x86.ll
@@ -0,0 +1,308 @@
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse | FileCheck %s
+
+define <4 x float> @test_x86_sse_add_ss(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: addss
+ %res = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_cmp_ps(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: cmpordps
+ %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_cmp_ss(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: cmpordss
+ %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
+
+
+define i32 @test_x86_sse_comieq_ss(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: comiss
+ ; CHECK: sete
+ ; CHECK: movzbl
+ %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_comige_ss(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: comiss
+ ; CHECK: setae
+ ; CHECK: movzbl
+ %res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_comigt_ss(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: comiss
+ ; CHECK: seta
+ ; CHECK: movzbl
+ %res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_comile_ss(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: comiss
+ ; CHECK: setbe
+ ; CHECK: movzbl
+ %res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_comilt_ss(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: comiss
+ ; CHECK: sbb
+ %res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_comineq_ss(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: comiss
+ ; CHECK: setne
+ ; CHECK: movzbl
+ %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_cvtsi2ss(<4 x float> %a0) {
+ ; CHECK: movl
+ ; CHECK: cvtsi2ss
+ %res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 7) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
+
+
+define i32 @test_x86_sse_cvtss2si(<4 x float> %a0) {
+ ; CHECK: cvtss2si
+ %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_cvttss2si(<4 x float> %a0) {
+ ; CHECK: cvttss2si
+ %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_div_ss(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: divss
+ %res = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define void @test_x86_sse_ldmxcsr(i8* %a0) {
+ ; CHECK: movl
+ ; CHECK: ldmxcsr
+ call void @llvm.x86.sse.ldmxcsr(i8* %a0)
+ ret void
+}
+declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind
+
+
+
+define <4 x float> @test_x86_sse_max_ps(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: maxps
+ %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_max_ss(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: maxss
+ %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_min_ps(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: minps
+ %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_min_ss(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: minss
+ %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_movmsk_ps(<4 x float> %a0) {
+ ; CHECK: movmskps
+ %res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
+
+
+
+define <4 x float> @test_x86_sse_mul_ss(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: mulss
+ %res = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_rcp_ps(<4 x float> %a0) {
+ ; CHECK: rcpps
+ %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_rcp_ss(<4 x float> %a0) {
+ ; CHECK: rcpss
+ %res = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_rsqrt_ps(<4 x float> %a0) {
+ ; CHECK: rsqrtps
+ %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_rsqrt_ss(<4 x float> %a0) {
+ ; CHECK: rsqrtss
+ %res = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) {
+ ; CHECK: sqrtps
+ %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_sqrt_ss(<4 x float> %a0) {
+ ; CHECK: sqrtss
+ %res = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
+
+
+define void @test_x86_sse_stmxcsr(i8* %a0) {
+ ; CHECK: movl
+ ; CHECK: stmxcsr
+ call void @llvm.x86.sse.stmxcsr(i8* %a0)
+ ret void
+}
+declare void @llvm.x86.sse.stmxcsr(i8*) nounwind
+
+
+define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
+ ; CHECK: movl
+ ; CHECK: movups
+ call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
+ ret void
+}
+declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
+
+
+define <4 x float> @test_x86_sse_sub_ss(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: subss
+ %res = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: ucomiss
+ ; CHECK: sete
+ ; CHECK: movzbl
+ %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_ucomige_ss(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: ucomiss
+ ; CHECK: setae
+ ; CHECK: movzbl
+ %res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: ucomiss
+ ; CHECK: seta
+ ; CHECK: movzbl
+ %res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_ucomile_ss(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: ucomiss
+ ; CHECK: setbe
+ ; CHECK: movzbl
+ %res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: ucomiss
+ ; CHECK: sbbl
+ %res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: ucomiss
+ ; CHECK: setne
+ ; CHECK: movzbl
+ %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>) nounwind readnone
diff --git a/test/CodeGen/X86/sse2-blend.ll b/test/CodeGen/X86/sse2-blend.ll
index 30a0fbe..1ac9832 100644
--- a/test/CodeGen/X86/sse2-blend.ll
+++ b/test/CodeGen/X86/sse2-blend.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah -mattr=+sse2,-sse41 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=yonah -mattr=+sse2,-sse4.1 | FileCheck %s
; CHECK: vsel_float
; CHECK: pandn
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
new file mode 100644
index 0000000..ff6c10b
--- /dev/null
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -0,0 +1,712 @@
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse2 | FileCheck %s
+
+define <2 x double> @test_x86_sse2_add_sd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: addsd
+ %res = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_cmp_pd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: cmpordpd
+ %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_cmp_sd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: cmpordsd
+ %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone
+
+
+define i32 @test_x86_sse2_comieq_sd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: comisd
+ ; CHECK: sete
+ ; CHECK: movzbl
+ %res = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_comige_sd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: comisd
+ ; CHECK: setae
+ ; CHECK: movzbl
+ %res = call i32 @llvm.x86.sse2.comige.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comige.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_comigt_sd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: comisd
+ ; CHECK: seta
+ ; CHECK: movzbl
+ %res = call i32 @llvm.x86.sse2.comigt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comigt.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_comile_sd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: comisd
+ ; CHECK: setbe
+ ; CHECK: movzbl
+ %res = call i32 @llvm.x86.sse2.comile.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comile.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_comilt_sd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: comisd
+ ; CHECK: sbbl %eax, %eax
+ ; CHECK: andl $1, %eax
+ %res = call i32 @llvm.x86.sse2.comilt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comilt.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_comineq_sd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: comisd
+ ; CHECK: setne
+ ; CHECK: movzbl
+ %res = call i32 @llvm.x86.sse2.comineq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.comineq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_cvtdq2pd(<4 x i32> %a0) {
+ ; CHECK: cvtdq2pd
+ %res = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse2_cvtdq2ps(<4 x i32> %a0) {
+ ; CHECK: cvtdq2ps
+ %res = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %a0) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_cvtpd2dq(<2 x double> %a0) {
+ ; CHECK: cvtpd2dq
+ %res = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse2_cvtpd2ps(<2 x double> %a0) {
+ ; CHECK: cvtpd2ps
+ %res = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_cvtps2dq(<4 x float> %a0) {
+ ; CHECK: cvtps2dq
+ %res = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_cvtps2pd(<4 x float> %a0) {
+ ; CHECK: cvtps2pd
+ %res = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse2_cvtsd2si(<2 x double> %a0) {
+ ; CHECK: cvtsd2si
+ %res = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse2_cvtsd2ss(<4 x float> %a0, <2 x double> %a1) {
+ ; CHECK: cvtsd2ss
+ ; CHECK-NOT: cvtsd2ss %xmm{{[0-9]+}}, %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
+ %res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_cvtsi2sd(<2 x double> %a0) {
+ ; CHECK: movl
+ ; CHECK: cvtsi2sd
+ %res = call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> %a0, i32 7) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_cvtss2sd(<2 x double> %a0, <4 x float> %a1) {
+ ; CHECK: cvtss2sd
+ %res = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> %a0, <4 x float> %a1) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) {
+ ; CHECK: cvttpd2dq
+ %res = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) {
+ ; CHECK: cvttps2dq
+ %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
+
+
+define i32 @test_x86_sse2_cvttsd2si(<2 x double> %a0) {
+ ; CHECK: cvttsd2si
+ %res = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_div_sd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: divsd
+ %res = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+
+define <2 x double> @test_x86_sse2_max_pd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: maxpd
+ %res = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_max_sd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: maxsd
+ %res = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_min_pd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: minpd
+ %res = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_min_sd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: minsd
+ %res = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_movmsk_pd(<2 x double> %a0) {
+ ; CHECK: movmskpd
+ %res = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
+
+
+
+
+define <2 x double> @test_x86_sse2_mul_sd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: test_x86_sse2_mul_sd
+ ; CHECK: mulsd
+ %res = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
+ ; CHECK: packssdw
+ %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: packsswb
+ %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: packuswb
+ %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_padds_b(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: paddsb
+ %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_padds_w(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: paddsw
+ %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_paddus_b(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: paddusb
+ %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_paddus_w(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: paddusw
+ %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_pavg_b(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: pavgb
+ %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_pavg_w(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: pavgw
+ %res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: pmaddwd
+ %res = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_pmaxs_w(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: pmaxsw
+ %res = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_pmaxu_b(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: pmaxub
+ %res = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_pmins_w(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: pminsw
+ %res = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_pminu_b(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: pminub
+ %res = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define i32 @test_x86_sse2_pmovmskb_128(<16 x i8> %a0) {
+ ; CHECK: pmovmskb
+ %res = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_pmulh_w(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: pmulhw
+ %res = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_pmulhu_w(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: pmulhuw
+ %res = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_pmulu_dq(<4 x i32> %a0, <4 x i32> %a1) {
+ ; CHECK: pmuludq
+ %res = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a0, <4 x i32> %a1) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psad_bw(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: psadbw
+ %res = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) {
+ ; CHECK: pslld
+ %res = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
+ ; CHECK: pslldq
+ %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
+ ; CHECK: pslldq
+ %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psll_q(<2 x i64> %a0, <2 x i64> %a1) {
+ ; CHECK: psllq
+ %res = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_psll_w(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: psllw
+ %res = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_pslli_d(<4 x i32> %a0) {
+ ; CHECK: pslld
+ %res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_pslli_q(<2 x i64> %a0) {
+ ; CHECK: psllq
+ %res = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_pslli_w(<8 x i16> %a0) {
+ ; CHECK: psllw
+ %res = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_psra_d(<4 x i32> %a0, <4 x i32> %a1) {
+ ; CHECK: psrad
+ %res = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_psra_w(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: psraw
+ %res = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_psrai_d(<4 x i32> %a0) {
+ ; CHECK: psrad
+ %res = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_psrai_w(<8 x i16> %a0) {
+ ; CHECK: psraw
+ %res = call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_psrl_d(<4 x i32> %a0, <4 x i32> %a1) {
+ ; CHECK: psrld
+ %res = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
+ ; CHECK: psrldq
+ %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
+ ; CHECK: psrldq
+ %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psrl_q(<2 x i64> %a0, <2 x i64> %a1) {
+ ; CHECK: psrlq
+ %res = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_psrl_w(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: psrlw
+ %res = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse2_psrli_d(<4 x i32> %a0) {
+ ; CHECK: psrld
+ %res = call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psrli_q(<2 x i64> %a0) {
+ ; CHECK: psrlq
+ %res = call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_psrli_w(<8 x i16> %a0) {
+ ; CHECK: psrlw
+ %res = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_psubs_b(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: psubsb
+ %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_psubs_w(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: psubsw
+ %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse2_psubus_b(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: psubusb
+ %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse2_psubus_w(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: psubusw
+ %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) {
+ ; CHECK: sqrtpd
+ %res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_sqrt_sd(<2 x double> %a0) {
+ ; CHECK: sqrtsd
+ %res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+
+define void @test_x86_sse2_storel_dq(i8* %a0, <4 x i32> %a1) {
+ ; CHECK: test_x86_sse2_storel_dq
+ ; CHECK: movl
+ ; CHECK: movq
+ call void @llvm.x86.sse2.storel.dq(i8* %a0, <4 x i32> %a1)
+ ret void
+}
+declare void @llvm.x86.sse2.storel.dq(i8*, <4 x i32>) nounwind
+
+
+define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
+ ; CHECK: test_x86_sse2_storeu_dq
+ ; CHECK: movl
+ ; CHECK: movdqu
+ ; add operation forces the execution domain.
+ %a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+ call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
+ ret void
+}
+declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
+
+
+define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
+ ; CHECK: test_x86_sse2_storeu_pd
+ ; CHECK: movl
+ ; CHECK: movupd
+ ; fadd operation forces the execution domain.
+ %a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
+ call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
+ ret void
+}
+declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
+
+
+define <2 x double> @test_x86_sse2_sub_sd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: test_x86_sse2_sub_sd
+ ; CHECK: subsd
+ %res = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: ucomisd
+ ; CHECK: sete
+ ; CHECK: movzbl
+ %res = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_ucomige_sd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: ucomisd
+ ; CHECK: setae
+ ; CHECK: movzbl
+ %res = call i32 @llvm.x86.sse2.ucomige.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomige.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_ucomigt_sd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: ucomisd
+ ; CHECK: seta
+ ; CHECK: movzbl
+ %res = call i32 @llvm.x86.sse2.ucomigt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomigt.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_ucomile_sd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: ucomisd
+ ; CHECK: setbe
+ ; CHECK: movzbl
+ %res = call i32 @llvm.x86.sse2.ucomile.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomile.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_ucomilt_sd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: ucomisd
+ ; CHECK: sbbl
+ %res = call i32 @llvm.x86.sse2.ucomilt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomilt.sd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define i32 @test_x86_sse2_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: ucomisd
+ ; CHECK: setne
+ ; CHECK: movzbl
+ %res = call i32 @llvm.x86.sse2.ucomineq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse2.ucomineq.sd(<2 x double>, <2 x double>) nounwind readnone
diff --git a/test/CodeGen/X86/sse2-vector-shifts.ll b/test/CodeGen/X86/sse2-vector-shifts.ll
index e2d6125..462def9 100644
--- a/test/CodeGen/X86/sse2-vector-shifts.ll
+++ b/test/CodeGen/X86/sse2-vector-shifts.ll
@@ -121,7 +121,7 @@ entry:
}
; CHECK-LABEL: test_sraw_3:
-; CHECK: psraw $16, %xmm0
+; CHECK: psraw $15, %xmm0
; CHECK-NEXT: ret
define <4 x i32> @test_srad_1(<4 x i32> %InVec) {
@@ -151,7 +151,7 @@ entry:
}
; CHECK-LABEL: test_srad_3:
-; CHECK: psrad $32, %xmm0
+; CHECK: psrad $31, %xmm0
; CHECK-NEXT: ret
; SSE Logical Shift Right
diff --git a/test/CodeGen/X86/sse2.ll b/test/CodeGen/X86/sse2.ll
index 217139a..9147c22 100644
--- a/test/CodeGen/X86/sse2.ll
+++ b/test/CodeGen/X86/sse2.ll
@@ -7,7 +7,7 @@ define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
%tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 2, i32 1 >
store <2 x double> %tmp9, <2 x double>* %r, align 16
ret void
-
+
; CHECK-LABEL: test1:
; CHECK: movl 8(%esp), %eax
; CHECK-NEXT: movapd (%eax), %xmm0
@@ -23,12 +23,12 @@ define void @test2(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
%tmp9 = shufflevector <2 x double> %tmp3, <2 x double> %tmp7, <2 x i32> < i32 0, i32 2 >
store <2 x double> %tmp9, <2 x double>* %r, align 16
ret void
-
+
; CHECK-LABEL: test2:
-; CHECK: movl 8(%esp), %eax
-; CHECK-NEXT: movapd (%eax), %xmm0
+; CHECK: movl 4(%esp), %eax
+; CHECK: movl 8(%esp), %ecx
+; CHECK-NEXT: movapd (%ecx), %xmm0
; CHECK-NEXT: movhpd 12(%esp), %xmm0
-; CHECK-NEXT: movl 4(%esp), %eax
; CHECK-NEXT: movapd %xmm0, (%eax)
; CHECK-NEXT: ret
}
@@ -48,7 +48,7 @@ define void @test3(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B) nounwind
store <4 x float> %tmp13, <4 x float>* %res
ret void
; CHECK: @test3
-; CHECK: unpcklps
+; CHECK: unpcklps
}
define void @test4(<4 x float> %X, <4 x float>* %res) nounwind {
@@ -85,9 +85,9 @@ define void @test6(<4 x float>* %res, <4 x float>* %A) nounwind {
%tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> < i32 0, i32 5, i32 6, i32 7 > ; <<4 x float>> [#uses=1]
store <4 x float> %tmp2, <4 x float>* %res
ret void
-
+
; CHECK-LABEL: test6:
-; CHECK: movaps (%eax), %xmm0
+; CHECK: movaps (%ecx), %xmm0
; CHECK: movaps %xmm0, (%eax)
}
@@ -96,7 +96,7 @@ define void @test7() nounwind {
shufflevector <4 x float> %1, <4 x float> zeroinitializer, <4 x i32> zeroinitializer ; <<4 x float>>:2 [#uses=1]
store <4 x float> %2, <4 x float>* null
ret void
-
+
; CHECK-LABEL: test7:
; CHECK: xorps %xmm0, %xmm0
; CHECK: movaps %xmm0, 0
@@ -166,7 +166,7 @@ define void @test13(<4 x float>* %res, <4 x float>* %A, <4 x float>* %B, <4 x fl
store <4 x float> %tmp11, <4 x float>* %res
ret void
; CHECK: test13
-; CHECK: shufps $69, (%eax), %xmm0
+; CHECK: shufps $69, (%ecx), %xmm0
; CHECK: pshufd $-40, %xmm0, %xmm0
}
@@ -178,8 +178,8 @@ define <4 x float> @test14(<4 x float>* %x, <4 x float>* %y) nounwind {
%tmp27 = shufflevector <4 x float> %tmp9, <4 x float> %tmp21, <4 x i32> < i32 0, i32 1, i32 4, i32 5 > ; <<4 x float>> [#uses=1]
ret <4 x float> %tmp27
; CHECK-LABEL: test14:
-; CHECK: subps [[X1:%xmm[0-9]+]], [[X2:%xmm[0-9]+]]
-; CHECK: addps [[X1]], [[X0:%xmm[0-9]+]]
+; CHECK: addps [[X1:%xmm[0-9]+]], [[X0:%xmm[0-9]+]]
+; CHECK: subps [[X1]], [[X2:%xmm[0-9]+]]
; CHECK: movlhps [[X2]], [[X0]]
}
@@ -221,4 +221,3 @@ entry:
%double2float.i = fptrunc <4 x double> %0 to <4 x float>
ret <4 x float> %double2float.i
}
-
diff --git a/test/CodeGen/X86/sse3-intrinsics-x86.ll b/test/CodeGen/X86/sse3-intrinsics-x86.ll
new file mode 100644
index 0000000..dbd14b8
--- /dev/null
+++ b/test/CodeGen/X86/sse3-intrinsics-x86.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse3 | FileCheck %s
+
+define <2 x double> @test_x86_sse3_addsub_pd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: addsubpd
+ %res = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse3_addsub_ps(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: addsubps
+ %res = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse3_hadd_pd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: haddpd
+ %res = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse3_hadd_ps(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: haddps
+ %res = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse3_hsub_pd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: hsubpd
+ %res = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse3_hsub_ps(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: hsubps
+ %res = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse3_ldu_dq(i8* %a0) {
+ ; CHECK: movl
+ ; CHECK: lddqu
+ %res = call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %a0) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse3.ldu.dq(i8*) nounwind readonly
diff --git a/test/CodeGen/X86/sse41-blend.ll b/test/CodeGen/X86/sse41-blend.ll
index bd92d22..a32f5de 100644
--- a/test/CodeGen/X86/sse41-blend.ll
+++ b/test/CodeGen/X86/sse41-blend.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse41 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
;CHECK-LABEL: vsel_float:
;CHECK: blendvps
diff --git a/test/CodeGen/X86/sse41-intrinsics-x86.ll b/test/CodeGen/X86/sse41-intrinsics-x86.ll
new file mode 100644
index 0000000..37eff43
--- /dev/null
+++ b/test/CodeGen/X86/sse41-intrinsics-x86.ll
@@ -0,0 +1,326 @@
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse4.1 | FileCheck %s
+
+define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: blendpd
+ %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+
+define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: blendps
+ %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32) nounwind readnone
+
+
+define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+ ; CHECK: blendvpd
+ %res = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse41_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ; CHECK: blendvps
+ %res = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: dppd
+ %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+
+define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: dpps
+ %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i32) nounwind readnone
+
+
+define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: insertps
+ %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
+
+
+
+define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: mpsadbw
+ %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i32) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
+ ; CHECK: packusdw
+ %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse41_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
+ ; CHECK: pblendvb
+ %res = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: pblendw
+ %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 7) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse41_phminposuw(<8 x i16> %a0) {
+ ; CHECK: phminposuw
+ %res = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse41_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: pmaxsb
+ %res = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) {
+ ; CHECK: pmaxsd
+ %res = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pmaxud(<4 x i32> %a0, <4 x i32> %a1) {
+ ; CHECK: pmaxud
+ %res = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse41_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: pmaxuw
+ %res = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse41_pminsb(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: pminsb
+ %res = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pminsd(<4 x i32> %a0, <4 x i32> %a1) {
+ ; CHECK: pminsd
+ %res = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pminud(<4 x i32> %a0, <4 x i32> %a1) {
+ ; CHECK: pminud
+ %res = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse41_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: pminuw
+ %res = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pmovsxbd(<16 x i8> %a0) {
+ ; CHECK: pmovsxbd
+ %res = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovsxbq(<16 x i8> %a0) {
+ ; CHECK: pmovsxbq
+ %res = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse41_pmovsxbw(<16 x i8> %a0) {
+ ; CHECK: pmovsxbw
+ %res = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovsxdq(<4 x i32> %a0) {
+ ; CHECK: pmovsxdq
+ %res = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pmovsxwd(<8 x i16> %a0) {
+ ; CHECK: pmovsxwd
+ %res = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovsxwq(<8 x i16> %a0) {
+ ; CHECK: pmovsxwq
+ %res = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pmovzxbd(<16 x i8> %a0) {
+ ; CHECK: pmovzxbd
+ %res = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovzxbq(<16 x i8> %a0) {
+ ; CHECK: pmovzxbq
+ %res = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_sse41_pmovzxbw(<16 x i8> %a0) {
+ ; CHECK: pmovzxbw
+ %res = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovzxdq(<4 x i32> %a0) {
+ ; CHECK: pmovzxdq
+ %res = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone
+
+
+define <4 x i32> @test_x86_sse41_pmovzxwd(<8 x i16> %a0) {
+ ; CHECK: pmovzxwd
+ %res = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmovzxwq(<8 x i16> %a0) {
+ ; CHECK: pmovzxwq
+ %res = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse41_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
+ ; CHECK: pmuldq
+ %res = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1) ; <<2 x i64>> [#uses=1]
+ ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define i32 @test_x86_sse41_ptestc(<2 x i64> %a0, <2 x i64> %a1) {
+ ; CHECK: ptest
+ ; CHECK: sbbl
+ %res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
+
+
+define i32 @test_x86_sse41_ptestnzc(<2 x i64> %a0, <2 x i64> %a1) {
+ ; CHECK: ptest
+ ; CHECK: seta
+ ; CHECK: movzbl
+ %res = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %a0, <2 x i64> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
+
+
+define i32 @test_x86_sse41_ptestz(<2 x i64> %a0, <2 x i64> %a1) {
+ ; CHECK: ptest
+ ; CHECK: sete
+ ; CHECK: movzbl
+ %res = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a0, <2 x i64> %a1) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse41_round_pd(<2 x double> %a0) {
+ ; CHECK: roundpd
+ %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
+
+
+define <4 x float> @test_x86_sse41_round_ps(<4 x float> %a0) {
+ ; CHECK: roundps
+ %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
+
+
+define <2 x double> @test_x86_sse41_round_sd(<2 x double> %a0, <2 x double> %a1) {
+ ; CHECK: roundsd
+ %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
+ ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+
+define <4 x float> @test_x86_sse41_round_ss(<4 x float> %a0, <4 x float> %a1) {
+ ; CHECK: roundss
+ %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index 87b64e5..c15e24c 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse41 -mcpu=penryn | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse41 -mcpu=penryn | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64
@g16 = external global i16
diff --git a/test/CodeGen/X86/sse42-intrinsics-x86.ll b/test/CodeGen/X86/sse42-intrinsics-x86.ll
new file mode 100644
index 0000000..5ca8009
--- /dev/null
+++ b/test/CodeGen/X86/sse42-intrinsics-x86.ll
@@ -0,0 +1,182 @@
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse4.2 | FileCheck %s
+
+define i32 @test_x86_sse42_pcmpestri128(<16 x i8> %a0, <16 x i8> %a2) {
+ ; CHECK: movl $7
+ ; CHECK: movl $7
+ ; CHECK: pcmpestri $7
+ ; CHECK: movl
+ %res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+
+define i32 @test_x86_sse42_pcmpestri128_load(<16 x i8>* %a0, <16 x i8>* %a2) {
+ ; CHECK: movl $7
+ ; CHECK: movl $7
+ ; CHECK: pcmpestri $7, (
+ ; CHECK: movl
+ %1 = load <16 x i8>* %a0
+ %2 = load <16 x i8>* %a2
+ %res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %1, i32 7, <16 x i8> %2, i32 7, i8 7) ; <i32> [#uses=1]
+ ret i32 %res
+}
+
+
+define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) {
+ ; CHECK: movl
+ ; CHECK: movl
+ ; CHECK: pcmpestri
+ ; CHECK: seta
+ %res = call i32 @llvm.x86.sse42.pcmpestria128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestria128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+
+define i32 @test_x86_sse42_pcmpestric128(<16 x i8> %a0, <16 x i8> %a2) {
+ ; CHECK: movl
+ ; CHECK: movl
+ ; CHECK: pcmpestri
+ ; CHECK: sbbl
+ %res = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+
+define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) {
+ ; CHECK: movl
+ ; CHECK: movl
+ ; CHECK: pcmpestri
+ ; CHECK: seto
+ %res = call i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+
+define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) {
+ ; CHECK: movl
+ ; CHECK: movl
+ ; CHECK: pcmpestri
+ ; CHECK: sets
+ %res = call i32 @llvm.x86.sse42.pcmpestris128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestris128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+
+define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) {
+ ; CHECK: movl
+ ; CHECK: movl
+ ; CHECK: pcmpestri
+ ; CHECK: sete
+ %res = call i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse42_pcmpestrm128(<16 x i8> %a0, <16 x i8> %a2) {
+ ; CHECK: movl
+ ; CHECK: movl
+ ; CHECK: pcmpestrm
+ ; CHECK-NOT: vmov
+ %res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse42_pcmpestrm128_load(<16 x i8> %a0, <16 x i8>* %a2) {
+ ; CHECK: movl $7
+ ; CHECK: movl $7
+ ; CHECK: pcmpestrm $7,
+ ; CHECK-NOT: vmov
+ %1 = load <16 x i8>* %a2
+ %res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %1, i32 7, i8 7) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+
+
+define i32 @test_x86_sse42_pcmpistri128(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: pcmpistri $7
+ ; CHECK: movl
+ %res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+
+define i32 @test_x86_sse42_pcmpistri128_load(<16 x i8>* %a0, <16 x i8>* %a1) {
+ ; CHECK: pcmpistri $7, (
+ ; CHECK: movl
+ %1 = load <16 x i8>* %a0
+ %2 = load <16 x i8>* %a1
+ %res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %1, <16 x i8> %2, i8 7) ; <i32> [#uses=1]
+ ret i32 %res
+}
+
+
+define i32 @test_x86_sse42_pcmpistria128(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: pcmpistri
+ ; CHECK: seta
+ %res = call i32 @llvm.x86.sse42.pcmpistria128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistria128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+
+define i32 @test_x86_sse42_pcmpistric128(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: pcmpistri
+ ; CHECK: sbbl
+ %res = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+
+define i32 @test_x86_sse42_pcmpistrio128(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: pcmpistri
+ ; CHECK: seto
+ %res = call i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+
+define i32 @test_x86_sse42_pcmpistris128(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: pcmpistri
+ ; CHECK: sets
+ %res = call i32 @llvm.x86.sse42.pcmpistris128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistris128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+
+define i32 @test_x86_sse42_pcmpistriz128(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: pcmpistri
+ ; CHECK: sete
+ %res = call i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
+ ret i32 %res
+}
+declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse42_pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: pcmpistrm $7
+ ; CHECK-NOT: vmov
+ %res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+
+
+define <16 x i8> @test_x86_sse42_pcmpistrm128_load(<16 x i8> %a0, <16 x i8>* %a1) {
+ ; CHECK: pcmpistrm $7, (
+ ; CHECK-NOT: vmov
+ %1 = load <16 x i8>* %a1
+ %res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %1, i8 7) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
diff --git a/test/CodeGen/X86/sse42.ll b/test/CodeGen/X86/sse42.ll
index c787523..db51d99 100644
--- a/test/CodeGen/X86/sse42.ll
+++ b/test/CodeGen/X86/sse42.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse42 | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse42 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.2 | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.2 | FileCheck %s -check-prefix=X64
declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind
declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind
diff --git a/test/CodeGen/X86/sse42_64.ll b/test/CodeGen/X86/sse42_64.ll
index 8b3a69b..b39e76c 100644
--- a/test/CodeGen/X86/sse42_64.ll
+++ b/test/CodeGen/X86/sse42_64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse42 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.2 | FileCheck %s -check-prefix=X64
declare i64 @llvm.x86.sse42.crc32.64.8(i64, i8) nounwind
declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind
diff --git a/test/CodeGen/X86/ssse3-intrinsics-x86.ll b/test/CodeGen/X86/ssse3-intrinsics-x86.ll
new file mode 100644
index 0000000..728cbc9
--- /dev/null
+++ b/test/CodeGen/X86/ssse3-intrinsics-x86.ll
@@ -0,0 +1,120 @@
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+ssse3 | FileCheck %s
+
+define <16 x i8> @test_x86_ssse3_pabs_b_128(<16 x i8> %a0) {
+ ; CHECK: pabsb
+ %res = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8>) nounwind readnone
+
+
+define <4 x i32> @test_x86_ssse3_pabs_d_128(<4 x i32> %a0) {
+ ; CHECK: pabsd
+ %res = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %a0) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32>) nounwind readnone
+
+
+define <8 x i16> @test_x86_ssse3_pabs_w_128(<8 x i16> %a0) {
+ ; CHECK: pabsw
+ %res = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %a0) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16>) nounwind readnone
+
+
+define <4 x i32> @test_x86_ssse3_phadd_d_128(<4 x i32> %a0, <4 x i32> %a1) {
+ ; CHECK: phaddd
+ %res = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <8 x i16> @test_x86_ssse3_phadd_sw_128(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: phaddsw
+ %res = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <8 x i16> @test_x86_ssse3_phadd_w_128(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: phaddw
+ %res = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <4 x i32> @test_x86_ssse3_phsub_d_128(<4 x i32> %a0, <4 x i32> %a1) {
+ ; CHECK: phsubd
+ %res = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <8 x i16> @test_x86_ssse3_phsub_sw_128(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: phsubsw
+ %res = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <8 x i16> @test_x86_ssse3_phsub_w_128(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: phsubw
+ %res = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <8 x i16> @test_x86_ssse3_pmadd_ub_sw_128(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: pmaddubsw
+ %res = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <8 x i16> @test_x86_ssse3_pmul_hr_sw_128(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: pmulhrsw
+ %res = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+
+define <16 x i8> @test_x86_ssse3_pshuf_b_128(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: pshufb
+ %res = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <16 x i8> @test_x86_ssse3_psign_b_128(<16 x i8> %a0, <16 x i8> %a1) {
+ ; CHECK: psignb
+ %res = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8>, <16 x i8>) nounwind readnone
+
+
+define <4 x i32> @test_x86_ssse3_psign_d_128(<4 x i32> %a0, <4 x i32> %a1) {
+ ; CHECK: psignd
+ %res = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
+ ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+
+define <8 x i16> @test_x86_ssse3_psign_w_128(<8 x i16> %a0, <8 x i16> %a1) {
+ ; CHECK: psignw
+ %res = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
+ ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16>, <8 x i16>) nounwind readnone
diff --git a/test/CodeGen/X86/stack-protector-dbginfo.ll b/test/CodeGen/X86/stack-protector-dbginfo.ll
new file mode 100644
index 0000000..bd27ac3
--- /dev/null
+++ b/test/CodeGen/X86/stack-protector-dbginfo.ll
@@ -0,0 +1,97 @@
+; RUN: llc -mtriple=x86_64-apple-darwin < %s -o -
+
+; PR16954
+;
+; Make sure that when we splice off the end of a machine basic block, we include
+; the DBG_VALUE MIs in the terminator sequence.
+
+@a = external global { i64, [56 x i8] }, align 32
+
+; Function Attrs: nounwind sspreq
+define i32 @_Z18read_response_sizev() #0 {
+entry:
+ tail call void @llvm.dbg.value(metadata !22, i64 0, metadata !23), !dbg !39
+ %0 = load i64* getelementptr inbounds ({ i64, [56 x i8] }* @a, i32 0, i32 0), align 8, !dbg !40
+ tail call void @llvm.dbg.value(metadata !63, i64 0, metadata !64), !dbg !71
+ %1 = trunc i64 %0 to i32
+ ret i32 %1
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata)
+
+attributes #0 = { sspreq }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!21, !72}
+
+!0 = metadata !{i32 786449, metadata !1, i32 4, metadata !"clang version 3.4 ", i1 true, metadata !"", i32 0, metadata !2, metadata !5, metadata !8, metadata !20, metadata !5, metadata !""} ; [ DW_TAG_compile_unit ] [/Users/matt/ryan_bug/<unknown>] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !"<unknown>", metadata !"/Users/matt/ryan_bug"}
+!2 = metadata !{metadata !3}
+!3 = metadata !{i32 786436, metadata !1, metadata !4, metadata !"", i32 20, i64 32, i64 32, i32 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
+!4 = metadata !{i32 786451, metadata !1, null, metadata !"C", i32 19, i64 8, i64 8, i32 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 19, size 8, align 8, offset 0] [def] [from ]
+!5 = metadata !{i32 0}
+!6 = metadata !{metadata !7}
+!7 = metadata !{i32 786472, metadata !"max_frame_size", i64 0} ; [ DW_TAG_enumerator ] [max_frame_size :: 0]
+!8 = metadata !{metadata !9}
+!9 = metadata !{i32 786478, metadata !1, metadata !10, metadata !"read_response_size", metadata !"read_response_size", metadata !"_Z18read_response_sizev", i32 27, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @_Z18read_response_sizev, null, null, metadata !14, i32 27} ; [ DW_TAG_subprogram ] [line 27] [def] [read_response_size]
+!10 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/Users/matt/ryan_bug/<unknown>]
+!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{metadata !13}
+!13 = metadata !{i32 786468, null, null, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = metadata !{metadata !15, metadata !19}
+!15 = metadata !{i32 786688, metadata !9, metadata !"b", metadata !10, i32 28, metadata !16, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [b] [line 28]
+!16 = metadata !{i32 786451, metadata !1, null, metadata !"B", i32 16, i64 32, i64 32, i32 0, i32 0, null, metadata !17, i32 0, null, null} ; [ DW_TAG_structure_type ] [B] [line 16, size 32, align 32, offset 0] [def] [from ]
+!17 = metadata !{metadata !18}
+!18 = metadata !{i32 786445, metadata !1, metadata !16, metadata !"end_of_file", i32 17, i64 32, i64 32, i64 0, i32 0, metadata !13} ; [ DW_TAG_member ] [end_of_file] [line 17, size 32, align 32, offset 0] [from int]
+!19 = metadata !{i32 786688, metadata !9, metadata !"c", metadata !10, i32 29, metadata !13, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [c] [line 29]
+!20 = metadata !{}
+!21 = metadata !{i32 2, metadata !"Dwarf Version", i32 2}
+!22 = metadata !{i64* getelementptr inbounds ({ i64, [56 x i8] }* @a, i32 0, i32 0)}
+!23 = metadata !{i32 786689, metadata !24, metadata !"p2", metadata !10, i32 33554444, metadata !32, i32 0, metadata !38} ; [ DW_TAG_arg_variable ] [p2] [line 12]
+!24 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"min<unsigned long long>", metadata !"min<unsigned long long>", metadata !"_ZN3__13minIyEERKT_S3_RS1_", i32 12, metadata !27, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, metadata !33, null, metadata !35, i32 12} ; [ DW_TAG_subprogram ] [line 12] [def] [min<unsigned long long>]
+!25 = metadata !{i32 786489, metadata !26, null, metadata !"__1", i32 1} ; [ DW_TAG_namespace ] [__1] [line 1]
+!26 = metadata !{metadata !"main.cpp", metadata !"/Users/matt/ryan_bug"}
+!27 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !28, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!28 = metadata !{metadata !29, metadata !29, metadata !32}
+!29 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !30} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!30 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !31} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
+!31 = metadata !{i32 786468, null, null, metadata !"long long unsigned int", i32 0, i64 64, i64 64, i64 0, i32 0, i32 7} ; [ DW_TAG_base_type ] [long long unsigned int] [line 0, size 64, align 64, offset 0, enc DW_ATE_unsigned]
+!32 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !31} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from long long unsigned int]
+!33 = metadata !{metadata !34}
+!34 = metadata !{i32 786479, null, metadata !"_Tp", metadata !31, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!35 = metadata !{metadata !36, metadata !37}
+!36 = metadata !{i32 786689, metadata !24, metadata !"p1", metadata !10, i32 16777228, metadata !29, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p1] [line 12]
+!37 = metadata !{i32 786689, metadata !24, metadata !"p2", metadata !10, i32 33554444, metadata !32, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p2] [line 12]
+!38 = metadata !{i32 33, i32 0, metadata !9, null}
+!39 = metadata !{i32 12, i32 0, metadata !24, metadata !38}
+!40 = metadata !{i32 9, i32 0, metadata !41, metadata !59}
+!41 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"min<unsigned long long, __1::A>", metadata !"min<unsigned long long, __1::A>", metadata !"_ZN3__13minIyNS_1AEEERKT_S4_RS2_T0_", i32 7, metadata !42, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, metadata !53, null, metadata !55, i32 8} ; [ DW_TAG_subprogram ] [line 7] [def] [scope 8] [min<unsigned long long, __1::A>]
+!42 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !43, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!43 = metadata !{metadata !29, metadata !29, metadata !32, metadata !44}
+!44 = metadata !{i32 786451, metadata !1, metadata !25, metadata !"A", i32 0, i64 8, i64 8, i32 0, i32 0, null, metadata !45, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [A] [line 0, size 8, align 8, offset 0] [def] [from ]
+!45 = metadata !{metadata !46}
+!46 = metadata !{i32 786478, metadata !1, metadata !44, metadata !"operator()", metadata !"operator()", metadata !"_ZN3__11AclERKiS2_", i32 1, metadata !47, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !52, i32 1} ; [ DW_TAG_subprogram ] [line 1] [operator()]
+!47 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !48, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!48 = metadata !{metadata !13, metadata !49, metadata !50, metadata !50}
+!49 = metadata !{i32 786447, i32 0, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [artificial] [from A]
+!50 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !51} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!51 = metadata !{i32 786470, null, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, metadata !13} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from int]
+!52 = metadata !{i32 786468}
+!53 = metadata !{metadata !34, metadata !54}
+!54 = metadata !{i32 786479, null, metadata !"_Compare", metadata !44, null, i32 0, i32 0} ; [ DW_TAG_template_type_parameter ]
+!55 = metadata !{metadata !56, metadata !57, metadata !58}
+!56 = metadata !{i32 786689, metadata !41, metadata !"p1", metadata !10, i32 16777223, metadata !29, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p1] [line 7]
+!57 = metadata !{i32 786689, metadata !41, metadata !"p2", metadata !10, i32 33554439, metadata !32, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p2] [line 7]
+!58 = metadata !{i32 786689, metadata !41, metadata !"p3", metadata !10, i32 50331656, metadata !44, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p3] [line 8]
+!59 = metadata !{i32 13, i32 0, metadata !24, metadata !38}
+!63 = metadata !{i32 undef}
+!64 = metadata !{i32 786689, metadata !65, metadata !"p1", metadata !10, i32 33554433, metadata !50, i32 0, metadata !40} ; [ DW_TAG_arg_variable ] [p1] [line 1]
+!65 = metadata !{i32 786478, metadata !1, metadata !25, metadata !"operator()", metadata !"operator()", metadata !"_ZN3__11AclERKiS2_", i32 1, metadata !47, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, null, null, metadata !46, metadata !66, i32 2} ; [ DW_TAG_subprogram ] [line 1] [def] [scope 2] [operator()]
+!66 = metadata !{metadata !67, metadata !69, metadata !70}
+!67 = metadata !{i32 786689, metadata !65, metadata !"this", null, i32 16777216, metadata !68, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!68 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, metadata !44} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!69 = metadata !{i32 786689, metadata !65, metadata !"p1", metadata !10, i32 33554433, metadata !50, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p1] [line 1]
+!70 = metadata !{i32 786689, metadata !65, metadata !"", metadata !10, i32 50331650, metadata !50, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [line 2]
+!71 = metadata !{i32 1, i32 0, metadata !65, metadata !40}
+!72 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll b/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll
new file mode 100644
index 0000000..7d499f9
--- /dev/null
+++ b/test/CodeGen/X86/stack-protector-vreg-to-vreg-copy.ll
@@ -0,0 +1,61 @@
+; RUN: llc -mtriple i386-unknown-freebsd10.0 -march=x86 --relocation-model=pic %s -o -
+
+; PR16979
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+target triple = "i386-unknown-freebsd10.0"
+
+@state = internal unnamed_addr global i32 0, align 4
+
+; Function Attrs: nounwind sspreq
+define void @set_state(i32 %s) #0 {
+entry:
+ store i32 %s, i32* @state, align 4
+ ret void
+}
+
+; Function Attrs: nounwind sspreq
+define void @zero_char(i8* nocapture %p) #0 {
+entry:
+ store i8 0, i8* %p, align 1
+ tail call void @g(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) #2
+ ret void
+}
+
+declare void @g(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+; Function Attrs: nounwind sspreq
+define void @do_something(i32 %i) #0 {
+entry:
+ %data = alloca [8 x i8], align 1
+ %0 = load i32* @state, align 4
+ %cmp = icmp eq i32 %0, 0
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ tail call fastcc void @send_int(i32 0)
+ br label %if.end
+
+if.else: ; preds = %entry
+ tail call fastcc void @send_int(i32 %i)
+ %arrayidx = getelementptr inbounds [8 x i8]* %data, i32 0, i32 0
+ call void @zero_char(i8* %arrayidx)
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+; Function Attrs: nounwind sspreq
+define internal fastcc void @send_int(i32 %p) #0 {
+entry:
+ tail call void @f(i32 %p) #2
+ tail call void @g(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) #2
+ ret void
+}
+
+declare void @f(i32) #1
+
+attributes #0 = { nounwind sspreq "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/X86/stack-protector.ll b/test/CodeGen/X86/stack-protector.ll
index a4dbbb9..265ec80 100644
--- a/test/CodeGen/X86/stack-protector.ll
+++ b/test/CodeGen/X86/stack-protector.ll
@@ -2313,18 +2313,22 @@ entry:
; LINUX-I386-LABEL: test19d:
; LINUX-I386: mov{{l|q}} %gs:
; LINUX-I386: calll __stack_chk_fail
+; LINUX-I386-NOT: calll __stack_chk_fail
; LINUX-X64-LABEL: test19d:
; LINUX-X64: mov{{l|q}} %fs:
; LINUX-X64: callq __stack_chk_fail
+; LINUX-X64-NOT: callq __stack_chk_fail
; LINUX-KERNEL-X64-LABEL: test19d:
; LINUX-KERNEL-X64: mov{{l|q}} %gs:
; LINUX-KERNEL-X64: callq __stack_chk_fail
+; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail
; DARWIN-X64-LABEL: test19d:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
+; DARWIN-X64-NOT: callq ___stack_chk_fail
%c = alloca %struct.pair, align 4
%exn.slot = alloca i8*
%ehselector.slot = alloca i32
diff --git a/test/CodeGen/X86/stackmap.ll b/test/CodeGen/X86/stackmap.ll
new file mode 100644
index 0000000..ed95583
--- /dev/null
+++ b/test/CodeGen/X86/stackmap.ll
@@ -0,0 +1,292 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -disable-fp-elim | FileCheck %s
+;
+; Note: Print verbose stackmaps using -debug-only=stackmaps.
+
+; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps
+; CHECK-NEXT: __LLVM_StackMaps:
+; CHECK-NEXT: .long 0
+; Num LargeConstants
+; CHECK-NEXT: .long 1
+; CHECK-NEXT: .quad 4294967296
+; Num Callsites
+; CHECK-NEXT: .long 11
+
+; Constant arguments
+;
+; CHECK-NEXT: .long 1
+; CHECK-NEXT: .long L{{.*}}-_constantargs
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 4
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 65535
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 65536
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long -1
+; LargeConstant at index 0
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 0
+
+define void @constantargs() {
+entry:
+ %0 = inttoptr i64 12345 to i8*
+ tail call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 1, i32 15, i8* %0, i32 0, i64 65535, i64 65536, i64 4294967295, i64 4294967296)
+ ret void
+}
+
+; Inline OSR Exit
+;
+; CHECK-NEXT: .long 3
+; CHECK-NEXT: .long L{{.*}}-_osrinline
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define void @osrinline(i64 %a, i64 %b) {
+entry:
+ ; Runtime void->void call.
+ call void inttoptr (i64 -559038737 to void ()*)()
+ ; Followed by inline OSR patchpoint with 12-byte shadow and 2 live vars.
+ call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 3, i32 12, i64 %a, i64 %b)
+ ret void
+}
+
+; Cold OSR Exit
+;
+; 2 live variables in register.
+;
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .long L{{.*}}-_osrcold
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define void @osrcold(i64 %a, i64 %b) {
+entry:
+ %test = icmp slt i64 %a, %b
+ br i1 %test, label %ret, label %cold
+cold:
+ ; OSR patchpoint with 12-byte nop-slide and 2 live vars.
+ %thunk = inttoptr i64 -559038737 to i8*
+ call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 4, i32 15, i8* %thunk, i32 0, i64 %a, i64 %b)
+ unreachable
+ret:
+ ret void
+}
+
+; Property Read
+; CHECK-NEXT: .long 5
+; CHECK-NEXT: .long L{{.*}}-_propertyRead
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 0
+;
+; FIXME: There are currently no stackmap entries. After moving to
+; AnyRegCC, we will have entries for the object and return value.
+define i64 @propertyRead(i64* %obj) {
+entry:
+ %resolveRead = inttoptr i64 -559038737 to i8*
+ %result = call i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 5, i32 15, i8* %resolveRead, i32 1, i64* %obj)
+ %add = add i64 %result, 3
+ ret i64 %add
+}
+
+; Property Write
+; CHECK-NEXT: .long 6
+; CHECK-NEXT: .long L{{.*}}-_propertyWrite
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 0
+;
+; FIXME: There are currently no stackmap entries. After moving to
+; AnyRegCC, we will have entries for the object and return value.
+define void @propertyWrite(i64 %dummy1, i64* %obj, i64 %dummy2, i64 %a) {
+entry:
+ %resolveWrite = inttoptr i64 -559038737 to i8*
+ call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 6, i32 15, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
+ ret void
+}
+
+; Void JS Call
+;
+; 2 live variables in registers.
+;
+; CHECK-NEXT: .long 7
+; CHECK-NEXT: .long L{{.*}}-_jsVoidCall
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define void @jsVoidCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
+entry:
+ %resolveCall = inttoptr i64 -559038737 to i8*
+ call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 7, i32 15, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+ ret void
+}
+
+; i64 JS Call
+;
+; 2 live variables in registers.
+;
+; CHECK: .long 8
+; CHECK-NEXT: .long L{{.*}}-_jsIntCall
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+define i64 @jsIntCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
+entry:
+ %resolveCall = inttoptr i64 -559038737 to i8*
+ %result = call i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 8, i32 15, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+ %add = add i64 %result, 3
+ ret i64 %add
+}
+
+; Spilled stack map values.
+;
+; Verify 17 stack map entries.
+;
+; CHECK: .long 11
+; CHECK-NEXT: .long L{{.*}}-_spilledValue
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 17
+;
+; Check that at least one is a spilled entry from RBP.
+; Location: Indirect RBP + ...
+; CHECK: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16) {
+entry:
+ call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 11, i32 15, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16)
+ ret void
+}
+
+; Spilled stack map values.
+;
+; Verify 17 stack map entries.
+;
+; CHECK: .long 12
+; CHECK-LABEL: .long L{{.*}}-_spilledStackMapValue
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 17
+;
+; Check that at least one is a spilled entry from RBP.
+; Location: Indirect RBP + ...
+; CHECK: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+define webkit_jscc void @spilledStackMapValue(i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16) {
+entry:
+ call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 12, i32 15, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16)
+ ret void
+}
+
+; Spill a subregister stackmap operand.
+;
+; CHECK: .long 13
+; CHECK-LABEL: .long L{{.*}}-_spillSubReg
+; CHECK-NEXT: .short 0
+; 1 location
+; CHECK-NEXT: .short 1
+;
+; Check that the subregister operand is a 4-byte spill.
+; Location: Indirect, 4-byte, RBP + ...
+; CHECK: .byte 3
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .short 6
+define void @spillSubReg(i64 %arg) #0 {
+bb:
+ br i1 undef, label %bb1, label %bb2
+
+bb1:
+ unreachable
+
+bb2:
+ %tmp = load i64* inttoptr (i64 140685446136880 to i64*)
+ br i1 undef, label %bb16, label %bb17
+
+bb16:
+ unreachable
+
+bb17:
+ %tmp32 = trunc i64 %tmp to i32
+ br i1 undef, label %bb60, label %bb61
+
+bb60:
+ tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
+ tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 13, i32 5, i32 %tmp32)
+ unreachable
+
+bb61:
+ unreachable
+}
+
+; Map a single byte subregister. There is no DWARF register number, so
+; we expect the register to be encoded with the proper size and spill offset. We don't know which
+;
+; CHECK: .long 14
+; CHECK-LABEL: .long L{{.*}}-_subRegOffset
+; CHECK-NEXT: .short 0
+; 2 locations
+; CHECK-NEXT: .short 2
+;
+; Check that each subregister operand is encoded as a 1-byte register location.
+; Location 0: Register, 1-byte, AL
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 0
+;
+; Location 1: Register, 1-byte, BL
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .short 3
+; CHECK-NEXT: .long 0
+define void @subRegOffset(i16 %arg) {
+ %v = mul i16 %arg, 5
+ %a0 = trunc i16 %v to i8
+ tail call void asm sideeffect "nop", "~{bx}"() nounwind
+ %arghi = lshr i16 %v, 8
+ %a1 = trunc i16 %arghi to i8
+ tail call void asm sideeffect "nop", "~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
+ tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 14, i32 5, i8 %a0, i8 %a1)
+ ret void
+}
+
+declare void @llvm.experimental.stackmap(i32, i32, ...)
+declare void @llvm.experimental.patchpoint.void(i32, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i32, i32, i8*, i32, ...)
diff --git a/test/CodeGen/X86/store-narrow.ll b/test/CodeGen/X86/store-narrow.ll
index fab266f..7557f25 100644
--- a/test/CodeGen/X86/store-narrow.ll
+++ b/test/CodeGen/X86/store-narrow.ll
@@ -12,7 +12,7 @@ entry:
%D = or i32 %C, %B
store i32 %D, i32* %a0, align 4
ret void
-
+
; X64-LABEL: test1:
; X64: movb %sil, (%rdi)
@@ -34,8 +34,8 @@ entry:
; X64: movb %sil, 1(%rdi)
; X32-LABEL: test2:
-; X32: movb 8(%esp), %al
-; X32: movb %al, 1(%{{.*}})
+; X32: movb 8(%esp), %[[REG:[abcd]l]]
+; X32: movb %[[REG]], 1(%{{.*}})
}
define void @test3(i32* nocapture %a0, i16 zeroext %a1) nounwind ssp {
@@ -67,8 +67,8 @@ entry:
; X64: movw %si, 2(%rdi)
; X32-LABEL: test4:
-; X32: movl 8(%esp), %eax
-; X32: movw %ax, 2(%{{.*}})
+; X32: movl 8(%esp), %e[[REG:[abcd]x]]
+; X32: movw %[[REG]], 2(%{{.*}})
}
define void @test5(i64* nocapture %a0, i16 zeroext %a1) nounwind ssp {
@@ -84,8 +84,8 @@ entry:
; X64: movw %si, 2(%rdi)
; X32-LABEL: test5:
-; X32: movzwl 8(%esp), %eax
-; X32: movw %ax, 2(%{{.*}})
+; X32: movzwl 8(%esp), %e[[REG:[abcd]x]]
+; X32: movw %[[REG]], 2(%{{.*}})
}
define void @test6(i64* nocapture %a0, i8 zeroext %a1) nounwind ssp {
@@ -102,8 +102,8 @@ entry:
; X32-LABEL: test6:
-; X32: movb 8(%esp), %al
-; X32: movb %al, 5(%{{.*}})
+; X32: movb 8(%esp), %[[REG:[abcd]l]]
+; X32: movb %[[REG]], 5(%{{.*}})
}
define i32 @test7(i64* nocapture %a0, i8 zeroext %a1, i32* %P2) nounwind {
@@ -121,8 +121,8 @@ entry:
; X32-LABEL: test7:
-; X32: movb 8(%esp), %cl
-; X32: movb %cl, 5(%{{.*}})
+; X32: movb 8(%esp), %[[REG:[abcd]l]]
+; X32: movb %[[REG]], 5(%{{.*}})
}
; PR7833
diff --git a/test/CodeGen/X86/tail-call-attrs.ll b/test/CodeGen/X86/tail-call-attrs.ll
new file mode 100644
index 0000000..17ebe99
--- /dev/null
+++ b/test/CodeGen/X86/tail-call-attrs.ll
@@ -0,0 +1,56 @@
+; RUN: llc -mtriple=x86_64-apple-darwin -o - %s | FileCheck %s
+
+; Simple case: completely identical returns, even with extensions, shouldn't be
+; a barrier to tail calls.
+declare zeroext i1 @give_bool()
+define zeroext i1 @test_bool() {
+; CHECK-LABEL: test_bool:
+; CHECK: jmp
+ %call = tail call zeroext i1 @give_bool()
+ ret i1 %call
+}
+
+; Here, there's more zero extension to be done between the call and the return,
+; so a tail call is impossible (well, according to current Clang practice,
+; anyway; the AMD64 ABI isn't crystal clear on the matter).
+declare zeroext i32 @give_i32()
+define zeroext i8 @test_i32() {
+; CHECK-LABEL: test_i32:
+; CHECK: callq _give_i32
+; CHECK: movzbl %al, %eax
+; CHECK: ret
+
+ %call = tail call zeroext i32 @give_i32()
+ %val = trunc i32 %call to i8
+ ret i8 %val
+}
+
+; Here, one function is zeroext and the other is signext. To the extent that
+; these both mean something, they are incompatible, so no tail call is possible.
+declare zeroext i16 @give_unsigned_i16()
+define signext i16 @test_incompatible_i16() {
+; CHECK-LABEL: test_incompatible_i16:
+; CHECK: callq _give_unsigned_i16
+; CHECK: cwtl
+; CHECK: ret
+
+ %call = tail call zeroext i16 @give_unsigned_i16()
+ ret i16 %call
+}
+
+declare inreg i32 @give_i32_inreg()
+define i32 @test_inreg_to_normal() {
+; CHECK-LABEL: test_inreg_to_normal:
+; CHECK: callq _give_i32_inreg
+; CHECK: ret
+ %val = tail call inreg i32 @give_i32_inreg()
+ ret i32 %val
+}
+
+define inreg i32 @test_normal_to_inreg() {
+; CHECK-LABEL: test_normal_to_inreg:
+; CHECK: callq _give_i32
+; CHECK: ret
+ %val = tail call i32 @give_i32()
+ ret i32 %val
+}
diff --git a/test/CodeGen/X86/tailcall-largecode.ll b/test/CodeGen/X86/tailcall-largecode.ll
index e9b8721..f5662d9 100644
--- a/test/CodeGen/X86/tailcall-largecode.ll
+++ b/test/CodeGen/X86/tailcall-largecode.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -tailcallopt -code-model=large | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -tailcallopt -code-model=large -enable-misched=false | FileCheck %s
declare fastcc i32 @callee(i32 %arg)
define fastcc i32 @directcall(i32 %arg) {
diff --git a/test/CodeGen/X86/tbm-intrinsics-x86_64.ll b/test/CodeGen/X86/tbm-intrinsics-x86_64.ll
new file mode 100644
index 0000000..1bc6175
--- /dev/null
+++ b/test/CodeGen/X86/tbm-intrinsics-x86_64.ll
@@ -0,0 +1,43 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=+tbm < %s | FileCheck %s
+
+define i32 @test_x86_tbm_bextri_u32(i32 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_bextri_u32:
+ ; CHECK-NOT: mov
+ ; CHECK: bextr $
+ %0 = tail call i32 @llvm.x86.tbm.bextri.u32(i32 %a, i32 2814)
+ ret i32 %0
+}
+
+declare i32 @llvm.x86.tbm.bextri.u32(i32, i32) nounwind readnone
+
+define i32 @test_x86_tbm_bextri_u32_m(i32* nocapture %a) nounwind readonly {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_bextri_u32_m:
+ ; CHECK-NOT: mov
+ ; CHECK: bextr $
+ %tmp1 = load i32* %a, align 4
+ %0 = tail call i32 @llvm.x86.tbm.bextri.u32(i32 %tmp1, i32 2814)
+ ret i32 %0
+}
+
+define i64 @test_x86_tbm_bextri_u64(i64 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_bextri_u64:
+ ; CHECK-NOT: mov
+ ; CHECK: bextr $
+ %0 = tail call i64 @llvm.x86.tbm.bextri.u64(i64 %a, i64 2814)
+ ret i64 %0
+}
+
+declare i64 @llvm.x86.tbm.bextri.u64(i64, i64) nounwind readnone
+
+define i64 @test_x86_tbm_bextri_u64_m(i64* nocapture %a) nounwind readonly {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_bextri_u64_m:
+ ; CHECK-NOT: mov
+ ; CHECK: bextr $
+ %tmp1 = load i64* %a, align 8
+ %0 = tail call i64 @llvm.x86.tbm.bextri.u64(i64 %tmp1, i64 2814)
+ ret i64 %0
+}
diff --git a/test/CodeGen/X86/tbm_patterns.ll b/test/CodeGen/X86/tbm_patterns.ll
new file mode 100644
index 0000000..79eea10
--- /dev/null
+++ b/test/CodeGen/X86/tbm_patterns.ll
@@ -0,0 +1,253 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+tbm < %s | FileCheck %s
+
+define i32 @test_x86_tbm_bextri_u32(i32 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_bextri_u32:
+ ; CHECK-NOT: mov
+ ; CHECK: bextr $
+ %0 = lshr i32 %a, 4
+ %1 = and i32 %0, 4095
+ ret i32 %1
+}
+
+define i32 @test_x86_tbm_bextri_u32_m(i32* nocapture %a) nounwind readonly {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_bextri_u32_m:
+ ; CHECK-NOT: mov
+ ; CHECK: bextr $
+ %0 = load i32* %a
+ %1 = lshr i32 %0, 4
+ %2 = and i32 %1, 4095
+ ret i32 %2
+}
+
+define i64 @test_x86_tbm_bextri_u64(i64 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_bextri_u64:
+ ; CHECK-NOT: mov
+ ; CHECK: bextr $
+ %0 = lshr i64 %a, 4
+ %1 = and i64 %0, 4095
+ ret i64 %1
+}
+
+define i64 @test_x86_tbm_bextri_u64_m(i64* nocapture %a) nounwind readonly {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_bextri_u64_m:
+ ; CHECK-NOT: mov
+ ; CHECK: bextr $
+ %0 = load i64* %a
+ %1 = lshr i64 %0, 4
+ %2 = and i64 %1, 4095
+ ret i64 %2
+}
+
+define i32 @test_x86_tbm_blcfill_u32(i32 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_blcfill_u32:
+ ; CHECK-NOT: mov
+ ; CHECK: blcfill %
+ %0 = add i32 %a, 1
+ %1 = and i32 %0, %a
+ ret i32 %1
+}
+
+define i64 @test_x86_tbm_blcfill_u64(i64 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_blcfill_u64:
+ ; CHECK-NOT: mov
+ ; CHECK: blcfill %
+ %0 = add i64 %a, 1
+ %1 = and i64 %0, %a
+ ret i64 %1
+}
+
+define i32 @test_x86_tbm_blci_u32(i32 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_blci_u32:
+ ; CHECK-NOT: mov
+ ; CHECK: blci %
+ %0 = add i32 1, %a
+ %1 = xor i32 %0, -1
+ %2 = or i32 %1, %a
+ ret i32 %2
+}
+
+define i64 @test_x86_tbm_blci_u64(i64 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_blci_u64:
+ ; CHECK-NOT: mov
+ ; CHECK: blci %
+ %0 = add i64 1, %a
+ %1 = xor i64 %0, -1
+ %2 = or i64 %1, %a
+ ret i64 %2
+}
+
+define i32 @test_x86_tbm_blci_u32_b(i32 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_blci_u32_b:
+ ; CHECK-NOT: mov
+ ; CHECK: blci %
+ %0 = sub i32 -2, %a
+ %1 = or i32 %0, %a
+ ret i32 %1
+}
+
+define i64 @test_x86_tbm_blci_u64_b(i64 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_blci_u64_b:
+ ; CHECK-NOT: mov
+ ; CHECK: blci %
+ %0 = sub i64 -2, %a
+ %1 = or i64 %0, %a
+ ret i64 %1
+}
+
+define i32 @test_x86_tbm_blcic_u32(i32 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_blcic_u32:
+ ; CHECK-NOT: mov
+ ; CHECK: blcic %
+ %0 = xor i32 %a, -1
+ %1 = add i32 %a, 1
+ %2 = and i32 %1, %0
+ ret i32 %2
+}
+
+define i64 @test_x86_tbm_blcic_u64(i64 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_blcic_u64:
+ ; CHECK-NOT: mov
+ ; CHECK: blcic %
+ %0 = xor i64 %a, -1
+ %1 = add i64 %a, 1
+ %2 = and i64 %1, %0
+ ret i64 %2
+}
+
+define i32 @test_x86_tbm_blcmsk_u32(i32 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_blcmsk_u32:
+ ; CHECK-NOT: mov
+ ; CHECK: blcmsk %
+ %0 = add i32 %a, 1
+ %1 = xor i32 %0, %a
+ ret i32 %1
+}
+
+define i64 @test_x86_tbm_blcmsk_u64(i64 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_blcmsk_u64:
+ ; CHECK-NOT: mov
+ ; CHECK: blcmsk %
+ %0 = add i64 %a, 1
+ %1 = xor i64 %0, %a
+ ret i64 %1
+}
+
+define i32 @test_x86_tbm_blcs_u32(i32 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_blcs_u32:
+ ; CHECK-NOT: mov
+ ; CHECK: blcs %
+ %0 = add i32 %a, 1
+ %1 = or i32 %0, %a
+ ret i32 %1
+}
+
+define i64 @test_x86_tbm_blcs_u64(i64 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_blcs_u64:
+ ; CHECK-NOT: mov
+ ; CHECK: blcs %
+ %0 = add i64 %a, 1
+ %1 = or i64 %0, %a
+ ret i64 %1
+}
+
+define i32 @test_x86_tbm_blsfill_u32(i32 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_blsfill_u32:
+ ; CHECK-NOT: mov
+ ; CHECK: blsfill %
+ %0 = add i32 %a, -1
+ %1 = or i32 %0, %a
+ ret i32 %1
+}
+
+define i64 @test_x86_tbm_blsfill_u64(i64 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_blsfill_u64:
+ ; CHECK-NOT: mov
+ ; CHECK: blsfill %
+ %0 = add i64 %a, -1
+ %1 = or i64 %0, %a
+ ret i64 %1
+}
+
+define i32 @test_x86_tbm_blsic_u32(i32 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_blsic_u32:
+ ; CHECK-NOT: mov
+ ; CHECK: blsic %
+ %0 = xor i32 %a, -1
+ %1 = add i32 %a, -1
+ %2 = or i32 %0, %1
+ ret i32 %2
+}
+
+define i64 @test_x86_tbm_blsic_u64(i64 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_blsic_u64:
+ ; CHECK-NOT: mov
+ ; CHECK: blsic %
+ %0 = xor i64 %a, -1
+ %1 = add i64 %a, -1
+ %2 = or i64 %0, %1
+ ret i64 %2
+}
+
+define i32 @test_x86_tbm_t1mskc_u32(i32 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_t1mskc_u32:
+ ; CHECK-NOT: mov
+ ; CHECK: t1mskc %
+ %0 = xor i32 %a, -1
+ %1 = add i32 %a, 1
+ %2 = or i32 %0, %1
+ ret i32 %2
+}
+
+define i64 @test_x86_tbm_t1mskc_u64(i64 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_t1mskc_u64:
+ ; CHECK-NOT: mov
+ ; CHECK: t1mskc %
+ %0 = xor i64 %a, -1
+ %1 = add i64 %a, 1
+ %2 = or i64 %0, %1
+ ret i64 %2
+}
+
+define i32 @test_x86_tbm_tzmsk_u32(i32 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_tzmsk_u32:
+ ; CHECK-NOT: mov
+ ; CHECK: tzmsk %
+ %0 = xor i32 %a, -1
+ %1 = add i32 %a, -1
+ %2 = and i32 %0, %1
+ ret i32 %2
+}
+
+define i64 @test_x86_tbm_tzmsk_u64(i64 %a) nounwind readnone {
+entry:
+ ; CHECK-LABEL: test_x86_tbm_tzmsk_u64:
+ ; CHECK-NOT: mov
+ ; CHECK: tzmsk %
+ %0 = xor i64 %a, -1
+ %1 = add i64 %a, -1
+ %2 = and i64 %0, %1
+ ret i64 %2
+}
diff --git a/test/CodeGen/X86/test-nofold.ll b/test/CodeGen/X86/test-nofold.ll
index 97db1b3..19fbaaf 100644
--- a/test/CodeGen/X86/test-nofold.ll
+++ b/test/CodeGen/X86/test-nofold.ll
@@ -2,10 +2,10 @@
; rdar://5752025
; We want:
-; CHECK: movl $42, %ecx
-; CHECK-NEXT: movl 4(%esp), %eax
-; CHECK-NEXT: andl $15, %eax
-; CHECK-NEXT: cmovnel %ecx, %eax
+; CHECK: movl 4(%esp), %ecx
+; CHECK-NEXT: andl $15, %ecx
+; CHECK-NEXT: movl $42, %eax
+; CHECK-NEXT: cmovel %ecx, %eax
; CHECK-NEXT: ret
;
; We don't want:
@@ -39,4 +39,3 @@ entry:
%retval = select i1 %tmp4, i32 %tmp2, i32 42 ; <i32> [#uses=1]
ret i32 %retval
}
-
diff --git a/test/CodeGen/X86/tls.ll b/test/CodeGen/X86/tls.ll
index 24284e5..76a8402 100644
--- a/test/CodeGen/X86/tls.ll
+++ b/test/CodeGen/X86/tls.ll
@@ -223,27 +223,22 @@ entry:
define i16 @f11() {
; X32_LINUX-LABEL: f11:
; X32_LINUX: movzwl %gs:s1@NTPOFF, %eax
-; Why is this kill line here, but no where else?
-; X32_LINUX-NEXT: # kill
-; X32_LINUX-NEXT: ret
+; X32_LINUX: ret
; X64_LINUX-LABEL: f11:
; X64_LINUX: movzwl %fs:s1@TPOFF, %eax
-; X64_LINUX-NEXT: # kill
-; X64_LINUX-NEXT: ret
+; X64_LINUX: ret
; X32_WIN-LABEL: f11:
; X32_WIN: movl __tls_index, %eax
; X32_WIN-NEXT: movl %fs:__tls_array, %ecx
; X32_WIN-NEXT: movl (%ecx,%eax,4), %eax
; X32_WIN-NEXT: movzwl _s1@SECREL32(%eax), %eax
-; X32_WIN-NEXT: # kill
-; X32_WIN-NEXT: ret
+; X32_WIN: ret
; X64_WIN-LABEL: f11:
; X64_WIN: movl _tls_index(%rip), %eax
; X64_WIN-NEXT: movq %gs:88, %rcx
; X64_WIN-NEXT: movq (%rcx,%rax,8), %rax
; X64_WIN-NEXT: movzwl s1@SECREL32(%rax), %eax
-; X64_WIN-NEXT: # kill
-; X64_WIN-NEXT: ret
+; X64_WIN: ret
entry:
%tmp1 = load i16* @s1
diff --git a/test/CodeGen/X86/tlv-3.ll b/test/CodeGen/X86/tlv-3.ll
new file mode 100644
index 0000000..4f79305
--- /dev/null
+++ b/test/CodeGen/X86/tlv-3.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -mtriple x86_64-apple-darwin | FileCheck %s
+; PR17964
+
+; CHECK: __DATA,__thread_data,thread_local_regular
+; CHECK: _foo$tlv$init
+@foo = weak_odr thread_local global i8 1, align 4
+
+define i32 @main() {
+ ret i32 0
+}
diff --git a/test/CodeGen/X86/trunc-ext-ld-st.ll b/test/CodeGen/X86/trunc-ext-ld-st.ll
index 408bdc8..d230f1f 100644
--- a/test/CodeGen/X86/trunc-ext-ld-st.ll
+++ b/test/CodeGen/X86/trunc-ext-ld-st.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse41 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
;CHECK-LABEL: load_2_i8:
; A single 16-bit load
diff --git a/test/CodeGen/X86/trunc-to-bool.ll b/test/CodeGen/X86/trunc-to-bool.ll
index 3711cf1..0ed6347 100644
--- a/test/CodeGen/X86/trunc-to-bool.ll
+++ b/test/CodeGen/X86/trunc-to-bool.ll
@@ -22,7 +22,7 @@ ret_false:
ret i1 false
}
; CHECK-LABEL: test2:
-; CHECK: btl %eax
+; CHECK: btl
define i32 @test3(i8* %ptr) nounwind {
%val = load i8* %ptr
diff --git a/test/CodeGen/X86/unaligned-spill-folding.ll b/test/CodeGen/X86/unaligned-spill-folding.ll
new file mode 100644
index 0000000..154ce9e
--- /dev/null
+++ b/test/CodeGen/X86/unaligned-spill-folding.ll
@@ -0,0 +1,49 @@
+; RUN: llc -mtriple=i386-unknown-freebsd -mcpu=core2 -stack-alignment=4 -relocation-model=pic < %s | FileCheck %s -check-prefix=UNALIGNED
+; RUN: llc -mtriple=i386-unknown-freebsd -mcpu=core2 -stack-alignment=16 -relocation-model=pic < %s | FileCheck %s -check-prefix=ALIGNED
+; RUN: llc -mtriple=i386-unknown-freebsd -mcpu=core2 -stack-alignment=4 -force-align-stack -relocation-model=pic < %s | FileCheck %s -check-prefix=FORCEALIGNED
+
+@arr = internal unnamed_addr global [32 x i32] zeroinitializer, align 16
+
+; PR12250
+define i32 @test1() {
+vector.ph:
+ br label %vector.body
+
+vector.body:
+ %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %0 = getelementptr inbounds [32 x i32]* @arr, i32 0, i32 %index
+ %1 = bitcast i32* %0 to <4 x i32>*
+ %wide.load = load <4 x i32>* %1, align 16
+ %2 = add nsw <4 x i32> %wide.load, <i32 10, i32 10, i32 10, i32 10>
+ %3 = xor <4 x i32> %2, <i32 123345, i32 123345, i32 123345, i32 123345>
+ %4 = add nsw <4 x i32> %3, <i32 112, i32 112, i32 112, i32 112>
+ %5 = xor <4 x i32> %4, <i32 543345, i32 543345, i32 543345, i32 543345>
+ %6 = add nsw <4 x i32> %5, <i32 73, i32 73, i32 73, i32 73>
+ %7 = xor <4 x i32> %6, <i32 345987, i32 345987, i32 345987, i32 345987>
+ %8 = add nsw <4 x i32> %7, <i32 48, i32 48, i32 48, i32 48>
+ %9 = xor <4 x i32> %8, <i32 123987, i32 123987, i32 123987, i32 123987>
+ store <4 x i32> %9, <4 x i32>* %1, align 16
+ %index.next = add i32 %index, 4
+ %10 = icmp eq i32 %index.next, 32
+ br i1 %10, label %middle.block, label %vector.body
+
+middle.block:
+ ret i32 0
+
+; We can't fold the spill into a padd unless the stack is aligned. Just spilling
+; doesn't force stack realignment, though.
+; UNALIGNED-LABEL: @test1
+; UNALIGNED-NOT: andl $-{{..}}, %esp
+; UNALIGNED: movdqu {{.*}} # 16-byte Folded Spill
+; UNALIGNED-NOT: paddd {{.*}} # 16-byte Folded Reload
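+; (Illustration, assuming the usual legacy-SSE constraint: a folded reload
+; such as "paddd (%esp), %xmm0" takes a memory operand that must be 16-byte
+; aligned. With only 4-byte stack alignment and no realignment, the spill slot
+; may be misaligned, so the reload cannot be folded into the paddd, matching
+; the UNALIGNED-NOT line above.)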
+
+; ALIGNED-LABEL: @test1
+; ALIGNED-NOT: andl $-{{..}}, %esp
+; ALIGNED: movdqa {{.*}} # 16-byte Spill
+; ALIGNED: paddd {{.*}} # 16-byte Folded Reload
+
+; FORCEALIGNED-LABEL: @test1
+; FORCEALIGNED: andl $-{{..}}, %esp
+; FORCEALIGNED: movdqa {{.*}} # 16-byte Spill
+; FORCEALIGNED: paddd {{.*}} # 16-byte Folded Reload
+}
diff --git a/test/CodeGen/X86/unknown-location.ll b/test/CodeGen/X86/unknown-location.ll
index 2422de9..d7ae469 100644
--- a/test/CodeGen/X86/unknown-location.ll
+++ b/test/CodeGen/X86/unknown-location.ll
@@ -19,12 +19,13 @@ entry:
}
!llvm.dbg.cu = !{!3}
+!llvm.module.flags = !{!12}
!0 = metadata !{i32 786689, metadata !1, metadata !"x", metadata !2, i32 1, metadata !6} ; [ DW_TAG_arg_variable ]
!1 = metadata !{i32 786478, metadata !10, metadata !2, metadata !"foo", metadata !"foo", metadata !"foo", i32 1, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 false, i32 (i32, i32, i32, i32)* @foo, null, null, null, i32 1} ; [ DW_TAG_subprogram ]
!2 = metadata !{i32 786473, metadata !10} ; [ DW_TAG_file_type ]
!3 = metadata !{i32 786449, metadata !10, i32 12, metadata !"producer", i1 false, metadata !"", i32 0, metadata !11, metadata !11, metadata !9, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!4 = metadata !{i32 786453, metadata !10, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null} ; [ DW_TAG_subroutine_type ]
+!4 = metadata !{i32 786453, metadata !10, metadata !2, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
!5 = metadata !{metadata !6}
!6 = metadata !{i32 786468, metadata !10, metadata !2, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
!7 = metadata !{i32 786443, metadata !2, metadata !1, i32 1, i32 30, i32 0} ; [ DW_TAG_lexical_block ]
@@ -32,3 +33,4 @@ entry:
!9 = metadata !{metadata !1}
!10 = metadata !{metadata !"test.c", metadata !"/dir"}
!11 = metadata !{i32 0}
+!12 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/v-binop-widen.ll b/test/CodeGen/X86/v-binop-widen.ll
index 8655c6c..fca4da6 100644
--- a/test/CodeGen/X86/v-binop-widen.ll
+++ b/test/CodeGen/X86/v-binop-widen.ll
@@ -1,7 +1,7 @@
; RUN: llc -mcpu=generic -march=x86 -mattr=+sse < %s | FileCheck %s
-; CHECK: divss
; CHECK: divps
; CHECK: divps
+; CHECK: divss
%vec = type <9 x float>
define %vec @vecdiv( %vec %p1, %vec %p2)
@@ -9,4 +9,3 @@ define %vec @vecdiv( %vec %p1, %vec %p2)
%result = fdiv %vec %p1, %p2
ret %vec %result
}
-
diff --git a/test/CodeGen/X86/v-binop-widen2.ll b/test/CodeGen/X86/v-binop-widen2.ll
index 569586a..3342111 100644
--- a/test/CodeGen/X86/v-binop-widen2.ll
+++ b/test/CodeGen/X86/v-binop-widen2.ll
@@ -2,9 +2,9 @@
; RUN: llc -march=x86 -mcpu=atom -mattr=+sse < %s | FileCheck -check-prefix=ATOM %s
%vec = type <6 x float>
+; CHECK: divps
; CHECK: divss
; CHECK: divss
-; CHECK: divps
; Scheduler causes a different instruction order to be produced on Intel Atom
; ATOM: divps
diff --git a/test/CodeGen/X86/v4i32load-crash.ll b/test/CodeGen/X86/v4i32load-crash.ll
new file mode 100644
index 0000000..052c4c3
--- /dev/null
+++ b/test/CodeGen/X86/v4i32load-crash.ll
@@ -0,0 +1,27 @@
+; RUN: llc --mcpu=x86-64 --mattr=ssse3 < %s
+
+; PR18045:
+; Selection failure for a 'v4i32 load'.
+; This instruction is not legal for X86 CPUs with SSE below 'sse4.1'.
+; The node was generated by the static function EltsFromConsecutiveLoads in
+; X86ISelLowering.cpp after the legalize stage.
+
+@e = external global [4 x i32], align 4
+@f = external global [4 x i32], align 4
+
+; Function Attrs: nounwind
+define void @fn3(i32 %el) {
+entry:
+ %0 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i32 0)
+ %1 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i32 1)
+ %2 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i32 2)
+ %3 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i32 3)
+ %4 = insertelement <4 x i32> undef, i32 %0, i32 0
+ %5 = insertelement <4 x i32> %4, i32 %1, i32 1
+ %6 = insertelement <4 x i32> %5, i32 %2, i32 2
+ %7 = insertelement <4 x i32> %6, i32 %3, i32 3
+ %8 = add <4 x i32> %6, %7
+ store <4 x i32> %8, <4 x i32>* bitcast ([4 x i32]* @f to <4 x i32>*)
+ ret void
+}
+
diff --git a/test/CodeGen/X86/vec_compare-sse4.ll b/test/CodeGen/X86/vec_compare-sse4.ll
index a08d9f5..084d611 100644
--- a/test/CodeGen/X86/vec_compare-sse4.ll
+++ b/test/CodeGen/X86/vec_compare-sse4.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -march=x86 -mattr=-sse3,+sse2 | FileCheck %s -check-prefix=SSE2
-; RUN: llc < %s -march=x86 -mattr=-sse42,+sse41 | FileCheck %s -check-prefix=SSE41
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s -check-prefix=SSE42
+; RUN: llc < %s -march=x86 -mattr=-sse4.2,+sse4.1 | FileCheck %s -check-prefix=SSE41
+; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s -check-prefix=SSE42
define <2 x i64> @test1(<2 x i64> %A, <2 x i64> %B) nounwind {
; SSE42-LABEL: test1:
diff --git a/test/CodeGen/X86/vec_extract-sse4.ll b/test/CodeGen/X86/vec_extract-sse4.ll
index 42d7f27..3cb519a 100644
--- a/test/CodeGen/X86/vec_extract-sse4.ll
+++ b/test/CodeGen/X86/vec_extract-sse4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse41 -o %t
+; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse4.1 -o %t
; RUN: not grep extractps %t
; RUN: not grep pextrd %t
; RUN: not grep pshufd %t
diff --git a/test/CodeGen/X86/vec_extract.ll b/test/CodeGen/X86/vec_extract.ll
index 2c8796b..88f5a58 100644
--- a/test/CodeGen/X86/vec_extract.ll
+++ b/test/CodeGen/X86/vec_extract.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse2,-sse41 -o %t
+; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse2,-sse4.1 -o %t
; RUN: grep movss %t | count 4
; RUN: grep movhlps %t | count 1
; RUN: not grep pshufd %t
diff --git a/test/CodeGen/X86/vec_fpext.ll b/test/CodeGen/X86/vec_fpext.ll
index 863712f..7ec07ae 100644
--- a/test/CodeGen/X86/vec_fpext.ll
+++ b/test/CodeGen/X86/vec_fpext.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse41,-avx | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse4.1,-avx | FileCheck %s
; RUN: llc < %s -march=x86 -mcpu=corei7-avx | FileCheck --check-prefix=AVX %s
; PR11674
diff --git a/test/CodeGen/X86/vec_insert-2.ll b/test/CodeGen/X86/vec_insert-2.ll
index bfac1ba..fe20a47 100644
--- a/test/CodeGen/X86/vec_insert-2.ll
+++ b/test/CodeGen/X86/vec_insert-2.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-sse41 | FileCheck --check-prefix=X32 %s
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,-sse41 | FileCheck --check-prefix=X64 %s
+; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | FileCheck --check-prefix=X32 %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse2,-sse4.1 | FileCheck --check-prefix=X64 %s
define <4 x float> @t1(float %s, <4 x float> %tmp) nounwind {
; X32-LABEL: t1:
diff --git a/test/CodeGen/X86/vec_insert-3.ll b/test/CodeGen/X86/vec_insert-3.ll
index a18cd864..a871339 100644
--- a/test/CodeGen/X86/vec_insert-3.ll
+++ b/test/CodeGen/X86/vec_insert-3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,-sse41 | grep punpcklqdq | count 1
+; RUN: llc < %s -march=x86-64 -mattr=+sse2,-sse4.1 | grep punpcklqdq | count 1
define <2 x i64> @t1(i64 %s, <2 x i64> %tmp) nounwind {
%tmp1 = insertelement <2 x i64> %tmp, i64 %s, i32 1
diff --git a/test/CodeGen/X86/vec_insert-7.ll b/test/CodeGen/X86/vec_insert-7.ll
index 268b5c4..6d4f828 100644
--- a/test/CodeGen/X86/vec_insert-7.ll
+++ b/test/CodeGen/X86/vec_insert-7.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx,+sse42 -mtriple=i686-apple-darwin9 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+mmx,+sse4.2 -mtriple=i686-apple-darwin9 | FileCheck %s
; MMX insertelement is not available; these are promoted to XMM.
; (Without SSE they are split to two ints, and the code is much better.)
diff --git a/test/CodeGen/X86/vec_insert-8.ll b/test/CodeGen/X86/vec_insert-8.ll
index 650951c..917832c 100644
--- a/test/CodeGen/X86/vec_insert-8.ll
+++ b/test/CodeGen/X86/vec_insert-8.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse41 -o %t
+; RUN: llc < %s -march=x86 -mattr=+sse4.1 -o %t
; tests variable insert and extract of a 4 x i32
diff --git a/test/CodeGen/X86/vec_insert-9.ll b/test/CodeGen/X86/vec_insert-9.ll
index e5a7ccc..5f2e676 100644
--- a/test/CodeGen/X86/vec_insert-9.ll
+++ b/test/CodeGen/X86/vec_insert-9.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse41 > %t
+; RUN: llc < %s -march=x86 -mattr=+sse4.1 > %t
; RUN: grep pinsrd %t | count 1
define <4 x i32> @var_insert2(<4 x i32> %x, i32 %val, i32 %idx) nounwind {
diff --git a/test/CodeGen/X86/vec_insert.ll b/test/CodeGen/X86/vec_insert.ll
index 4e5d445..0ed8f10 100644
--- a/test/CodeGen/X86/vec_insert.ll
+++ b/test/CodeGen/X86/vec_insert.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-sse41 | grep movss | count 1
-; RUN: llc < %s -march=x86 -mattr=+sse2,-sse41 | not grep pinsrw
+; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | grep movss | count 1
+; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | not grep pinsrw
define void @test(<4 x float>* %F, i32 %I) nounwind {
%tmp = load <4 x float>* %F ; <<4 x float>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_round.ll b/test/CodeGen/X86/vec_round.ll
new file mode 100644
index 0000000..baa2f58
--- /dev/null
+++ b/test/CodeGen/X86/vec_round.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mcpu=nehalem -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @use(<2 x double>)
+
+; CHECK-LABEL: @test
+; CHECK: callq round
+
+; Function Attrs: nounwind uwtable
+define void @test() {
+entry:
+ %tmp = call <2 x double> @llvm.round.v2f64(<2 x double> undef)
+ call void @use(<2 x double> %tmp)
+ ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <2 x double> @llvm.round.v2f64(<2 x double>) #0
+
+attributes #0 = { nounwind readonly }
+
diff --git a/test/CodeGen/X86/vec_set-8.ll b/test/CodeGen/X86/vec_set-8.ll
index 66056d0..41061ae 100644
--- a/test/CodeGen/X86/vec_set-8.ll
+++ b/test/CodeGen/X86/vec_set-8.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=-avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=-avx | FileCheck %s
; CHECK-NOT: movsd
; CHECK: movd {{%rdi|%rcx}}, %xmm0
; CHECK-NOT: movsd
diff --git a/test/CodeGen/X86/vec_set-9.ll b/test/CodeGen/X86/vec_set-9.ll
index 6979f6b..a739090 100644
--- a/test/CodeGen/X86/vec_set-9.ll
+++ b/test/CodeGen/X86/vec_set-9.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=-avx,-pad-short-functions | FileCheck %s
; CHECK: test3
; CHECK: movd
diff --git a/test/CodeGen/X86/vec_set-C.ll b/test/CodeGen/X86/vec_set-C.ll
index 133f23b..052da30 100644
--- a/test/CodeGen/X86/vec_set-C.ll
+++ b/test/CodeGen/X86/vec_set-C.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mattr=+sse2 | grep movq
-; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mattr=+sse2 | grep mov | count 1
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-linux -mattr=+sse2 | grep movd
+; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mattr=+sse2,-avx | grep movq
+; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mattr=+sse2,-avx | grep mov | count 1
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-linux -mattr=+sse2,-avx | grep movd
define <2 x i64> @t1(i64 %x) nounwind {
%tmp8 = insertelement <2 x i64> zeroinitializer, i64 %x, i32 0
diff --git a/test/CodeGen/X86/vec_set.ll b/test/CodeGen/X86/vec_set.ll
index 7f5f8dd..53d880b 100644
--- a/test/CodeGen/X86/vec_set.ll
+++ b/test/CodeGen/X86/vec_set.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-sse41 | grep punpckl | count 7
+; RUN: llc < %s -march=x86 -mattr=+sse2,-sse4.1 | grep punpckl | count 7
define void @test(<8 x i16>* %b, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
%tmp = insertelement <8 x i16> zeroinitializer, i16 %a0, i32 0 ; <<8 x i16>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_setcc.ll b/test/CodeGen/X86/vec_setcc.ll
index bcfd4d3..fc8a56d 100644
--- a/test/CodeGen/X86/vec_setcc.ll
+++ b/test/CodeGen/X86/vec_setcc.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse2 | FileCheck %s -check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse41 | FileCheck %s -check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse4.1 | FileCheck %s -check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx | FileCheck %s -check-prefix=AVX
define <16 x i8> @v16i8_icmp_uge(<16 x i8> %a, <16 x i8> %b) nounwind readnone ssp uwtable {
@@ -124,3 +124,64 @@ define <4 x i32> @v4i32_icmp_ule(<4 x i32> %a, <4 x i32> %b) nounwind readnone s
; AVX: pcmpeqd %xmm1, %xmm0, %xmm0
}
+; At one point we were incorrectly constant-folding a setcc to 0x1 instead of
+; 0xff, leading to a constpool load. The instruction doesn't matter here, but it
+; should set all bits to 1.
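+; A short derivation of the expected constant (illustrative): in every lane
+; (l == r) | (l != r) is true, so sext(<16 x i1> all-true) must be <16 x i8>
+; with every byte 0xff. "pcmpeqd %xmm0, %xmm0" is the idiomatic way to
+; materialize an all-ones register, which is what the checks below expect
+; instead of a constant-pool load.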
+define <16 x i8> @test_setcc_constfold_vi8(<16 x i8> %l, <16 x i8> %r) {
+ %test1 = icmp eq <16 x i8> %l, %r
+ %mask1 = sext <16 x i1> %test1 to <16 x i8>
+
+ %test2 = icmp ne <16 x i8> %l, %r
+ %mask2 = sext <16 x i1> %test2 to <16 x i8>
+
+ %res = or <16 x i8> %mask1, %mask2
+ ret <16 x i8> %res
+; SSE2-LABEL: test_setcc_constfold_vi8:
+; SSE2: pcmpeqd %xmm0, %xmm0
+
+; SSE41-LABEL: test_setcc_constfold_vi8:
+; SSE41: pcmpeqd %xmm0, %xmm0
+
+; AVX-LABEL: test_setcc_constfold_vi8:
+; AVX: vpcmpeqd %xmm0, %xmm0, %xmm0
+}
+
+; Make sure sensible results come from doing extension afterwards
+define <16 x i8> @test_setcc_constfold_vi1(<16 x i8> %l, <16 x i8> %r) {
+ %test1 = icmp eq <16 x i8> %l, %r
+ %test2 = icmp ne <16 x i8> %l, %r
+
+ %res = or <16 x i1> %test1, %test2
+ %mask = sext <16 x i1> %res to <16 x i8>
+ ret <16 x i8> %mask
+; SSE2-LABEL: test_setcc_constfold_vi1:
+; SSE2: pcmpeqd %xmm0, %xmm0
+
+; SSE41-LABEL: test_setcc_constfold_vi1:
+; SSE41: pcmpeqd %xmm0, %xmm0
+
+; AVX-LABEL: test_setcc_constfold_vi1:
+; AVX: vpcmpeqd %xmm0, %xmm0, %xmm0
+}
+
+
+; The 64-bit case is also particularly important, as the constant "-1" is
+; probably just 32 bits wide.
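+; Concretely (illustrative): each <2 x i64> lane must be sext(true) =
+; 0xffffffffffffffff; folding the constant to only 32 bits would give
+; 0x00000000ffffffff per lane and force a constant-pool load rather than the
+; pcmpeqd expected below.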
+define <2 x i64> @test_setcc_constfold_vi64(<2 x i64> %l, <2 x i64> %r) {
+ %test1 = icmp eq <2 x i64> %l, %r
+ %mask1 = sext <2 x i1> %test1 to <2 x i64>
+
+ %test2 = icmp ne <2 x i64> %l, %r
+ %mask2 = sext <2 x i1> %test2 to <2 x i64>
+
+ %res = or <2 x i64> %mask1, %mask2
+ ret <2 x i64> %res
+; SSE2-LABEL: test_setcc_constfold_vi64:
+; SSE2: pcmpeqd %xmm0, %xmm0
+
+; SSE41-LABEL: test_setcc_constfold_vi64:
+; SSE41: pcmpeqd %xmm0, %xmm0
+
+; AVX-LABEL: test_setcc_constfold_vi64:
+; AVX: vpcmpeqd %xmm0, %xmm0, %xmm0
+}
diff --git a/test/CodeGen/X86/vec_shift4.ll b/test/CodeGen/X86/vec_shift4.ll
index 9ef7fbd..e2fe45c 100644
--- a/test/CodeGen/X86/vec_shift4.ll
+++ b/test/CodeGen/X86/vec_shift4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse41 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse4.1 | FileCheck %s
define <2 x i64> @shl1(<4 x i32> %r, <4 x i32> %a) nounwind readnone ssp {
entry:
diff --git a/test/CodeGen/X86/vec_shuffle-14.ll b/test/CodeGen/X86/vec_shuffle-14.ll
index 95e9a18..8f25197 100644
--- a/test/CodeGen/X86/vec_shuffle-14.ll
+++ b/test/CodeGen/X86/vec_shuffle-14.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s -check-prefix=X86-32
-; RUN: llc < %s -march=x86-64 -mattr=+sse2 | FileCheck %s -check-prefix=X86-64
+; RUN: llc < %s -march=x86 -mattr=+sse2,-avx | FileCheck %s -check-prefix=X86-32
+; RUN: llc < %s -march=x86-64 -mattr=+sse2,-avx | FileCheck %s -check-prefix=X86-64
define <4 x i32> @t1(i32 %a) nounwind {
entry:
diff --git a/test/CodeGen/X86/vec_shuffle-17.ll b/test/CodeGen/X86/vec_shuffle-17.ll
index ebc8c5b..f2f96ba 100644
--- a/test/CodeGen/X86/vec_shuffle-17.ll
+++ b/test/CodeGen/X86/vec_shuffle-17.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=-avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=-avx | FileCheck %s
; CHECK-NOT: xor
; CHECK: movd {{%rdi|%rcx}}, %xmm0
; CHECK-NOT: xor
diff --git a/test/CodeGen/X86/vec_shuffle-25.ll b/test/CodeGen/X86/vec_shuffle-25.ll
index d9b2388..3f42a13 100644
--- a/test/CodeGen/X86/vec_shuffle-25.ll
+++ b/test/CodeGen/X86/vec_shuffle-25.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -o %t
+; RUN: llc < %s -march=x86 -mattr=sse4.1 -o %t
; RUN: grep unpcklps %t | count 3
; RUN: grep unpckhps %t | count 1
diff --git a/test/CodeGen/X86/vec_shuffle-26.ll b/test/CodeGen/X86/vec_shuffle-26.ll
index 4c56f84..00e8e73 100644
--- a/test/CodeGen/X86/vec_shuffle-26.ll
+++ b/test/CodeGen/X86/vec_shuffle-26.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=generic -mattr=sse41 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=generic -mattr=sse4.1 | FileCheck %s
; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck -check-prefix=ATOM %s
; Transpose example using the more generic vector shuffle. Return float8
diff --git a/test/CodeGen/X86/vec_shuffle-27.ll b/test/CodeGen/X86/vec_shuffle-27.ll
index 0aff822..c9b2fb5 100644
--- a/test/CodeGen/X86/vec_shuffle-27.ll
+++ b/test/CodeGen/X86/vec_shuffle-27.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=sse41 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=penryn -mattr=sse4.1 | FileCheck %s
; ModuleID = 'vec_shuffle-27.bc'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
@@ -7,10 +7,10 @@ target triple = "i686-apple-cl.1.0"
define <8 x float> @my2filter4_1d(<4 x float> %a, <8 x float> %T0, <8 x float> %T1) nounwind readnone {
entry:
; CHECK: subps
-; CHECK: mulps
-; CHECK: addps
; CHECK: subps
; CHECK: mulps
+; CHECK: mulps
+; CHECK: addps
; CHECK: addps
%tmp7 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3 > ; <<8 x float>> [#uses=1]
%sub = fsub <8 x float> %T1, %T0 ; <<8 x float>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_shuffle-36.ll b/test/CodeGen/X86/vec_shuffle-36.ll
index 9a06015..f1d0f93 100644
--- a/test/CodeGen/X86/vec_shuffle-36.ll
+++ b/test/CodeGen/X86/vec_shuffle-36.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=penryn -mattr=sse41 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=penryn -mattr=sse4.1 | FileCheck %s
define <8 x i16> @shuf6(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
; CHECK: pshufb
diff --git a/test/CodeGen/X86/vec_shuffle-39.ll b/test/CodeGen/X86/vec_shuffle-39.ll
index 1560454..8fd9a5c 100644
--- a/test/CodeGen/X86/vec_shuffle-39.ll
+++ b/test/CodeGen/X86/vec_shuffle-39.ll
@@ -54,8 +54,8 @@ entry:
define <2 x double> @t3() nounwind readonly {
bb:
; CHECK-LABEL: t3:
-; CHECK: punpcklqdq %xmm1, %xmm0
; CHECK: movq (%rax), %xmm1
+; CHECK: punpcklqdq %xmm2, %xmm0
; CHECK: movsd %xmm1, %xmm0
%tmp0 = load i128* null, align 1
%tmp1 = load <2 x i32>* undef, align 8
@@ -72,9 +72,9 @@ bb:
define <2 x i64> @t4() nounwind readonly {
bb:
; CHECK-LABEL: t4:
-; CHECK: punpcklqdq %xmm0, %xmm1
; CHECK: movq (%rax), %xmm0
-; CHECK: movsd %xmm1, %xmm0
+; CHECK: punpcklqdq %{{xmm.}}, %[[XMM:xmm[0-9]]]
+; CHECK: movsd %[[XMM]], %xmm0
%tmp0 = load i128* null, align 1
%tmp1 = load <2 x i32>* undef, align 8
%tmp2 = bitcast i128 %tmp0 to <16 x i8>
diff --git a/test/CodeGen/X86/vec_splat-3.ll b/test/CodeGen/X86/vec_splat-3.ll
index 60e3005..754cbf4 100644
--- a/test/CodeGen/X86/vec_splat-3.ll
+++ b/test/CodeGen/X86/vec_splat-3.ll
@@ -1,4 +1,4 @@
-; RUN: llc <%s -march=x86 -mcpu=penryn -mattr=sse41 | FileCheck %s
+; RUN: llc <%s -march=x86 -mcpu=penryn -mattr=sse4.1 | FileCheck %s
; Splat test for v8i16
define <8 x i16> @shuf_8i16_0(<8 x i16> %T0, <8 x i16> %T1) nounwind readnone {
diff --git a/test/CodeGen/X86/vec_split.ll b/test/CodeGen/X86/vec_split.ll
new file mode 100644
index 0000000..f9e7c20
--- /dev/null
+++ b/test/CodeGen/X86/vec_split.ll
@@ -0,0 +1,42 @@
+; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s -check-prefix=SSE4
+; RUN: llc -march=x86-64 -mcpu=corei7-avx < %s | FileCheck %s -check-prefix=AVX1
+; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2
+
+define <16 x i16> @split16(<16 x i16> %a, <16 x i16> %b, <16 x i8> %__mask) {
+; SSE4-LABEL: split16:
+; SSE4: pminuw
+; SSE4: pminuw
+; SSE4: ret
+; AVX1-LABEL: split16:
+; AVX1: vpminuw
+; AVX1: vpminuw
+; AVX1: ret
+; AVX2-LABEL: split16:
+; AVX2: vpminuw
+; AVX2: ret
+ %1 = icmp ult <16 x i16> %a, %b
+ %2 = select <16 x i1> %1, <16 x i16> %a, <16 x i16> %b
+ ret <16 x i16> %2
+}
+
+define <32 x i16> @split32(<32 x i16> %a, <32 x i16> %b, <32 x i8> %__mask) {
+; SSE4-LABEL: split32:
+; SSE4: pminuw
+; SSE4: pminuw
+; SSE4: pminuw
+; SSE4: pminuw
+; SSE4: ret
+; AVX1-LABEL: split32:
+; AVX1: vpminuw
+; AVX1: vpminuw
+; AVX1: vpminuw
+; AVX1: vpminuw
+; AVX1: ret
+; AVX2-LABEL: split32:
+; AVX2: vpminuw
+; AVX2: vpminuw
+; AVX2: ret
+ %1 = icmp ult <32 x i16> %a, %b
+ %2 = select <32 x i1> %1, <32 x i16> %a, <32 x i16> %b
+ ret <32 x i16> %2
+}
diff --git a/test/CodeGen/X86/vec_ss_load_fold.ll b/test/CodeGen/X86/vec_ss_load_fold.ll
index 2eb911f..80f12a2 100644
--- a/test/CodeGen/X86/vec_ss_load_fold.ll
+++ b/test/CodeGen/X86/vec_ss_load_fold.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse,+sse2,+sse41 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s
target datalayout = "e-p:32:32"
target triple = "i686-apple-darwin8.7.2"
diff --git a/test/CodeGen/X86/vector-variable-idx2.ll b/test/CodeGen/X86/vector-variable-idx2.ll
index d47df90..6e8ae2e 100644
--- a/test/CodeGen/X86/vector-variable-idx2.ll
+++ b/test/CodeGen/X86/vector-variable-idx2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse41
+; RUN: llc < %s -march=x86-64 -mattr=+sse4.1
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin11.0.0"
diff --git a/test/CodeGen/X86/vsplit-and.ll b/test/CodeGen/X86/vsplit-and.ll
index 3b7fdff..c16b294 100644
--- a/test/CodeGen/X86/vsplit-and.ll
+++ b/test/CodeGen/X86/vsplit-and.ll
@@ -14,7 +14,7 @@ define void @t0(<2 x i64>* %dst, <2 x i64> %src1, <2 x i64> %src2) nounwind read
define void @t2(<3 x i64>* %dst, <3 x i64> %src1, <3 x i64> %src2) nounwind readonly {
; CHECK: t2
-; CHECK-NOT: pand
+; CHECK: pand
; CHECK: ret
%cmp1 = icmp ne <3 x i64> %src1, zeroinitializer
%cmp2 = icmp ne <3 x i64> %src2, zeroinitializer
diff --git a/test/CodeGen/X86/weak_def_can_be_hidden.ll b/test/CodeGen/X86/weak_def_can_be_hidden.ll
new file mode 100644
index 0000000..f78f357
--- /dev/null
+++ b/test/CodeGen/X86/weak_def_can_be_hidden.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=x86_64-apple-darwin -O0 < %s | FileCheck %s
+
+@v1 = linkonce_odr global i32 32
+; CHECK: .globl _v1
+; CHECK: .weak_def_can_be_hidden _v1
+
+define i32 @f1() {
+ %x = load i32 * @v1
+ ret i32 %x
+}
+
+@v2 = linkonce_odr global i32 32
+; CHECK: .globl _v2
+; CHECK: .weak_definition _v2
+
+@v3 = linkonce_odr unnamed_addr global i32 32
+; CHECK: .globl _v3
+; CHECK: .weak_def_can_be_hidden _v3
+
+define i32* @f2() {
+ ret i32* @v2
+}
+
+define i32* @f3() {
+ ret i32* @v3
+}
diff --git a/test/CodeGen/X86/widen_arith-1.ll b/test/CodeGen/X86/widen_arith-1.ll
index 661cde8..6041356 100644
--- a/test/CodeGen/X86/widen_arith-1.ll
+++ b/test/CodeGen/X86/widen_arith-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+sse4.2 | FileCheck %s
define void @update(<3 x i8>* %dst, <3 x i8>* %src, i32 %n) nounwind {
entry:
diff --git a/test/CodeGen/X86/widen_arith-2.ll b/test/CodeGen/X86/widen_arith-2.ll
index d35abc3..1b81e9f 100644
--- a/test/CodeGen/X86/widen_arith-2.ll
+++ b/test/CodeGen/X86/widen_arith-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
; CHECK: padd
; CHECK: pand
diff --git a/test/CodeGen/X86/widen_arith-3.ll b/test/CodeGen/X86/widen_arith-3.ll
index d86042a..d2b8e6e 100644
--- a/test/CodeGen/X86/widen_arith-3.ll
+++ b/test/CodeGen/X86/widen_arith-3.ll
@@ -1,7 +1,5 @@
-; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+sse42 -post-RA-scheduler=true | FileCheck %s
-; CHECK: incl
-; CHECK: incl
-; CHECK: incl
+; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+sse4.2 -post-RA-scheduler=true | FileCheck %s
+; CHECK: paddd
; Widen a v3i16 to v8i16 to do a vector add
diff --git a/test/CodeGen/X86/widen_arith-4.ll b/test/CodeGen/X86/widen_arith-4.ll
index 63c8d0e..5207e1f 100644
--- a/test/CodeGen/X86/widen_arith-4.ll
+++ b/test/CodeGen/X86/widen_arith-4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse4.2 | FileCheck %s
; CHECK: psubw
; CHECK-NEXT: pmullw
diff --git a/test/CodeGen/X86/widen_arith-5.ll b/test/CodeGen/X86/widen_arith-5.ll
index 41df0e4..70b6a8a 100644
--- a/test/CodeGen/X86/widen_arith-5.ll
+++ b/test/CodeGen/X86/widen_arith-5.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse4.2 | FileCheck %s
; CHECK: movdqa
; CHECK: pslld $2
; CHECK: psubd
diff --git a/test/CodeGen/X86/widen_arith-6.ll b/test/CodeGen/X86/widen_arith-6.ll
index b983d14..329048a 100644
--- a/test/CodeGen/X86/widen_arith-6.ll
+++ b/test/CodeGen/X86/widen_arith-6.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
; CHECK: mulps
; CHECK: addps
diff --git a/test/CodeGen/X86/widen_cast-1.ll b/test/CodeGen/X86/widen_cast-1.ll
index 56c6364..d115929 100644
--- a/test/CodeGen/X86/widen_cast-1.ll
+++ b/test/CodeGen/X86/widen_cast-1.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=x86 -mcpu=generic -mattr=+sse42 < %s | FileCheck %s
+; RUN: llc -march=x86 -mcpu=generic -mattr=+sse4.2 < %s | FileCheck %s
; RUN: llc -march=x86 -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s
-; CHECK: paddd
; CHECK: movl
+; CHECK: paddd
; CHECK: movlpd
; Scheduler causes a different instruction order to be produced
diff --git a/test/CodeGen/X86/widen_cast-2.ll b/test/CodeGen/X86/widen_cast-2.ll
index 3979ce4..40b42fb 100644
--- a/test/CodeGen/X86/widen_cast-2.ll
+++ b/test/CodeGen/X86/widen_cast-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=nehalem -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=nehalem -mattr=+sse4.2 | FileCheck %s
; CHECK: pextrd
; CHECK: pextrd
; CHECK: movd
diff --git a/test/CodeGen/X86/widen_cast-3.ll b/test/CodeGen/X86/widen_cast-3.ll
index 87486d9..40a8dc5 100644
--- a/test/CodeGen/X86/widen_cast-3.ll
+++ b/test/CodeGen/X86/widen_cast-3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
; CHECK: paddd
; CHECK: pextrd
; CHECK: pextrd
diff --git a/test/CodeGen/X86/widen_cast-4.ll b/test/CodeGen/X86/widen_cast-4.ll
index 5ea5426..1bc06a7 100644
--- a/test/CodeGen/X86/widen_cast-4.ll
+++ b/test/CodeGen/X86/widen_cast-4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
; CHECK: psraw
; CHECK: psraw
diff --git a/test/CodeGen/X86/widen_cast-5.ll b/test/CodeGen/X86/widen_cast-5.ll
index 9086d3a..ccf0bd1 100644
--- a/test/CodeGen/X86/widen_cast-5.ll
+++ b/test/CodeGen/X86/widen_cast-5.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
; CHECK: movl
; CHECK: movlpd
diff --git a/test/CodeGen/X86/widen_cast-6.ll b/test/CodeGen/X86/widen_cast-6.ll
index 3903234..7c06ad8 100644
--- a/test/CodeGen/X86/widen_cast-6.ll
+++ b/test/CodeGen/X86/widen_cast-6.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse41 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse4.1 | FileCheck %s
; CHECK: movd
; Test bit convert that requires widening in the operand.
diff --git a/test/CodeGen/X86/widen_conv-1.ll b/test/CodeGen/X86/widen_conv-1.ll
index 51f1c88..9f6778c 100644
--- a/test/CodeGen/X86/widen_conv-1.ll
+++ b/test/CodeGen/X86/widen_conv-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
; CHECK: paddq
; truncate v2i64 to v2i32
diff --git a/test/CodeGen/X86/widen_conv-2.ll b/test/CodeGen/X86/widen_conv-2.ll
index db8fa93..906f7cd 100644
--- a/test/CodeGen/X86/widen_conv-2.ll
+++ b/test/CodeGen/X86/widen_conv-2.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
-; CHECK: cwtl
-; CHECK: cwtl
+; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
+; CHECK: {{cwtl|movswl}}
+; CHECK: {{cwtl|movswl}}
; sign extension v2i16 to v2i32
diff --git a/test/CodeGen/X86/widen_conv-3.ll b/test/CodeGen/X86/widen_conv-3.ll
index a25fae9..a2f3d7b 100644
--- a/test/CodeGen/X86/widen_conv-3.ll
+++ b/test/CodeGen/X86/widen_conv-3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
; CHECK: cvtsi2ss
; sign to float v2i16 to v2f32
diff --git a/test/CodeGen/X86/widen_conv-4.ll b/test/CodeGen/X86/widen_conv-4.ll
index 1158e04..f633592 100644
--- a/test/CodeGen/X86/widen_conv-4.ll
+++ b/test/CodeGen/X86/widen_conv-4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=nehalem -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=nehalem -mattr=+sse4.2 | FileCheck %s
; CHECK-NOT: cvtsi2ss
; unsigned to float v7i16 to v7f32
diff --git a/test/CodeGen/X86/widen_extract-1.ll b/test/CodeGen/X86/widen_extract-1.ll
index c4fe43a..6832de1 100644
--- a/test/CodeGen/X86/widen_extract-1.ll
+++ b/test/CodeGen/X86/widen_extract-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mcpu=nehalem -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=nehalem -mattr=+sse4.2 | FileCheck %s
; widen extract subvector
define void @convert(<2 x double>* %dst.addr, <3 x double> %src) {
diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll
index f0f94e4..26815a4 100644
--- a/test/CodeGen/X86/widen_load-2.ll
+++ b/test/CodeGen/X86/widen_load-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -o - -mcpu=generic -march=x86-64 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=generic -march=x86-64 -mattr=+sse4.2 | FileCheck %s
; Test based on pr5626 to load/store
;
@@ -73,9 +73,7 @@ define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) {
; CHECK: add3i16
%i16vec3 = type <3 x i16>
define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind {
-; CHECK: addl
-; CHECK: addl
-; CHECK: addl
+; CHECK: paddd
; CHECK: ret
%a = load %i16vec3* %ap, align 16
%b = load %i16vec3* %bp, align 16
@@ -135,9 +133,7 @@ define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18*
; CHECK: add3i8
%i8vec3 = type <3 x i8>
define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind {
-; CHECK: addb
-; CHECK: addb
-; CHECK: addb
+; CHECK: paddd
; CHECK: ret
%a = load %i8vec3* %ap, align 16
%b = load %i8vec3* %bp, align 16
diff --git a/test/CodeGen/X86/widen_shuffle-1.ll b/test/CodeGen/X86/widen_shuffle-1.ll
index c7d2044..803402b 100644
--- a/test/CodeGen/X86/widen_shuffle-1.ll
+++ b/test/CodeGen/X86/widen_shuffle-1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse42 | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+sse4.2 | FileCheck %s
; widening shuffle v3float and then an add
define void @shuf(<3 x float>* %dst.addr, <3 x float> %src1,<3 x float> %src2) nounwind {
diff --git a/test/CodeGen/X86/win64_alloca_dynalloca.ll b/test/CodeGen/X86/win64_alloca_dynalloca.ll
index 275ebf9..aff5305 100644
--- a/test/CodeGen/X86/win64_alloca_dynalloca.ll
+++ b/test/CodeGen/X86/win64_alloca_dynalloca.ll
@@ -1,10 +1,13 @@
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 | FileCheck %s -check-prefix=M64
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32 | FileCheck %s -check-prefix=W64
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-win32-macho | FileCheck %s -check-prefix=EFI
+; RUN: llc < %s -mcpu=generic -enable-misched=false -mtriple=x86_64-mingw32 | FileCheck %s -check-prefix=M64
+; RUN: llc < %s -mcpu=generic -enable-misched=false -mtriple=x86_64-win32 | FileCheck %s -check-prefix=W64
+; RUN: llc < %s -mcpu=generic -enable-misched=false -mtriple=x86_64-win32-macho | FileCheck %s -check-prefix=EFI
; PR8777
; PR8778
-define i64 @foo(i64 %n, i64 %x) nounwind {
+define i64 @unaligned(i64 %n, i64 %x) nounwind {
+; M64-LABEL: unaligned:
+; W64-LABEL: unaligned:
+; EFI-LABEL: unaligned:
entry:
%buf0 = alloca i8, i64 4096, align 1
@@ -49,18 +52,18 @@ entry:
%r = call i64 @bar(i64 %n, i64 %x, i64 %n, i8* %buf0, i8* %buf1) nounwind
; M64: subq $48, %rsp
-; M64: leaq -4096(%rbp), %r9
; M64: movq %rax, 32(%rsp)
+; M64: leaq -4096(%rbp), %r9
; M64: callq bar
; W64: subq $48, %rsp
-; W64: leaq -4096(%rbp), %r9
; W64: movq %rax, 32(%rsp)
+; W64: leaq -4096(%rbp), %r9
; W64: callq bar
; EFI: subq $48, %rsp
-; EFI: leaq -[[B0OFS]](%rbp), %r9
; EFI: movq [[R64]], 32(%rsp)
+; EFI: leaq -[[B0OFS]](%rbp), %r9
; EFI: callq _bar
ret i64 %r
@@ -71,4 +74,51 @@ entry:
}
+define i64 @aligned(i64 %n, i64 %x) nounwind {
+; M64-LABEL: aligned:
+; W64-LABEL: aligned:
+; EFI-LABEL: aligned:
+entry:
+
+ %buf1 = alloca i8, i64 %n, align 128
+
+; M64: leaq 15(%{{.*}}), %rax
+; M64: andq $-16, %rax
+; M64: callq ___chkstk
+; M64: movq %rsp, [[R2:%r.*]]
+; M64: andq $-128, [[R2]]
+; M64: movq [[R2]], %rsp
+
+; W64: leaq 15(%{{.*}}), %rax
+; W64: andq $-16, %rax
+; W64: callq __chkstk
+; W64: subq %rax, %rsp
+; W64: movq %rsp, [[R2:%r.*]]
+; W64: andq $-128, [[R2]]
+; W64: movq [[R2]], %rsp
+
+; EFI: leaq 15(%{{.*}}), [[R1:%r.*]]
+; EFI: andq $-16, [[R1]]
+; EFI: movq %rsp, [[R64:%r.*]]
+; EFI: subq [[R1]], [[R64]]
+; EFI: andq $-128, [[R64]]
+; EFI: movq [[R64]], %rsp
+
+ %r = call i64 @bar(i64 %n, i64 %x, i64 %n, i8* undef, i8* %buf1) nounwind
+
+; M64: subq $48, %rsp
+; M64: movq [[R2]], 32(%rsp)
+; M64: callq bar
+
+; W64: subq $48, %rsp
+; W64: movq [[R2]], 32(%rsp)
+; W64: callq bar
+
+; EFI: subq $48, %rsp
+; EFI: movq [[R64]], 32(%rsp)
+; EFI: callq _bar
+
+ ret i64 %r
+}
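+; Note on the sequence checked above (illustrative): the "andq $-128" clears
+; the low seven bits of the stack-pointer copy, rounding it down to a 128-byte
+; boundary before it is moved back into %rsp, which satisfies the
+; "alloca i8, i64 %n, align 128" request.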
+
declare i64 @bar(i64, i64, i64, i8* nocapture, i8* nocapture) nounwind
diff --git a/test/CodeGen/X86/x86-64-pic-10.ll b/test/CodeGen/X86/x86-64-pic-10.ll
index 3ec172b..da8082b 100644
--- a/test/CodeGen/X86/x86-64-pic-10.ll
+++ b/test/CodeGen/X86/x86-64-pic-10.ll
@@ -9,4 +9,6 @@ entry:
ret void
}
-declare extern_weak i32 @f()
+define weak i32 @f() {
+ ret i32 42
+}
diff --git a/test/CodeGen/X86/x86-64-psub.ll b/test/CodeGen/X86/x86-64-psub.ll
index be09a4f..183ddf4 100644
--- a/test/CodeGen/X86/x86-64-psub.ll
+++ b/test/CodeGen/X86/x86-64-psub.ll
@@ -4,8 +4,8 @@
; This test checks that the operands of packed sub instructions are
; never interchanged by the "Two-Address instruction pass".
-declare { i64, double } @getFirstParam()
-declare { i64, double } @getSecondParam()
+declare { i64, double } @getFirstParam()
+declare { i64, double } @getSecondParam()
define i64 @test_psubb() {
entry:
@@ -28,9 +28,10 @@ entry:
; CHECK-LABEL: test_psubb:
; CHECK: callq getFirstParam
+; CHECK: movq %rax, [[TEMP:%[a-z0-9]+]]
; CHECK: callq getSecondParam
+; CHECK: movd [[TEMP]], [[PARAM1:%[a-z0-9]+]]
; CHECK: movd %rax, [[PARAM2:%[a-z0-9]+]]
-; CHECK: movq (%rsp), [[PARAM1:%[a-z0-9]+]]
; CHECK: psubb [[PARAM2]], [[PARAM1]]
; CHECK: ret
@@ -55,9 +56,10 @@ entry:
; CHECK-LABEL: test_psubw:
; CHECK: callq getFirstParam
+; CHECK: movq %rax, [[TEMP:%[a-z0-9]+]]
; CHECK: callq getSecondParam
+; CHECK: movd [[TEMP]], [[PARAM1:%[a-z0-9]+]]
; CHECK: movd %rax, [[PARAM2:%[a-z0-9]+]]
-; CHECK: movq (%rsp), [[PARAM1:%[a-z0-9]+]]
; CHECK: psubw [[PARAM2]], [[PARAM1]]
; CHECK: ret
@@ -83,9 +85,10 @@ entry:
; CHECK-LABEL: test_psubd:
; CHECK: callq getFirstParam
+; CHECK: movq %rax, [[TEMP:%[a-z0-9]+]]
; CHECK: callq getSecondParam
+; CHECK: movd [[TEMP]], [[PARAM1:%[a-z0-9]+]]
; CHECK: movd %rax, [[PARAM2:%[a-z0-9]+]]
-; CHECK: movq (%rsp), [[PARAM1:%[a-z0-9]+]]
; CHECK: psubd [[PARAM2]], [[PARAM1]]
; CHECK: ret
@@ -110,9 +113,10 @@ entry:
; CHECK-LABEL: test_psubsb:
; CHECK: callq getFirstParam
+; CHECK: movq %rax, [[TEMP:%[a-z0-9]+]]
; CHECK: callq getSecondParam
+; CHECK: movd [[TEMP]], [[PARAM1:%[a-z0-9]+]]
; CHECK: movd %rax, [[PARAM2:%[a-z0-9]+]]
-; CHECK: movq (%rsp), [[PARAM1:%[a-z0-9]+]]
; CHECK: psubsb [[PARAM2]], [[PARAM1]]
; CHECK: ret
@@ -137,9 +141,10 @@ entry:
; CHECK-LABEL: test_psubswv:
; CHECK: callq getFirstParam
+; CHECK: movq %rax, [[TEMP:%[a-z0-9]+]]
; CHECK: callq getSecondParam
+; CHECK: movd [[TEMP]], [[PARAM1:%[a-z0-9]+]]
; CHECK: movd %rax, [[PARAM2:%[a-z0-9]+]]
-; CHECK: movq (%rsp), [[PARAM1:%[a-z0-9]+]]
; CHECK: psubsw [[PARAM2]], [[PARAM1]]
; CHECK: ret
@@ -164,9 +169,10 @@ entry:
; CHECK-LABEL: test_psubusbv:
; CHECK: callq getFirstParam
+; CHECK: movq %rax, [[TEMP:%[a-z0-9]+]]
; CHECK: callq getSecondParam
+; CHECK: movd [[TEMP]], [[PARAM1:%[a-z0-9]+]]
; CHECK: movd %rax, [[PARAM2:%[a-z0-9]+]]
-; CHECK: movq (%rsp), [[PARAM1:%[a-z0-9]+]]
; CHECK: psubusb [[PARAM2]], [[PARAM1]]
; CHECK: ret
@@ -191,9 +197,10 @@ entry:
; CHECK-LABEL: test_psubuswv:
; CHECK: callq getFirstParam
+; CHECK: movq %rax, [[TEMP:%[a-z0-9]+]]
; CHECK: callq getSecondParam
+; CHECK: movd [[TEMP]], [[PARAM1:%[a-z0-9]+]]
; CHECK: movd %rax, [[PARAM2:%[a-z0-9]+]]
-; CHECK: movq (%rsp), [[PARAM1:%[a-z0-9]+]]
; CHECK: psubusw [[PARAM2]], [[PARAM1]]
; CHECK: ret
diff --git a/test/CodeGen/X86/x86-64-tls-1.ll b/test/CodeGen/X86/x86-64-tls-1.ll
index 8d3b300..641786f 100644
--- a/test/CodeGen/X86/x86-64-tls-1.ll
+++ b/test/CodeGen/X86/x86-64-tls-1.ll
@@ -1,6 +1,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
@tm_nest_level = internal thread_local global i32 0
define i64 @z() nounwind {
-; CHECK: movabsq $tm_nest_level@TPOFF, %rcx
+; FIXME: The codegen here is primitive at best and could be much better.
+; The add and the moves can be folded together.
+; CHECK-DAG: movq $tm_nest_level@TPOFF, %rcx
+; CHECK-DAG: movq %fs:0, %rax
+; CHECK: addl %ecx, %eax
ret i64 and (i64 ptrtoint (i32* @tm_nest_level to i64), i64 100)
}
diff --git a/test/CodeGen/X86/x86-shifts.ll b/test/CodeGen/X86/x86-shifts.ll
index af57e5c..2f3adb8 100644
--- a/test/CodeGen/X86/x86-shifts.ll
+++ b/test/CodeGen/X86/x86-shifts.ll
@@ -6,8 +6,8 @@
define <4 x i32> @shl4(<4 x i32> %A) nounwind {
entry:
; CHECK: shl4
-; CHECK: padd
; CHECK: pslld
+; CHECK: padd
; CHECK: ret
%B = shl <4 x i32> %A, < i32 2, i32 2, i32 2, i32 2>
%C = shl <4 x i32> %A, < i32 1, i32 1, i32 1, i32 1>
@@ -67,8 +67,8 @@ entry:
define <8 x i16> @shl8(<8 x i16> %A) nounwind {
entry:
; CHECK: shl8
-; CHECK: padd
; CHECK: psllw
+; CHECK: padd
; CHECK: ret
%B = shl <8 x i16> %A, < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
%C = shl <8 x i16> %A, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
diff --git a/test/CodeGen/X86/xor.ll b/test/CodeGen/X86/xor.ll
index b56ce0f..fd8e1b4 100644
--- a/test/CodeGen/X86/xor.ll
+++ b/test/CodeGen/X86/xor.ll
@@ -165,3 +165,19 @@ define <4 x i32> @test10(<4 x i32> %a) nounwind {
; X32-LABEL: test10:
; X32: andnps
}
+
+define i32 @PR17487(i1 %tobool) {
+ %tmp = insertelement <2 x i1> undef, i1 %tobool, i32 1
+ %tmp1 = zext <2 x i1> %tmp to <2 x i64>
+ %tmp2 = xor <2 x i64> %tmp1, <i64 1, i64 1>
+ %tmp3 = extractelement <2 x i64> %tmp2, i32 1
+ %add = add nsw i64 0, %tmp3
+ %cmp6 = icmp ne i64 %add, 1
+ %conv7 = zext i1 %cmp6 to i32
+ ret i32 %conv7
+
+; X64-LABEL: PR17487:
+; X64: andn
+; X32-LABEL: PR17487:
+; X32: andn
+}
diff --git a/test/CodeGen/X86/zext-fold.ll b/test/CodeGen/X86/zext-fold.ll
index ff93c68..a10923f 100644
--- a/test/CodeGen/X86/zext-fold.ll
+++ b/test/CodeGen/X86/zext-fold.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -march=x86 | FileCheck %s
+; RUN: llc < %s -mcpu=generic -march=x86 -enable-misched=false | FileCheck %s
;; Simple case
define i32 @test1(i8 %x) nounwind readnone {
@@ -10,7 +10,7 @@ define i32 @test1(i8 %x) nounwind readnone {
; CHECK: movzbl
; CHECK-NEXT: andl {{.*}}224
-;; Multiple uses of %x but easily extensible.
+;; Multiple uses of %x but easily extensible.
define i32 @test2(i8 %x) nounwind readnone {
%A = and i8 %x, -32
%B = zext i8 %A to i32
@@ -21,8 +21,8 @@ define i32 @test2(i8 %x) nounwind readnone {
}
; CHECK: test2
; CHECK: movzbl
-; CHECK: orl $63
; CHECK: andl $224
+; CHECK: orl $63
declare void @use(i32, i8)
diff --git a/test/CodeGen/X86/zext-sext.ll b/test/CodeGen/X86/zext-sext.ll
index 25dabbe..5b2713d 100644
--- a/test/CodeGen/X86/zext-sext.ll
+++ b/test/CodeGen/X86/zext-sext.ll
@@ -34,10 +34,10 @@ entry:
%tmp12 = add i64 %tmp11, 5089792279245435153
; CHECK: addl $2138875574, %e[[REGISTER_zext:[a-z0-9]+]]
-; CHECK-NEXT: cmpl $-8608074, %e[[REGISTER_zext]]
-; CHECK: movslq %e[[REGISTER_zext]], [[REGISTER_tmp:%r[a-z0-9]+]]
-; CHECK: movq [[REGISTER_tmp]], [[REGISTER_sext:%r[a-z0-9]+]]
+; CHECK: movslq %e[[REGISTER_zext]], [[REGISTER_sext:%r[a-z0-9]+]]
+; CHECK: cmpl $-8608074, %e[[REGISTER_zext]]
; CHECK-NOT: [[REGISTER_zext]]
+; CHECK-DAG: testl %e[[REGISTER_zext]]
; CHECK: subq %r[[REGISTER_zext]], [[REGISTER_sext]]
%tmp13 = sub i64 %tmp12, 2138875574
diff --git a/test/CodeGen/XCore/aliases.ll b/test/CodeGen/XCore/aliases.ll
index d4da63c..b7ad416 100644
--- a/test/CodeGen/XCore/aliases.ll
+++ b/test/CodeGen/XCore/aliases.ll
@@ -1,7 +1,9 @@
; RUN: llc < %s -march=xcore | FileCheck %s
-declare void @a_val() nounwind
-@b_val = external constant i32, section ".cp.rodata"
-@c_val = external global i32
+define void @a_val() nounwind {
+ ret void
+}
+@b_val = constant i32 42, section ".cp.rodata"
+@c_val = global i32 42
@a = alias void ()* @a_val
@b = alias i32* @b_val
diff --git a/test/CodeGen/XCore/alignment.ll b/test/CodeGen/XCore/alignment.ll
new file mode 100644
index 0000000..28bdf3b
--- /dev/null
+++ b/test/CodeGen/XCore/alignment.ll
@@ -0,0 +1,9 @@
+; RUN: not llc < %s -march=xcore 2>&1 | FileCheck %s
+
+; CHECK: emitPrologue unsupported alignment: 8
+define void @f() nounwind {
+entry:
+ %BadAlignment = alloca i64, align 8
+ ret void
+}
+
diff --git a/test/CodeGen/XCore/ashr.ll b/test/CodeGen/XCore/ashr.ll
index 2752f52..78cb144 100644
--- a/test/CodeGen/XCore/ashr.ll
+++ b/test/CodeGen/XCore/ashr.ll
@@ -1,26 +1,26 @@
; RUN: llc < %s -march=xcore -asm-verbose=0 | FileCheck %s
-define i32 @ashr(i32 %a, i32 %b) {
+define i32 @ashr(i32 %a, i32 %b) nounwind {
%1 = ashr i32 %a, %b
ret i32 %1
}
; CHECK-LABEL: ashr:
; CHECK-NEXT: ashr r0, r0, r1
-define i32 @ashri1(i32 %a) {
+define i32 @ashri1(i32 %a) nounwind {
%1 = ashr i32 %a, 24
ret i32 %1
}
; CHECK-LABEL: ashri1:
; CHECK-NEXT: ashr r0, r0, 24
-define i32 @ashri2(i32 %a) {
+define i32 @ashri2(i32 %a) nounwind {
%1 = ashr i32 %a, 31
ret i32 %1
}
; CHECK-LABEL: ashri2:
; CHECK-NEXT: ashr r0, r0, 32
-define i32 @f1(i32 %a) {
+define i32 @f1(i32 %a) nounwind {
%1 = icmp slt i32 %a, 0
br i1 %1, label %less, label %not_less
less:
@@ -32,7 +32,7 @@ not_less:
; CHECK-NEXT: ashr r0, r0, 32
; CHECK-NEXT: bt r0
-define i32 @f2(i32 %a) {
+define i32 @f2(i32 %a) nounwind {
%1 = icmp sge i32 %a, 0
br i1 %1, label %greater, label %not_greater
greater:
@@ -44,7 +44,7 @@ not_greater:
; CHECK-NEXT: ashr r0, r0, 32
; CHECK-NEXT: bt r0
-define i32 @f3(i32 %a) {
+define i32 @f3(i32 %a) nounwind {
%1 = icmp slt i32 %a, 0
%2 = select i1 %1, i32 10, i32 17
ret i32 %2
@@ -55,7 +55,7 @@ define i32 @f3(i32 %a) {
; CHECK-NEXT: ldc r0, 17
; CHECK: ldc r0, 10
-define i32 @f4(i32 %a) {
+define i32 @f4(i32 %a) nounwind {
%1 = icmp sge i32 %a, 0
%2 = select i1 %1, i32 10, i32 17
ret i32 %2
@@ -66,7 +66,7 @@ define i32 @f4(i32 %a) {
; CHECK-NEXT: ldc r0, 10
; CHECK: ldc r0, 17
-define i32 @f5(i32 %a) {
+define i32 @f5(i32 %a) nounwind {
%1 = icmp sge i32 %a, 0
%2 = zext i1 %1 to i32
ret i32 %2
diff --git a/test/CodeGen/XCore/atomic.ll b/test/CodeGen/XCore/atomic.ll
new file mode 100644
index 0000000..95fca9a
--- /dev/null
+++ b/test/CodeGen/XCore/atomic.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -march=xcore | FileCheck %s
+
+; CHECK-LABEL: atomic_fence
+; CHECK: #MEMBARRIER
+; CHECK: #MEMBARRIER
+; CHECK: #MEMBARRIER
+; CHECK: #MEMBARRIER
+; CHECK: retsp 0
+define void @atomic_fence() nounwind {
+entry:
+ fence acquire
+ fence release
+ fence acq_rel
+ fence seq_cst
+ ret void
+}
diff --git a/test/CodeGen/XCore/byVal.ll b/test/CodeGen/XCore/byVal.ll
index a5d25d2..e9612fd 100644
--- a/test/CodeGen/XCore/byVal.ll
+++ b/test/CodeGen/XCore/byVal.ll
@@ -56,3 +56,18 @@ entry:
call void @f2(i32 %i, %struct.st2* %s2)
ret void
}
+
+; CHECK-LABEL: f3Test
+; CHECK: entsp 2
+; CHECK: ldc r1, 0
+; CHECK: ld8u r2, r0[r1]
+; CHECK: ldaw r0, sp[1]
+; CHECK: st8 r2, r0[r1]
+; CHECK: bl f
+; CHECK: retsp 2
+declare void @f3(i8*) nounwind
+define void @f3Test(i8* byval %v) nounwind {
+entry:
+ call void @f3(i8* %v) nounwind
+ ret void
+}
diff --git a/test/CodeGen/XCore/exception.ll b/test/CodeGen/XCore/exception.ll
new file mode 100644
index 0000000..8018cdc
--- /dev/null
+++ b/test/CodeGen/XCore/exception.ll
@@ -0,0 +1,129 @@
+; RUN: llc < %s -march=xcore | FileCheck %s
+
+declare void @g()
+declare i32 @__gxx_personality_v0(...)
+declare i32 @llvm.eh.typeid.for(i8*) nounwind readnone
+declare i8* @__cxa_begin_catch(i8*)
+declare void @__cxa_end_catch()
+declare i8* @__cxa_allocate_exception(i32)
+declare void @__cxa_throw(i8*, i8*, i8*)
+
+@_ZTIi = external constant i8*
+@_ZTId = external constant i8*
+
+; CHECK-LABEL: fn_typeid:
+; CHECK: .cfi_startproc
+; CHECK: mkmsk r0, 1
+; CHECK: retsp 0
+; CHECK: .cfi_endproc
+define i32 @fn_typeid() {
+entry:
+ %0 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) nounwind
+ ret i32 %0
+}
+
+; CHECK-LABEL: fn_throw
+; CHECK: .cfi_startproc
+; CHECK: entsp 1
+; CHECK: .cfi_def_cfa_offset 4
+; CHECK: .cfi_offset 15, 0
+; CHECK: ldc r0, 4
+; CHECK: bl __cxa_allocate_exception
+; CHECK: ldaw r11, cp[_ZTIi]
+; CHECK: ldc r2, 0
+; CHECK: mov r1, r11
+; CHECK: bl __cxa_throw
+define void @fn_throw() {
+entry:
+ %0 = call i8* @__cxa_allocate_exception(i32 4) nounwind
+ call void @__cxa_throw(i8* %0, i8* bitcast (i8** @_ZTIi to i8*), i8* null) noreturn
+ unreachable
+}
+
+; CHECK-LABEL: fn_catch
+; CHECK: .cfi_startproc
+; CHECK: .cfi_personality 0, __gxx_personality_v0
+; CHECK: [[START:.L[a-zA-Z0-9_]+]]
+; CHECK: .cfi_lsda 0, [[LSDA:.L[a-zA-Z0-9_]+]]
+; CHECK: entsp 4
+; CHECK: .cfi_def_cfa_offset 16
+; CHECK: .cfi_offset 15, 0
+define void @fn_catch() {
+entry:
+
+; N.B. we allocate no variables, hence force the compiler to spill
+; CHECK: stw r4, sp[3]
+; CHECK: .cfi_offset 4, -4
+; CHECK: stw r5, sp[2]
+; CHECK: .cfi_offset 5, -8
+; CHECK: stw r6, sp[1]
+; CHECK: .cfi_offset 6, -12
+; CHECK: [[PRE_G:.L[a-zA-Z0-9_]+]]
+; CHECK: bl g
+; CHECK: [[POST_G:.L[a-zA-Z0-9_]+]]
+; CHECK: [[RETURN:.L[a-zA-Z0-9_]+]]
+; CHECK: ldw r6, sp[1]
+; CHECK: ldw r5, sp[2]
+; CHECK: ldw r4, sp[3]
+; CHECK: retsp 4
+ invoke void @g() to label %cont unwind label %lpad
+cont:
+ ret void
+
+; CHECK: {{.L[a-zA-Z0-9_]+}}
+; CHECK: [[LANDING:.L[a-zA-Z0-9_]+]]
+; CHECK: mov r5, r1
+; CHECK: mov r4, r0
+; CHECK: bl __cxa_begin_catch
+; CHECK: ldw r6, r0[0]
+; CHECK: bl __cxa_end_catch
+lpad:
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* bitcast (i8** @_ZTIi to i8*)
+ catch i8* bitcast (i8** @_ZTId to i8*)
+ %1 = extractvalue { i8*, i32 } %0, 0
+ %2 = extractvalue { i8*, i32 } %0, 1
+ %3 = call i8* @__cxa_begin_catch(i8* %1) nounwind
+ %4 = bitcast i8* %3 to i32*
+ %5 = load i32* %4
+ call void @__cxa_end_catch() nounwind
+
+; CHECK: eq r0, r6, r5
+; CHECK: bf r0, [[RETURN]]
+; CHECK: mov r0, r4
+; CHECK: bl _Unwind_Resume
+; CHECK: .cfi_endproc
+; CHECK: [[END:.L[a-zA-Z0-9_]+]]
+ %6 = icmp eq i32 %5, %2
+ br i1 %6, label %Resume, label %Exit
+Resume:
+ resume { i8*, i32 } %0
+Exit:
+ ret void
+}
+
+; CHECK: [[LSDA]]:
+; CHECK: .byte 255
+; CHECK: .byte 0
+; CHECK: .asciiz
+; CHECK: .byte 3
+; CHECK: .byte 26
+; CHECK: [[SET0:.L[a-zA-Z0-9_]+]] = [[PRE_G]]-[[START]]
+; CHECK: .long [[SET0]]
+; CHECK: [[SET1:.L[a-zA-Z0-9_]+]] = [[POST_G]]-[[PRE_G]]
+; CHECK: .long [[SET1]]
+; CHECK: [[SET2:.L[a-zA-Z0-9_]+]] = [[LANDING]]-[[START]]
+; CHECK: .long [[SET2]]
+; CHECK: .byte 3
+; CHECK: [[SET3:.L[a-zA-Z0-9_]+]] = [[POST_G]]-[[START]]
+; CHECK: .long [[SET3]]
+; CHECK: [[SET4:.L[a-zA-Z0-9_]+]] = [[END]]-[[POST_G]]
+; CHECK: .long [[SET4]]
+; CHECK: .long 0
+; CHECK: .byte 0
+; CHECK: .byte 1
+; CHECK: .byte 0
+; CHECK: .byte 2
+; CHECK: .byte 125
+; CHECK: .long _ZTIi
+; CHECK: .long _ZTId
diff --git a/test/CodeGen/XCore/globals.ll b/test/CodeGen/XCore/globals.ll
index b140587..b3a872b 100644
--- a/test/CodeGen/XCore/globals.ll
+++ b/test/CodeGen/XCore/globals.ll
@@ -93,4 +93,4 @@ entry:
@array = global [10 x i16] zeroinitializer, align 2
; CHECK: .globl array.globound
-; CHECK: .set array.globound,10
+; CHECK: array.globound = 10
diff --git a/test/CodeGen/XCore/linkage.ll b/test/CodeGen/XCore/linkage.ll
new file mode 100644
index 0000000..7a1179b
--- /dev/null
+++ b/test/CodeGen/XCore/linkage.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -march=xcore | FileCheck %s
+
+; CHECK: .weak fd
+define weak void @fd() {
+ call void @fr(i32* @gd, i32* @gr)
+ ret void
+}
+
+; CHECK-NOT: .hidden test_hidden
+declare hidden void @test_hidden_declaration()
+define hidden void @test_hidden() {
+ call void @test_hidden_declaration()
+ unreachable
+}
+
+; CHECK-NOT: .protected
+define protected void @test_protected() {
+ unreachable
+}
+
+; CHECK: .globl array.globound
+; CHECK: array.globound = 2
+; CHECK: .weak array.globound
+; CHECK: .globl array
+; CHECK: .weak array
+@array = weak global [2 x i32] zeroinitializer
+
+; CHECK: .weak gd
+@gd = weak global i32 0
+
+; CHECK-NOT: .hidden test_hidden_declaration
+
+; CHECK: .weak gr
+@gr = extern_weak global i32
+
+; CHECK: .weak fr
+declare extern_weak void @fr(i32*, i32*)
+
diff --git a/test/CodeGen/XCore/lit.local.cfg b/test/CodeGen/XCore/lit.local.cfg
index 8756f37..3e84c1b 100644
--- a/test/CodeGen/XCore/lit.local.cfg
+++ b/test/CodeGen/XCore/lit.local.cfg
@@ -1,5 +1,3 @@
-config.suffixes = ['.ll', '.c', '.cpp', '.test']
-
targets = set(config.root.targets_to_build.split())
if not 'XCore' in targets:
config.unsupported = True
diff --git a/test/CodeGen/XCore/shedulingPreference.ll b/test/CodeGen/XCore/shedulingPreference.ll
new file mode 100644
index 0000000..6c2ac6d
--- /dev/null
+++ b/test/CodeGen/XCore/shedulingPreference.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=xcore
+
+define void @f( ) {
+entry:
+
+ switch i32 undef, label %default [
+ i32 0, label %start
+ ]
+
+start:
+ br label %end
+
+default:
+ %arg = fadd double undef, undef
+ %res = call double @f2(i32 undef, double %arg, double undef)
+ br label %end
+
+end:
+ %unused = phi double [ %res, %default ], [ undef, %start ]
+
+ unreachable
+}
+
+declare double @f2(i32, double, double)
+
diff --git a/test/CodeGen/XCore/threads.ll b/test/CodeGen/XCore/threads.ll
index 5840e77..c50da1d 100644
--- a/test/CodeGen/XCore/threads.ll
+++ b/test/CodeGen/XCore/threads.ll
@@ -1,4 +1,5 @@
; RUN: llc -march=xcore < %s | FileCheck %s
+; RUN: llc -march=xcore -O=0 < %s | FileCheck %s -check-prefix=PHINODE
declare i8 addrspace(1)* @llvm.xcore.getst.p1i8.p1i8(i8 addrspace(1)* %r)
declare void @llvm.xcore.msync.p1i8(i8 addrspace(1)* %r)
@@ -13,55 +14,132 @@ declare void @llvm.xcore.initdp.p1i8(i8 addrspace(1)* %r, i8* %value)
define i8 addrspace(1)* @test_getst(i8 addrspace(1)* %r) {
; CHECK-LABEL: test_getst:
; CHECK: getst r0, res[r0]
- %result = call i8 addrspace(1)* @llvm.xcore.getst.p1i8.p1i8(i8 addrspace(1)* %r)
- ret i8 addrspace(1)* %result
+ %result = call i8 addrspace(1)* @llvm.xcore.getst.p1i8.p1i8(i8 addrspace(1)* %r)
+ ret i8 addrspace(1)* %result
}
define void @test_ssync() {
; CHECK-LABEL: test_ssync:
; CHECK: ssync
- call void @llvm.xcore.ssync()
- ret void
+ call void @llvm.xcore.ssync()
+ ret void
}
define void @test_mjoin(i8 addrspace(1)* %r) {
; CHECK-LABEL: test_mjoin:
; CHECK: mjoin res[r0]
- call void @llvm.xcore.mjoin.p1i8(i8 addrspace(1)* %r)
- ret void
+ call void @llvm.xcore.mjoin.p1i8(i8 addrspace(1)* %r)
+ ret void
}
define void @test_initsp(i8 addrspace(1)* %t, i8* %src) {
; CHECK-LABEL: test_initsp:
; CHECK: init t[r0]:sp, r1
- call void @llvm.xcore.initsp.p1i8(i8 addrspace(1)* %t, i8* %src)
- ret void
+ call void @llvm.xcore.initsp.p1i8(i8 addrspace(1)* %t, i8* %src)
+ ret void
}
define void @test_initpc(i8 addrspace(1)* %t, i8* %src) {
; CHECK-LABEL: test_initpc:
; CHECK: init t[r0]:pc, r1
- call void @llvm.xcore.initpc.p1i8(i8 addrspace(1)* %t, i8* %src)
- ret void
+ call void @llvm.xcore.initpc.p1i8(i8 addrspace(1)* %t, i8* %src)
+ ret void
}
define void @test_initlr(i8 addrspace(1)* %t, i8* %src) {
; CHECK-LABEL: test_initlr:
; CHECK: init t[r0]:lr, r1
- call void @llvm.xcore.initlr.p1i8(i8 addrspace(1)* %t, i8* %src)
- ret void
+ call void @llvm.xcore.initlr.p1i8(i8 addrspace(1)* %t, i8* %src)
+ ret void
}
define void @test_initcp(i8 addrspace(1)* %t, i8* %src) {
; CHECK-LABEL: test_initcp:
; CHECK: init t[r0]:cp, r1
- call void @llvm.xcore.initcp.p1i8(i8 addrspace(1)* %t, i8* %src)
- ret void
+ call void @llvm.xcore.initcp.p1i8(i8 addrspace(1)* %t, i8* %src)
+ ret void
}
define void @test_initdp(i8 addrspace(1)* %t, i8* %src) {
; CHECK-LABEL: test_initdp:
; CHECK: init t[r0]:dp, r1
- call void @llvm.xcore.initdp.p1i8(i8 addrspace(1)* %t, i8* %src)
- ret void
+ call void @llvm.xcore.initdp.p1i8(i8 addrspace(1)* %t, i8* %src)
+ ret void
}
+
+@tl = thread_local global [3 x i32] zeroinitializer
+@tle = external thread_local global [2 x i32]
+
+define i32* @f_tl() {
+; CHECK-LABEL: f_tl:
+; CHECK: get r11, id
+; CHECK: ldaw [[R0:r[0-9]]], dp[tl]
+; CHECK: ldc [[R1:r[0-9]]], 8
+; CHECK: ldc [[R2:r[0-9]]], 12
+; r0 = id*12 + 8 + &tl
+; CHECK: lmul {{r[0-9]}}, r0, r11, [[R2]], [[R0]], [[R1]]
+ ret i32* getelementptr inbounds ([3 x i32]* @tl, i32 0, i32 2)
+}
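+; Worked example for f_tl (illustrative, assuming one [3 x i32] block of 12
+; bytes per hardware thread): for thread id 2 the lmul above computes
+; r0 = &tl + 2*12 + 8, i.e. element 2 of that thread's private copy. The
+; 96-byte ".space" checked at the end of this file corresponds to 8 threads.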
+
+define i32* @f_tle() {
+; CHECK-LABEL: f_tle:
+; CHECK: get r11, id
+; CHECK: shl [[R0:r[0-9]]], r11, 3
+; CHECK: ldaw [[R1:r[0-9]]], dp[tle]
+; r0 = &tle + id*8
+; CHECK: add r0, [[R1]], [[R0]]
+ ret i32* getelementptr inbounds ([2 x i32]* @tle, i32 0, i32 0)
+}
+
+define i32 @f_tlExpr () {
+; CHECK-LABEL: f_tlExpr:
+; CHECK: get r11, id
+; CHECK: shl [[R0:r[0-9]]], r11, 3
+; CHECK: ldaw [[R1:r[0-9]]], dp[tle]
+; CHECK: add [[R2:r[0-9]]], [[R1]], [[R0]]
+; CHECK: add r0, [[R2]], [[R2]]
+ ret i32 add(
+ i32 ptrtoint( i32* getelementptr inbounds ([2 x i32]* @tle, i32 0, i32 0) to i32),
+ i32 ptrtoint( i32* getelementptr inbounds ([2 x i32]* @tle, i32 0, i32 0) to i32))
+}
+
+define void @phiNode1() {
+; N.B. lowering of duplicate constexpr in a PHI node requires -O=0
+; PHINODE-LABEL: phiNode1:
+; PHINODE: get r11, id
+; PHINODE-LABEL: .LBB11_1:
+; PHINODE: get r11, id
+; PHINODE: bu .LBB11_1
+entry:
+ br label %ConstantExpPhiNode
+ConstantExpPhiNode:
+ %ptr = phi i32* [ getelementptr inbounds ([3 x i32]* @tl, i32 0, i32 0), %entry ],
+ [ getelementptr inbounds ([3 x i32]* @tl, i32 0, i32 0), %ConstantExpPhiNode ]
+ br label %ConstantExpPhiNode
+exit:
+ ret void
+}
+
+define void @phiNode2( i1 %bool) {
+; N.B. check that an extra 'Node_crit_edge' block (LBB12_1) is inserted
+; PHINODE-LABEL: phiNode2:
+; PHINODE: bf {{r[0-9]}}, .LBB12_3
+; PHINODE: bu .LBB12_1
+; PHINODE-LABEL: .LBB12_1:
+; PHINODE: get r11, id
+; PHINODE-LABEL: .LBB12_2:
+; PHINODE: get r11, id
+; PHINODE: bu .LBB12_2
+; PHINODE-LABEL: .LBB12_3:
+entry:
+ br i1 %bool, label %ConstantExpPhiNode, label %exit
+ConstantExpPhiNode:
+ %ptr = phi i32* [ getelementptr inbounds ([3 x i32]* @tl, i32 0, i32 0), %entry ],
+ [ getelementptr inbounds ([3 x i32]* @tl, i32 0, i32 0), %ConstantExpPhiNode ]
+ br label %ConstantExpPhiNode
+exit:
+ ret void
+}
+
+; CHECK-LABEL: tl:
+; CHECK: .space 96
diff --git a/test/CodeGen/XCore/zextfree.ll b/test/CodeGen/XCore/zextfree.ll
new file mode 100644
index 0000000..48dce88
--- /dev/null
+++ b/test/CodeGen/XCore/zextfree.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=xcore < %s | FileCheck %s
+
+; CHECK-LABEL: test:
+; CHECK-NOT: zext
+define void @test(i8* %s1) {
+entry:
+ %u8 = load i8* %s1, align 1
+ %bool = icmp eq i8 %u8, 0
+ br label %BB1
+BB1:
+ br i1 %bool, label %BB1, label %BB2
+BB2:
+ br i1 %bool, label %BB1, label %BB2
+}
+