Diffstat (limited to 'test/CodeGen/AArch64')
-rw-r--r-- test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll | 491
-rw-r--r-- test/CodeGen/AArch64/addsub.ll | 28
-rw-r--r-- test/CodeGen/AArch64/argument-blocks.ll | 2
-rw-r--r-- test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll | 8
-rw-r--r-- test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll | 4
-rw-r--r-- test/CodeGen/AArch64/arm64-aapcs.ll | 2
-rw-r--r-- test/CodeGen/AArch64/arm64-abi-varargs.ll | 6
-rw-r--r-- test/CodeGen/AArch64/arm64-anyregcc-crash.ll | 2
-rw-r--r-- test/CodeGen/AArch64/arm64-anyregcc.ll | 16
-rw-r--r-- test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll | 172
-rw-r--r-- test/CodeGen/AArch64/arm64-call-tailcalls.ll | 2
-rw-r--r-- test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll | 638
-rw-r--r-- test/CodeGen/AArch64/arm64-convert-v4f64.ll | 26
-rw-r--r-- test/CodeGen/AArch64/arm64-dup.ll | 37
-rw-r--r-- test/CodeGen/AArch64/arm64-fcopysign.ll | 2
-rw-r--r-- test/CodeGen/AArch64/arm64-join-reserved.ll | 2
-rw-r--r-- test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll | 14
-rw-r--r-- test/CodeGen/AArch64/arm64-neon-copy.ll | 2
-rw-r--r-- test/CodeGen/AArch64/arm64-neon-v8.1a.ll | 456
-rw-r--r-- test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll | 2
-rw-r--r-- test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll | 8
-rw-r--r-- test/CodeGen/AArch64/arm64-patchpoint.ll | 16
-rw-r--r-- test/CodeGen/AArch64/arm64-stackmap-nops.ll | 2
-rw-r--r-- test/CodeGen/AArch64/arm64-stackmap.ll | 22
-rw-r--r-- test/CodeGen/AArch64/arm64-vshuffle.ll | 95
-rw-r--r-- test/CodeGen/AArch64/bitcast.ll | 27
-rw-r--r-- test/CodeGen/AArch64/br-to-eh-lpad.ll | 8
-rw-r--r-- test/CodeGen/AArch64/concat_vector-scalar-combine.ll | 125
-rw-r--r-- test/CodeGen/AArch64/concat_vector-truncate-combine.ll | 2
-rw-r--r-- test/CodeGen/AArch64/concat_vector-truncated-scalar-combine.ll | 18
-rw-r--r-- test/CodeGen/AArch64/dag-combine-invaraints.ll | 2
-rw-r--r-- test/CodeGen/AArch64/f16-instructions.ll | 765
-rw-r--r-- test/CodeGen/AArch64/fast-isel-int-ext5.ll | 19
-rw-r--r-- test/CodeGen/AArch64/fold-constants.ll | 21
-rw-r--r-- test/CodeGen/AArch64/fp16-instructions.ll | 109
-rw-r--r-- test/CodeGen/AArch64/global-merge-1.ll | 12
-rw-r--r-- test/CodeGen/AArch64/global-merge-2.ll | 6
-rw-r--r-- test/CodeGen/AArch64/global-merge-3.ll | 6
-rw-r--r-- test/CodeGen/AArch64/global-merge-4.ll | 2
-rw-r--r-- test/CodeGen/AArch64/merge-store.ll | 20
-rw-r--r-- test/CodeGen/AArch64/print-mrs-system-register.ll | 11
-rw-r--r-- test/CodeGen/AArch64/sibling-call.ll | 2
-rw-r--r-- test/CodeGen/AArch64/stackmap-liveness.ll | 47
-rw-r--r-- test/CodeGen/AArch64/tailcall-explicit-sret.ll | 106
-rw-r--r-- test/CodeGen/AArch64/tailcall-implicit-sret.ll | 46
-rw-r--r-- test/CodeGen/AArch64/tailcall-mem-intrinsics.ll | 31
-rw-r--r-- test/CodeGen/AArch64/vcvt-oversize.ll | 16
47 files changed, 3199 insertions, 257 deletions
diff --git a/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
new file mode 100644
index 0000000..a31c66b
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
@@ -0,0 +1,491 @@
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+; This test aims to check basic correctness of frame layout &
+; frame access code. There are 8 functions in this test file;
+; each implements one element of the Cartesian product of:
+; . a function having a VLA / no VLA,
+; . a function with dynamic stack realignment / no dynamic stack realignment,
+; . a function needing a frame pointer / no frame pointer,
+; since the presence/absence of these influences the frame
+; layout and which pointer (bp, sp or fp) is used to access
+; various parts of the frame.
+;
+; Furthermore, in every test function:
+; . there is always one integer and one floating-point argument, to be able
+;   to check that those are accessed correctly.
+; . there is always one local variable, to check that it is accessed
+;   correctly.
+;
+; The LLVM IR below was produced by clang on the following C++ code:
+;extern "C" int g();
+;extern "C" int novla_nodynamicrealign_call(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+; // use an argument passed on the stack.
+; volatile int l1;
+; return i10 + (int)d10 + l1 + g();
+;}
+;extern "C" int novla_nodynamicrealign_nocall(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+; // use an argument passed on the stack.
+; volatile int l1;
+; return i10 + (int)d10 + l1;
+;}
+;extern "C" int novla_dynamicrealign_call(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+; // use an argument passed on the stack.
+; alignas(128) volatile int l1;
+; return i10 + (int)d10 + l1 + g();
+;}
+;extern "C" int novla_dynamicrealign_nocall(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+; // use an argument passed on the stack.
+; alignas(128) volatile int l1;
+; return i10 + (int)d10 + l1;
+;}
+;
+;extern "C" int vla_nodynamicrealign_call(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+; // use an argument passed on the stack.
+; volatile int l1;
+; volatile int vla[i1];
+; return i10 + (int)d10 + l1 + g() + vla[0];
+;}
+;extern "C" int vla_nodynamicrealign_nocall(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+; // use an argument passed on the stack.
+; volatile int l1;
+; volatile int vla[i1];
+; return i10 + (int)d10 + l1 + vla[0];
+;}
+;extern "C" int vla_dynamicrealign_call(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+; // use an argument passed on the stack.
+; alignas(128) volatile int l1;
+; volatile int vla[i1];
+; return i10 + (int)d10 + l1 + g() + vla[0];
+;}
+;extern "C" int vla_dynamicrealign_nocall(int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10,
+; double d1, double d2, double d3, double d4, double d5, double d6, double d7, double d8, double d9, double d10)
+;{
+; // use an argument passed on the stack.
+; alignas(128) volatile int l1;
+; volatile int vla[i1];
+; return i10 + (int)d10 + l1 + vla[0];
+;}
+
+
+
+define i32 @novla_nodynamicrealign_call(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #0 {
+entry:
+ %l1 = alloca i32, align 4
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 4
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ %call = tail call i32 @g()
+ %add2 = add nsw i32 %add1, %call
+ ret i32 %add2
+}
+; CHECK-LABEL: novla_nodynamicrealign_call
+; CHECK: .cfi_startproc
+; Check that used callee-saved registers are saved
+; CHECK: stp x20, x19, [sp, #-32]!
+; Check that the frame pointer is created:
+; CHECK: stp x29, x30, [sp, #16]
+; CHECK: add x29, sp, #16
+; Check correctness of cfi pseudo-instructions
+; CHECK: .cfi_def_cfa w29, 16
+; CHECK: .cfi_offset w30, -8
+; CHECK: .cfi_offset w29, -16
+; CHECK: .cfi_offset w19, -24
+; CHECK: .cfi_offset w20, -32
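+; (Here the CFA is x29 + 16, i.e. the value sp had on entry; each
+; .cfi_offset records where a callee-saved register is spilled relative
+; to that CFA.)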
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
+; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; Check correct access to local variable on the stack, through stack pointer
+; CHECK: ldr w[[ILOC:[0-9]+]], [sp, #12]
+; Check epilogue:
+; CHECK: ldp x29, x30, [sp, #16]
+; CHECK: ldp x20, x19, [sp], #32
+; CHECK: ret
+; CHECK: .cfi_endproc
+
+
+declare i32 @g() #0
+
+; Function Attrs: nounwind
+define i32 @novla_nodynamicrealign_nocall(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #1 {
+entry:
+ %l1 = alloca i32, align 4
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 4
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ ret i32 %add1
+}
+; CHECK-LABEL: novla_nodynamicrealign_nocall
+; Check that space is reserved for one local variable on the stack.
+; CHECK: sub sp, sp, #16 // =16
+; Check correct access to arguments passed on the stack, through stack pointer
+; CHECK: ldr d[[DARG:[0-9]+]], [sp, #40]
+; CHECK: ldr w[[IARG:[0-9]+]], [sp, #24]
+; Check correct access to local variable on the stack, through stack pointer
+; CHECK: ldr w[[ILOC:[0-9]+]], [sp, #12]
+; Check epilogue:
+; CHECK: add sp, sp, #16 // =16
+; CHECK: ret
+
+
+define i32 @novla_dynamicrealign_call(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #0 {
+entry:
+ %l1 = alloca i32, align 128
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 128
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ %call = tail call i32 @g()
+ %add2 = add nsw i32 %add1, %call
+ ret i32 %add2
+}
+
+; CHECK-LABEL: novla_dynamicrealign_call
+; CHECK: .cfi_startproc
+; Check that used callee-saved registers are saved
+; CHECK: stp x20, x19, [sp, #-32]!
+; Check that the frame pointer is created:
+; CHECK: stp x29, x30, [sp, #16]
+; CHECK: add x29, sp, #16
+; Check the dynamic realignment of the stack pointer to a 128-byte boundary
+; CHECK: sub x9, sp, #96
+; CHECK: and sp, x9, #0xffffffffffffff80
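+; ((sp - 96) & ~0x7f: allocate the local area, then clear the low 7 bits
+; to round the stack pointer down to a 128-byte boundary.)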
+; Check correctness of cfi pseudo-instructions
+; CHECK: .cfi_def_cfa w29, 16
+; CHECK: .cfi_offset w30, -8
+; CHECK: .cfi_offset w29, -16
+; CHECK: .cfi_offset w19, -24
+; CHECK: .cfi_offset w20, -32
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
+; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; Check correct access to local variable on the stack, through re-aligned stack pointer
+; CHECK: ldr w[[ILOC:[0-9]+]], [sp]
+; Check epilogue:
+; Check that the stack pointer gets restored from the frame pointer.
+; CHECK: sub sp, x29, #16 // =16
+; CHECK: ldp x29, x30, [sp, #16]
+; CHECK: ldp x20, x19, [sp], #32
+; CHECK: ret
+; CHECK: .cfi_endproc
+
+
+; Function Attrs: nounwind
+define i32 @novla_dynamicrealign_nocall(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #1 {
+entry:
+ %l1 = alloca i32, align 128
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 128
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ ret i32 %add1
+}
+
+; CHECK-LABEL: novla_dynamicrealign_nocall
+; Check that the frame pointer is created:
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
+; Check the dynamic realignment of the stack pointer to a 128-byte boundary
+; CHECK: sub x9, sp, #112
+; CHECK: and sp, x9, #0xffffffffffffff80
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
+; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; Check correct access to local variable on the stack, through re-aligned stack pointer
+; CHECK: ldr w[[ILOC:[0-9]+]], [sp]
+; Check epilogue:
+; Check that the stack pointer gets restored from the frame pointer.
+; CHECK: mov sp, x29
+; CHECK: ldp x29, x30, [sp], #16
+; CHECK: ret
+
+
+define i32 @vla_nodynamicrealign_call(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #0 {
+entry:
+ %l1 = alloca i32, align 4
+ %0 = zext i32 %i1 to i64
+ %vla = alloca i32, i64 %0, align 4
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 4
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ %call = tail call i32 @g()
+ %add2 = add nsw i32 %add1, %call
+ %1 = load volatile i32, i32* %vla, align 4, !tbaa !1
+ %add3 = add nsw i32 %add2, %1
+ ret i32 %add3
+}
+
+; CHECK-LABEL: vla_nodynamicrealign_call
+; CHECK: .cfi_startproc
+; Check that used callee-saved registers are saved
+; CHECK: stp x20, x19, [sp, #-32]!
+; Check that the frame pointer is created:
+; CHECK: stp x29, x30, [sp, #16]
+; CHECK: add x29, sp, #16
+; Check that space is reserved on the stack for the local variable,
+; rounded up to a multiple of 16 to keep the stack pointer 16-byte aligned.
+; CHECK: sub sp, sp, #16
+; Check correctness of cfi pseudo-instructions
+; CHECK: .cfi_def_cfa w29, 16
+; CHECK: .cfi_offset w30, -8
+; CHECK: .cfi_offset w29, -16
+; CHECK: .cfi_offset w19, -24
+; CHECK: .cfi_offset w20, -32
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
+; Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+; CHECK: ubfx x9, x0, #0, #32
+; CHECK: lsl x9, x9, #2
+; CHECK: add x9, x9, #15
+; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: mov x10, sp
+; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: mov sp, x[[VLASPTMP]]
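+; In other words, sp -= (zext(w0) * 4 + 15) & ~0xf, so the allocation stays
+; a multiple of 16 bytes; x[[VLASPTMP]] (the new sp) is the VLA's address.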
+; Check correct access to local variable, through frame pointer
+; CHECK: ldur w[[ILOC:[0-9]+]], [x29, #-20]
+; Check correct access to the VLA variable through the pointer holding its address
+; CHECK: ldr w[[VLA:[0-9]+]], [x[[VLASPTMP]]]
+; Check epilogue:
+; Check that the stack pointer gets restored from the frame pointer.
+; CHECK: sub sp, x29, #16 // =16
+; CHECK: ldp x29, x30, [sp, #16]
+; CHECK: ldp x20, x19, [sp], #32
+; CHECK: ret
+; CHECK: .cfi_endproc
+
+
+; Function Attrs: nounwind
+define i32 @vla_nodynamicrealign_nocall(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #1 {
+entry:
+ %l1 = alloca i32, align 4
+ %0 = zext i32 %i1 to i64
+ %vla = alloca i32, i64 %0, align 4
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 4
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ %1 = load volatile i32, i32* %vla, align 4, !tbaa !1
+ %add2 = add nsw i32 %add1, %1
+ ret i32 %add2
+}
+
+; CHECK-LABEL: vla_nodynamicrealign_nocall
+; Check that the frame pointer is created:
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
+; Check that space is reserved on the stack for the local variable,
+; rounded up to a multiple of 16 to keep the stack pointer 16-byte aligned.
+; CHECK: sub sp, sp, #16
+; Check correctness of cfi pseudo-instructions
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
+; Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+; CHECK: ubfx x9, x0, #0, #32
+; CHECK: lsl x9, x9, #2
+; CHECK: add x9, x9, #15
+; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: mov x10, sp
+; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: mov sp, x[[VLASPTMP]]
+; Check correct access to local variable, through frame pointer
+; CHECK: ldur w[[ILOC:[0-9]+]], [x29, #-4]
+; Check correct access to the VLA variable through the pointer holding its address
+; CHECK: ldr w[[VLA:[0-9]+]], [x[[VLASPTMP]]]
+; Check epilogue:
+; Check that the stack pointer gets restored from the frame pointer.
+; CHECK: mov sp, x29
+; CHECK: ldp x29, x30, [sp], #16
+; CHECK: ret
+
+
+define i32 @vla_dynamicrealign_call(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #0 {
+entry:
+ %l1 = alloca i32, align 128
+ %0 = zext i32 %i1 to i64
+ %vla = alloca i32, i64 %0, align 4
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 128
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ %call = tail call i32 @g()
+ %add2 = add nsw i32 %add1, %call
+ %1 = load volatile i32, i32* %vla, align 4, !tbaa !1
+ %add3 = add nsw i32 %add2, %1
+ ret i32 %add3
+}
+
+; CHECK-LABEL: vla_dynamicrealign_call
+; CHECK: .cfi_startproc
+; Check that used callee-saved registers are saved
+; CHECK: stp x22, x21, [sp, #-48]!
+; CHECK: stp x20, x19, [sp, #16]
+; Check that the frame pointer is created:
+; CHECK: stp x29, x30, [sp, #32]
+; CHECK: add x29, sp, #32
+; Check that the stack pointer gets re-aligned to 128
+; bytes & the base pointer (x19) gets initialized to
+; this 128-byte aligned area for local variables &
+; spill slots
+; CHECK: sub x9, sp, #80 // =80
+; CHECK: and sp, x9, #0xffffffffffffff80
+; CHECK: mov x19, sp
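+; (A base pointer is needed here because sp moves again below for the VLA,
+; while fp-relative offsets into the realigned area are not fixed constants.)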
+; Check correctness of cfi pseudo-instructions
+; CHECK: .cfi_def_cfa w29, 16
+; CHECK: .cfi_offset w30, -8
+; CHECK: .cfi_offset w29, -16
+; CHECK: .cfi_offset w19, -24
+; CHECK: .cfi_offset w20, -32
+; CHECK: .cfi_offset w21, -40
+; CHECK: .cfi_offset w22, -48
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
+; Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+; and set-up of base pointer (x19).
+; CHECK: ubfx x9, x0, #0, #32
+; CHECK: lsl x9, x9, #2
+; CHECK: add x9, x9, #15
+; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: mov x10, sp
+; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: mov sp, x[[VLASPTMP]]
+; Check correct access to local variable, through base pointer
+; CHECK: ldr w[[ILOC:[0-9]+]], [x19]
+; CHECK: ldr w[[VLA:[0-9]+]], [x[[VLASPTMP]]]
+; Check epilogue:
+; Check that the stack pointer gets restored from the frame pointer.
+; CHECK: sub sp, x29, #32
+; CHECK: ldp x29, x30, [sp, #32]
+; CHECK: ldp x20, x19, [sp, #16]
+; CHECK: ldp x22, x21, [sp], #48
+; CHECK: ret
+; CHECK: .cfi_endproc
+
+
+; Function Attrs: nounwind
+define i32 @vla_dynamicrealign_nocall(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #1 {
+entry:
+ %l1 = alloca i32, align 128
+ %0 = zext i32 %i1 to i64
+ %vla = alloca i32, i64 %0, align 4
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 128
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ %1 = load volatile i32, i32* %vla, align 4, !tbaa !1
+ %add2 = add nsw i32 %add1, %1
+ ret i32 %add2
+}
+
+; CHECK-LABEL: vla_dynamicrealign_nocall
+; Check that used callee-saved registers are saved
+; CHECK: stp x20, x19, [sp, #-32]!
+; Check that the frame pointer is created:
+; CHECK: stp x29, x30, [sp, #16]
+; CHECK: add x29, sp, #16
+; Check that the stack pointer gets re-aligned to 128
+; bytes & the base pointer (x19) gets initialized to
+; this 128-byte aligned area for local variables &
+; spill slots
+; CHECK: sub x9, sp, #96
+; CHECK: and sp, x9, #0xffffffffffffff80
+; CHECK: mov x19, sp
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
+; Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+; and set-up of base pointer (x19).
+; CHECK: ubfx x9, x0, #0, #32
+; CHECK: lsl x9, x9, #2
+; CHECK: add x9, x9, #15
+; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: mov x10, sp
+; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: mov sp, x[[VLASPTMP]]
+; Check correct access to local variable, through base pointer
+; CHECK: ldr w[[ILOC:[0-9]+]], [x19]
+; CHECK: ldr w[[VLA:[0-9]+]], [x[[VLASPTMP]]]
+; Check epilogue:
+; Check that the stack pointer gets restored from the frame pointer.
+; CHECK: sub sp, x29, #16
+; CHECK: ldp x29, x30, [sp, #16]
+; CHECK: ldp x20, x19, [sp], #32
+; CHECK: ret
+
+
+; Function Attrs: nounwind
+define i32 @vla_dynamicrealign_nocall_large_align(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, i32 %i10, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10) #1 {
+entry:
+ %l1 = alloca i32, align 32768
+ %0 = zext i32 %i1 to i64
+ %vla = alloca i32, i64 %0, align 4
+ %conv = fptosi double %d10 to i32
+ %add = add nsw i32 %conv, %i10
+ %l1.0.l1.0. = load volatile i32, i32* %l1, align 32768
+ %add1 = add nsw i32 %add, %l1.0.l1.0.
+ %1 = load volatile i32, i32* %vla, align 4, !tbaa !1
+ %add2 = add nsw i32 %add1, %1
+ ret i32 %add2
+}
+
+; CHECK-LABEL: vla_dynamicrealign_nocall_large_align
+; Check that used callee-saved registers are saved
+; CHECK: stp x20, x19, [sp, #-32]!
+; Check that the frame pointer is created:
+; CHECK: stp x29, x30, [sp, #16]
+; CHECK: add x29, sp, #16
+; Check that the stack pointer gets re-aligned to 32768
+; bytes & the base pointer (x19) gets initialized to
+; this 32768-byte aligned area for local variables &
+; spill slots
+; CHECK: sub x9, sp, #7, lsl #12
+; CHECK: and sp, x9, #0xffffffffffff8000
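+; (#7, lsl #12 is 28672; the AND clears the low 15 bits, rounding sp down
+; to a 32768-byte boundary.)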
+; CHECK: mov x19, sp
+; Check correct access to arguments passed on the stack, through frame pointer
+; CHECK: ldr w[[IARG:[0-9]+]], [x29, #24]
+; CHECK: ldr d[[DARG:[0-9]+]], [x29, #40]
+; Check correct reservation of 16-byte aligned VLA (size in w0) on stack
+; and set-up of base pointer (x19).
+; CHECK: ubfx x9, x0, #0, #32
+; CHECK: lsl x9, x9, #2
+; CHECK: add x9, x9, #15
+; CHECK: and x9, x9, #0xfffffffffffffff0
+; CHECK: mov x10, sp
+; CHECK: sub x[[VLASPTMP:[0-9]+]], x10, x9
+; CHECK: mov sp, x[[VLASPTMP]]
+; Check correct access to local variable, through base pointer
+; CHECK: ldr w[[ILOC:[0-9]+]], [x19]
+; CHECK: ldr w[[VLA:[0-9]+]], [x[[VLASPTMP]]]
+; Check epilogue:
+; Check that the stack pointer gets restored from the frame pointer.
+; CHECK: sub sp, x29, #16
+; CHECK: ldp x29, x30, [sp, #16]
+; CHECK: ldp x20, x19, [sp], #32
+; CHECK: ret
+
+attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/AArch64/addsub.ll b/test/CodeGen/AArch64/addsub.ll
index 09b9f62..d6350a6 100644
--- a/test/CodeGen/AArch64/addsub.ll
+++ b/test/CodeGen/AArch64/addsub.ll
@@ -24,6 +24,34 @@ define void @add_small() {
ret void
}
+; Make sure we grab the imm variant when the register operand
+; can be implicitly zero-extended.
+; We used to generate something horrible like this:
+; wA = ldrb
+; xB = ldimm 12
+; xC = add xB, wA, uxtb
+; whereas this can be achieved with:
+; wA = ldrb
+; xC = add xA, #12 ; <- xA implicitly zero-extends wA.
+define void @add_small_imm(i8* %p, i64* %q, i32 %b, i32* %addr) {
+; CHECK-LABEL: add_small_imm:
+entry:
+
+; CHECK: ldrb w[[LOAD32:[0-9]+]], [x0]
+ %t = load i8, i8* %p
+ %promoted = zext i8 %t to i64
+ %zextt = zext i8 %t to i32
+ %add = add nuw i32 %zextt, %b
+
+; CHECK: add [[ADD2:x[0-9]+]], x[[LOAD32]], #12
+ %add2 = add nuw i64 %promoted, 12
+ store i32 %add, i32* %addr
+
+; CHECK: str [[ADD2]], [x1]
+ store i64 %add2, i64* %q
+ ret void
+}
+
; Add 12-bit immediates, shifted left by 12 bits
define void @add_med() {
; CHECK-LABEL: add_med:
diff --git a/test/CodeGen/AArch64/argument-blocks.ll b/test/CodeGen/AArch64/argument-blocks.ll
index f1dcfa6..3169abc 100644
--- a/test/CodeGen/AArch64/argument-blocks.ll
+++ b/test/CodeGen/AArch64/argument-blocks.ll
@@ -64,7 +64,7 @@ define void @test_varargs_stackalign() {
; CHECK-LABEL: test_varargs_stackalign:
; CHECK-DARWINPCS: stp {{w[0-9]+}}, {{w[0-9]+}}, [sp, #16]
- call void(...)* @callee([3 x float] undef, [2 x float] [float 1.0, float 2.0])
+ call void(...) @callee([3 x float] undef, [2 x float] [float 1.0, float 2.0])
ret void
}
diff --git a/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
index 41e22e9..b760261 100644
--- a/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
+++ b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
@@ -16,11 +16,11 @@ entry:
%0 = load double, double* %d.addr, align 8
%1 = load double, double* %d.addr, align 8
%conv = fptoui double %1 to i64
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), double %0, i64 %conv)
+ %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str, i32 0, i32 0), double %0, i64 %conv)
%2 = load double, double* %d.addr, align 8
%3 = load double, double* %d.addr, align 8
%conv1 = fptoui double %3 to i32
- %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str1, i32 0, i32 0), double %2, i32 %conv1)
+ %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str1, i32 0, i32 0), double %2, i32 %conv1)
ret void
}
@@ -37,12 +37,12 @@ entry:
%conv = fpext float %0 to double
%1 = load float, float* %f.addr, align 4
%conv1 = fptoui float %1 to i64
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str2, i32 0, i32 0), double %conv, i64 %conv1)
+ %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str2, i32 0, i32 0), double %conv, i64 %conv1)
%2 = load float, float* %f.addr, align 4
%conv2 = fpext float %2 to double
%3 = load float, float* %f.addr, align 4
%conv3 = fptoui float %3 to i32
- %call4 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str3, i32 0, i32 0), double %conv2, i32 %conv3)
+ %call4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str3, i32 0, i32 0), double %conv2, i32 %conv3)
ret void
}
diff --git a/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
index 6266d1c..8784abd 100644
--- a/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
+++ b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
@@ -7,13 +7,13 @@ define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
; CHECK-LABEL: bar:
; CHECK: add.2d v[[REG:[0-9]+]], v0, v1
; CHECK: add d[[REG3:[0-9]+]], d[[REG]], d1
+; CHECK: sub d[[REG2:[0-9]+]], d[[REG]], d1
; Without advanced copy optimization, we end up with cross-register-bank
; copies that cannot be coalesced.
; CHECK-NOOPT: fmov [[COPY_REG3:x[0-9]+]], d[[REG3]]
; With advanced copy optimization, we end up with just one copy
; to insert the computed high part into the V register.
; CHECK-OPT-NOT: fmov
-; CHECK: sub d[[REG2:[0-9]+]], d[[REG]], d1
; CHECK: fmov [[COPY_REG2:x[0-9]+]], d[[REG2]]
; CHECK-NOOPT: fmov d0, [[COPY_REG3]]
; CHECK-OPT-NOT: fmov
@@ -23,9 +23,9 @@ define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
; GENERIC-LABEL: bar:
; GENERIC: add v[[REG:[0-9]+]].2d, v0.2d, v1.2d
; GENERIC: add d[[REG3:[0-9]+]], d[[REG]], d1
+; GENERIC: sub d[[REG2:[0-9]+]], d[[REG]], d1
; GENERIC-NOOPT: fmov [[COPY_REG3:x[0-9]+]], d[[REG3]]
; GENERIC-OPT-NOT: fmov
-; GENERIC: sub d[[REG2:[0-9]+]], d[[REG]], d1
; GENERIC: fmov [[COPY_REG2:x[0-9]+]], d[[REG2]]
; GENERIC-NOOPT: fmov d0, [[COPY_REG3]]
; GENERIC-OPT-NOT: fmov
diff --git a/test/CodeGen/AArch64/arm64-aapcs.ll b/test/CodeGen/AArch64/arm64-aapcs.ll
index 41c3ad5..390a3c7 100644
--- a/test/CodeGen/AArch64/arm64-aapcs.ll
+++ b/test/CodeGen/AArch64/arm64-aapcs.ll
@@ -78,7 +78,7 @@ declare void @variadic(i32 %a, ...)
; Under AAPCS variadic functions have the same calling convention as
; others. The extra arguments should go in registers rather than on the stack.
define void @test_variadic() {
- call void(i32, ...)* @variadic(i32 0, i64 1, double 2.0)
+ call void(i32, ...) @variadic(i32 0, i64 1, double 2.0)
; CHECK: fmov d0, #2.0
; CHECK: orr w1, wzr, #0x1
; CHECK: bl variadic
diff --git a/test/CodeGen/AArch64/arm64-abi-varargs.ll b/test/CodeGen/AArch64/arm64-abi-varargs.ll
index f95fec6..03414b5 100644
--- a/test/CodeGen/AArch64/arm64-abi-varargs.ll
+++ b/test/CodeGen/AArch64/arm64-abi-varargs.ll
@@ -94,7 +94,7 @@ define i32 @main() nounwind ssp {
%10 = load i32, i32* %a10, align 4
%11 = load i32, i32* %a11, align 4
%12 = load i32, i32* %a12, align 4
- call void (i32, i32, i32, i32, i32, i32, i32, i32, i32, ...)* @fn9(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12)
+ call void (i32, i32, i32, i32, i32, i32, i32, i32, i32, ...) @fn9(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12)
ret i32 0
}
@@ -133,7 +133,7 @@ entry:
store <4 x i32> %y, <4 x i32>* %y.addr, align 16
%0 = load i32, i32* %x.addr, align 4
%1 = load <4 x i32>, <4 x i32>* %y.addr, align 16
- call void (i8*, ...)* @foo(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %0, <4 x i32> %1)
+ call void (i8*, ...) @foo(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %0, <4 x i32> %1)
ret void
}
@@ -186,6 +186,6 @@ entry:
%1 = load i32, i32* %x.addr, align 4
%2 = bitcast %struct.s41* %s41 to i128*
%3 = load i128, i128* %2, align 1
- call void (i8*, ...)* @foo2(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %1, i128 %3)
+ call void (i8*, ...) @foo2(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 %1, i128 %3)
ret void
}
diff --git a/test/CodeGen/AArch64/arm64-anyregcc-crash.ll b/test/CodeGen/AArch64/arm64-anyregcc-crash.ll
index 241cf97..56c62d5 100644
--- a/test/CodeGen/AArch64/arm64-anyregcc-crash.ll
+++ b/test/CodeGen/AArch64/arm64-anyregcc-crash.ll
@@ -8,7 +8,7 @@ define i64 @anyreglimit(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i6
i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24,
i64 %v25, i64 %v26, i64 %v27, i64 %v28, i64 %v29, i64 %v30, i64 %v31, i64 %v32) {
entry:
- %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 32,
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 32,
i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7, i64 %v8,
i64 %v9, i64 %v10, i64 %v11, i64 %v12, i64 %v13, i64 %v14, i64 %v15, i64 %v16,
i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24,
diff --git a/test/CodeGen/AArch64/arm64-anyregcc.ll b/test/CodeGen/AArch64/arm64-anyregcc.ll
index e26875d..2a2f451 100644
--- a/test/CodeGen/AArch64/arm64-anyregcc.ll
+++ b/test/CodeGen/AArch64/arm64-anyregcc.ll
@@ -55,7 +55,7 @@
; CHECK-NEXT: .long 3
define i64 @test() nounwind ssp uwtable {
entry:
- call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 0, i32 16, i8* null, i32 2, i32 1, i32 2, i64 3)
+ call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 0, i32 16, i8* null, i32 2, i32 1, i32 2, i64 3)
ret i64 0
}
@@ -77,7 +77,7 @@ entry:
define i64 @property_access1(i8* %obj) nounwind ssp uwtable {
entry:
%f = inttoptr i64 281474417671919 to i8*
- %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 1, i32 20, i8* %f, i32 1, i8* %obj)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 1, i32 20, i8* %f, i32 1, i8* %obj)
ret i64 %ret
}
@@ -100,7 +100,7 @@ define i64 @property_access2() nounwind ssp uwtable {
entry:
%obj = alloca i64, align 8
%f = inttoptr i64 281474417671919 to i8*
- %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %f, i32 1, i64* %obj)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %f, i32 1, i64* %obj)
ret i64 %ret
}
@@ -123,7 +123,7 @@ define i64 @property_access3() nounwind ssp uwtable {
entry:
%obj = alloca i64, align 8
%f = inttoptr i64 281474417671919 to i8*
- %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 3, i32 20, i8* %f, i32 0, i64* %obj)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 3, i32 20, i8* %f, i32 0, i64* %obj)
ret i64 %ret
}
@@ -205,7 +205,7 @@ entry:
define i64 @anyreg_test1(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
entry:
%f = inttoptr i64 281474417671919 to i8*
- %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 4, i32 20, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 4, i32 20, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
ret i64 %ret
}
@@ -287,7 +287,7 @@ entry:
define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
entry:
%f = inttoptr i64 281474417671919 to i8*
- %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
ret i64 %ret
}
@@ -315,7 +315,7 @@ entry:
; CHECK-NEXT: .long 0
define i64 @patchpoint_spilldef(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
entry:
- %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 12, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
ret i64 %result
}
@@ -355,7 +355,7 @@ entry:
define i64 @patchpoint_spillargs(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
entry:
tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
- %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 13, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 13, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
ret i64 %result
}
diff --git a/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
index c280bef..d089767 100644
--- a/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
+++ b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
@@ -1,6 +1,10 @@
; RUN: llc -mtriple aarch64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s
; RUN: llc -mtriple aarch64_be < %s -aarch64-load-store-opt=false -fast-isel=true -O0 -o - | FileCheck %s
+; Note: we split the functions below into multiple BBs to isolate the call
+; instruction we want to test from fast-isel failing to select instructions
+; after it.
+
; CHECK-LABEL: test_i64_f64:
declare i64 @test_i64_f64_helper(double %p)
define void @test_i64_f64(double* %p, i64* %q) {
@@ -8,6 +12,8 @@ define void @test_i64_f64(double* %p, i64* %q) {
%1 = load double, double* %p
%2 = fadd double %1, %1
%3 = call i64 @test_i64_f64_helper(double %2)
+ br label %return_bb
+return_bb:
%4 = add i64 %3, %3
store i64 %4, i64* %q
ret void
@@ -20,6 +26,8 @@ define void @test_i64_v1i64(<1 x i64>* %p, i64* %q) {
%1 = load <1 x i64>, <1 x i64>* %p
%2 = add <1 x i64> %1, %1
%3 = call i64 @test_i64_v1i64_helper(<1 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = add i64 %3, %3
store i64 %4, i64* %q
ret void
@@ -32,6 +40,8 @@ define void @test_i64_v2f32(<2 x float>* %p, i64* %q) {
%1 = load <2 x float>, <2 x float>* %p
%2 = fadd <2 x float> %1, %1
%3 = call i64 @test_i64_v2f32_helper(<2 x float> %2)
+ br label %return_bb
+return_bb:
%4 = add i64 %3, %3
store i64 %4, i64* %q
ret void
@@ -44,6 +54,8 @@ define void @test_i64_v2i32(<2 x i32>* %p, i64* %q) {
%1 = load <2 x i32>, <2 x i32>* %p
%2 = add <2 x i32> %1, %1
%3 = call i64 @test_i64_v2i32_helper(<2 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = add i64 %3, %3
store i64 %4, i64* %q
ret void
@@ -56,6 +68,8 @@ define void @test_i64_v4i16(<4 x i16>* %p, i64* %q) {
%1 = load <4 x i16>, <4 x i16>* %p
%2 = add <4 x i16> %1, %1
%3 = call i64 @test_i64_v4i16_helper(<4 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = add i64 %3, %3
store i64 %4, i64* %q
ret void
@@ -68,6 +82,8 @@ define void @test_i64_v8i8(<8 x i8>* %p, i64* %q) {
%1 = load <8 x i8>, <8 x i8>* %p
%2 = add <8 x i8> %1, %1
%3 = call i64 @test_i64_v8i8_helper(<8 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = add i64 %3, %3
store i64 %4, i64* %q
ret void
@@ -80,6 +96,8 @@ define void @test_f64_i64(i64* %p, double* %q) {
%1 = load i64, i64* %p
%2 = add i64 %1, %1
%3 = call double @test_f64_i64_helper(i64 %2)
+ br label %return_bb
+return_bb:
%4 = fadd double %3, %3
store double %4, double* %q
ret void
@@ -92,6 +110,8 @@ define void @test_f64_v1i64(<1 x i64>* %p, double* %q) {
%1 = load <1 x i64>, <1 x i64>* %p
%2 = add <1 x i64> %1, %1
%3 = call double @test_f64_v1i64_helper(<1 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = fadd double %3, %3
store double %4, double* %q
ret void
@@ -104,6 +124,8 @@ define void @test_f64_v2f32(<2 x float>* %p, double* %q) {
%1 = load <2 x float>, <2 x float>* %p
%2 = fadd <2 x float> %1, %1
%3 = call double @test_f64_v2f32_helper(<2 x float> %2)
+ br label %return_bb
+return_bb:
%4 = fadd double %3, %3
store double %4, double* %q
ret void
@@ -116,6 +138,8 @@ define void @test_f64_v2i32(<2 x i32>* %p, double* %q) {
%1 = load <2 x i32>, <2 x i32>* %p
%2 = add <2 x i32> %1, %1
%3 = call double @test_f64_v2i32_helper(<2 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = fadd double %3, %3
store double %4, double* %q
ret void
@@ -128,6 +152,8 @@ define void @test_f64_v4i16(<4 x i16>* %p, double* %q) {
%1 = load <4 x i16>, <4 x i16>* %p
%2 = add <4 x i16> %1, %1
%3 = call double @test_f64_v4i16_helper(<4 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = fadd double %3, %3
store double %4, double* %q
ret void
@@ -140,6 +166,8 @@ define void @test_f64_v8i8(<8 x i8>* %p, double* %q) {
%1 = load <8 x i8>, <8 x i8>* %p
%2 = add <8 x i8> %1, %1
%3 = call double @test_f64_v8i8_helper(<8 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = fadd double %3, %3
store double %4, double* %q
ret void
@@ -152,6 +180,8 @@ define void @test_v1i64_i64(i64* %p, <1 x i64>* %q) {
%1 = load i64, i64* %p
%2 = add i64 %1, %1
%3 = call <1 x i64> @test_v1i64_i64_helper(i64 %2)
+ br label %return_bb
+return_bb:
%4 = add <1 x i64> %3, %3
store <1 x i64> %4, <1 x i64>* %q
ret void
@@ -164,6 +194,8 @@ define void @test_v1i64_f64(double* %p, <1 x i64>* %q) {
%1 = load double, double* %p
%2 = fadd double %1, %1
%3 = call <1 x i64> @test_v1i64_f64_helper(double %2)
+ br label %return_bb
+return_bb:
%4 = add <1 x i64> %3, %3
store <1 x i64> %4, <1 x i64>* %q
ret void
@@ -176,6 +208,8 @@ define void @test_v1i64_v2f32(<2 x float>* %p, <1 x i64>* %q) {
%1 = load <2 x float>, <2 x float>* %p
%2 = fadd <2 x float> %1, %1
%3 = call <1 x i64> @test_v1i64_v2f32_helper(<2 x float> %2)
+ br label %return_bb
+return_bb:
%4 = add <1 x i64> %3, %3
store <1 x i64> %4, <1 x i64>* %q
ret void
@@ -188,6 +222,8 @@ define void @test_v1i64_v2i32(<2 x i32>* %p, <1 x i64>* %q) {
%1 = load <2 x i32>, <2 x i32>* %p
%2 = add <2 x i32> %1, %1
%3 = call <1 x i64> @test_v1i64_v2i32_helper(<2 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = add <1 x i64> %3, %3
store <1 x i64> %4, <1 x i64>* %q
ret void
@@ -200,6 +236,8 @@ define void @test_v1i64_v4i16(<4 x i16>* %p, <1 x i64>* %q) {
%1 = load <4 x i16>, <4 x i16>* %p
%2 = add <4 x i16> %1, %1
%3 = call <1 x i64> @test_v1i64_v4i16_helper(<4 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = add <1 x i64> %3, %3
store <1 x i64> %4, <1 x i64>* %q
ret void
@@ -212,6 +250,8 @@ define void @test_v1i64_v8i8(<8 x i8>* %p, <1 x i64>* %q) {
%1 = load <8 x i8>, <8 x i8>* %p
%2 = add <8 x i8> %1, %1
%3 = call <1 x i64> @test_v1i64_v8i8_helper(<8 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = add <1 x i64> %3, %3
store <1 x i64> %4, <1 x i64>* %q
ret void
@@ -224,6 +264,8 @@ define void @test_v2f32_i64(i64* %p, <2 x float>* %q) {
%1 = load i64, i64* %p
%2 = add i64 %1, %1
%3 = call <2 x float> @test_v2f32_i64_helper(i64 %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x float> %3, %3
store <2 x float> %4, <2 x float>* %q
ret void
@@ -236,6 +278,8 @@ define void @test_v2f32_f64(double* %p, <2 x float>* %q) {
%1 = load double, double* %p
%2 = fadd double %1, %1
%3 = call <2 x float> @test_v2f32_f64_helper(double %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x float> %3, %3
store <2 x float> %4, <2 x float>* %q
ret void
@@ -248,6 +292,8 @@ define void @test_v2f32_v1i64(<1 x i64>* %p, <2 x float>* %q) {
%1 = load <1 x i64>, <1 x i64>* %p
%2 = add <1 x i64> %1, %1
%3 = call <2 x float> @test_v2f32_v1i64_helper(<1 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x float> %3, %3
store <2 x float> %4, <2 x float>* %q
ret void
@@ -261,6 +307,8 @@ define void @test_v2f32_v2i32(<2 x i32>* %p, <2 x float>* %q) {
%1 = load <2 x i32>, <2 x i32>* %p
%2 = add <2 x i32> %1, %1
%3 = call <2 x float> @test_v2f32_v2i32_helper(<2 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x float> %3, %3
store <2 x float> %4, <2 x float>* %q
ret void
@@ -274,6 +322,8 @@ define void @test_v2f32_v4i16(<4 x i16>* %p, <2 x float>* %q) {
%1 = load <4 x i16>, <4 x i16>* %p
%2 = add <4 x i16> %1, %1
%3 = call <2 x float> @test_v2f32_v4i16_helper(<4 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x float> %3, %3
store <2 x float> %4, <2 x float>* %q
ret void
@@ -287,6 +337,8 @@ define void @test_v2f32_v8i8(<8 x i8>* %p, <2 x float>* %q) {
%1 = load <8 x i8>, <8 x i8>* %p
%2 = add <8 x i8> %1, %1
%3 = call <2 x float> @test_v2f32_v8i8_helper(<8 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x float> %3, %3
store <2 x float> %4, <2 x float>* %q
ret void
@@ -299,6 +351,8 @@ define void @test_v2i32_i64(i64* %p, <2 x i32>* %q) {
%1 = load i64, i64* %p
%2 = add i64 %1, %1
%3 = call <2 x i32> @test_v2i32_i64_helper(i64 %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i32> %3, %3
store <2 x i32> %4, <2 x i32>* %q
ret void
@@ -311,6 +365,8 @@ define void @test_v2i32_f64(double* %p, <2 x i32>* %q) {
%1 = load double, double* %p
%2 = fadd double %1, %1
%3 = call <2 x i32> @test_v2i32_f64_helper(double %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i32> %3, %3
store <2 x i32> %4, <2 x i32>* %q
ret void
@@ -323,6 +379,8 @@ define void @test_v2i32_v1i64(<1 x i64>* %p, <2 x i32>* %q) {
%1 = load <1 x i64>, <1 x i64>* %p
%2 = add <1 x i64> %1, %1
%3 = call <2 x i32> @test_v2i32_v1i64_helper(<1 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i32> %3, %3
store <2 x i32> %4, <2 x i32>* %q
ret void
@@ -336,6 +394,8 @@ define void @test_v2i32_v2f32(<2 x float>* %p, <2 x i32>* %q) {
%1 = load <2 x float>, <2 x float>* %p
%2 = fadd <2 x float> %1, %1
%3 = call <2 x i32> @test_v2i32_v2f32_helper(<2 x float> %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i32> %3, %3
store <2 x i32> %4, <2 x i32>* %q
ret void
@@ -349,6 +409,8 @@ define void @test_v2i32_v4i16(<4 x i16>* %p, <2 x i32>* %q) {
%1 = load <4 x i16>, <4 x i16>* %p
%2 = add <4 x i16> %1, %1
%3 = call <2 x i32> @test_v2i32_v4i16_helper(<4 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i32> %3, %3
store <2 x i32> %4, <2 x i32>* %q
ret void
@@ -362,6 +424,8 @@ define void @test_v2i32_v8i8(<8 x i8>* %p, <2 x i32>* %q) {
%1 = load <8 x i8>, <8 x i8>* %p
%2 = add <8 x i8> %1, %1
%3 = call <2 x i32> @test_v2i32_v8i8_helper(<8 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i32> %3, %3
store <2 x i32> %4, <2 x i32>* %q
ret void
@@ -374,6 +438,8 @@ define void @test_v4i16_i64(i64* %p, <4 x i16>* %q) {
%1 = load i64, i64* %p
%2 = add i64 %1, %1
%3 = call <4 x i16> @test_v4i16_i64_helper(i64 %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i16> %3, %3
store <4 x i16> %4, <4 x i16>* %q
ret void
@@ -386,6 +452,8 @@ define void @test_v4i16_f64(double* %p, <4 x i16>* %q) {
%1 = load double, double* %p
%2 = fadd double %1, %1
%3 = call <4 x i16> @test_v4i16_f64_helper(double %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i16> %3, %3
store <4 x i16> %4, <4 x i16>* %q
ret void
@@ -398,6 +466,8 @@ define void @test_v4i16_v1i64(<1 x i64>* %p, <4 x i16>* %q) {
%1 = load <1 x i64>, <1 x i64>* %p
%2 = add <1 x i64> %1, %1
%3 = call <4 x i16> @test_v4i16_v1i64_helper(<1 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i16> %3, %3
store <4 x i16> %4, <4 x i16>* %q
ret void
@@ -411,6 +481,8 @@ define void @test_v4i16_v2f32(<2 x float>* %p, <4 x i16>* %q) {
%1 = load <2 x float>, <2 x float>* %p
%2 = fadd <2 x float> %1, %1
%3 = call <4 x i16> @test_v4i16_v2f32_helper(<2 x float> %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i16> %3, %3
store <4 x i16> %4, <4 x i16>* %q
ret void
@@ -424,6 +496,8 @@ define void @test_v4i16_v2i32(<2 x i32>* %p, <4 x i16>* %q) {
%1 = load <2 x i32>, <2 x i32>* %p
%2 = add <2 x i32> %1, %1
%3 = call <4 x i16> @test_v4i16_v2i32_helper(<2 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i16> %3, %3
store <4 x i16> %4, <4 x i16>* %q
ret void
@@ -437,6 +511,8 @@ define void @test_v4i16_v8i8(<8 x i8>* %p, <4 x i16>* %q) {
%1 = load <8 x i8>, <8 x i8>* %p
%2 = add <8 x i8> %1, %1
%3 = call <4 x i16> @test_v4i16_v8i8_helper(<8 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i16> %3, %3
store <4 x i16> %4, <4 x i16>* %q
ret void
@@ -449,6 +525,8 @@ define void @test_v8i8_i64(i64* %p, <8 x i8>* %q) {
%1 = load i64, i64* %p
%2 = add i64 %1, %1
%3 = call <8 x i8> @test_v8i8_i64_helper(i64 %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i8> %3, %3
store <8 x i8> %4, <8 x i8>* %q
ret void
@@ -461,6 +539,8 @@ define void @test_v8i8_f64(double* %p, <8 x i8>* %q) {
%1 = load double, double* %p
%2 = fadd double %1, %1
%3 = call <8 x i8> @test_v8i8_f64_helper(double %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i8> %3, %3
store <8 x i8> %4, <8 x i8>* %q
ret void
@@ -473,6 +553,8 @@ define void @test_v8i8_v1i64(<1 x i64>* %p, <8 x i8>* %q) {
%1 = load <1 x i64>, <1 x i64>* %p
%2 = add <1 x i64> %1, %1
%3 = call <8 x i8> @test_v8i8_v1i64_helper(<1 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i8> %3, %3
store <8 x i8> %4, <8 x i8>* %q
ret void
@@ -486,6 +568,8 @@ define void @test_v8i8_v2f32(<2 x float>* %p, <8 x i8>* %q) {
%1 = load <2 x float>, <2 x float>* %p
%2 = fadd <2 x float> %1, %1
%3 = call <8 x i8> @test_v8i8_v2f32_helper(<2 x float> %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i8> %3, %3
store <8 x i8> %4, <8 x i8>* %q
ret void
@@ -499,6 +583,8 @@ define void @test_v8i8_v2i32(<2 x i32>* %p, <8 x i8>* %q) {
%1 = load <2 x i32>, <2 x i32>* %p
%2 = add <2 x i32> %1, %1
%3 = call <8 x i8> @test_v8i8_v2i32_helper(<2 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i8> %3, %3
store <8 x i8> %4, <8 x i8>* %q
ret void
@@ -512,6 +598,8 @@ define void @test_v8i8_v4i16(<4 x i16>* %p, <8 x i8>* %q) {
%1 = load <4 x i16>, <4 x i16>* %p
%2 = add <4 x i16> %1, %1
%3 = call <8 x i8> @test_v8i8_v4i16_helper(<4 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i8> %3, %3
store <8 x i8> %4, <8 x i8>* %q
ret void
@@ -524,6 +612,8 @@ define void @test_f128_v2f64(<2 x double>* %p, fp128* %q) {
%1 = load <2 x double>, <2 x double>* %p
%2 = fadd <2 x double> %1, %1
%3 = call fp128 @test_f128_v2f64_helper(<2 x double> %2)
+ br label %return_bb
+return_bb:
%4 = fadd fp128 %3, %3
store fp128 %4, fp128* %q
ret void
@@ -536,6 +626,8 @@ define void @test_f128_v2i64(<2 x i64>* %p, fp128* %q) {
%1 = load <2 x i64>, <2 x i64>* %p
%2 = add <2 x i64> %1, %1
%3 = call fp128 @test_f128_v2i64_helper(<2 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = fadd fp128 %3, %3
store fp128 %4, fp128* %q
ret void
@@ -549,6 +641,8 @@ define void @test_f128_v4f32(<4 x float>* %p, fp128* %q) {
%1 = load <4 x float>, <4 x float>* %p
%2 = fadd <4 x float> %1, %1
%3 = call fp128 @test_f128_v4f32_helper(<4 x float> %2)
+ br label %return_bb
+return_bb:
%4 = fadd fp128 %3, %3
store fp128 %4, fp128* %q
ret void
@@ -562,6 +656,8 @@ define void @test_f128_v4i32(<4 x i32>* %p, fp128* %q) {
%1 = load <4 x i32>, <4 x i32>* %p
%2 = add <4 x i32> %1, %1
%3 = call fp128 @test_f128_v4i32_helper(<4 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = fadd fp128 %3, %3
store fp128 %4, fp128* %q
ret void
@@ -575,6 +671,8 @@ define void @test_f128_v8i16(<8 x i16>* %p, fp128* %q) {
%1 = load <8 x i16>, <8 x i16>* %p
%2 = add <8 x i16> %1, %1
%3 = call fp128 @test_f128_v8i16_helper(<8 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = fadd fp128 %3, %3
store fp128 %4, fp128* %q
ret void
@@ -588,6 +686,8 @@ define void @test_f128_v16i8(<16 x i8>* %p, fp128* %q) {
%1 = load <16 x i8>, <16 x i8>* %p
%2 = add <16 x i8> %1, %1
%3 = call fp128 @test_f128_v16i8_helper(<16 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = fadd fp128 %3, %3
store fp128 %4, fp128* %q
ret void
@@ -600,6 +700,8 @@ define void @test_v2f64_f128(fp128* %p, <2 x double>* %q) {
%1 = load fp128, fp128* %p
%2 = fadd fp128 %1, %1
%3 = call <2 x double> @test_v2f64_f128_helper(fp128 %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x double> %3, %3
store <2 x double> %4, <2 x double>* %q
ret void
@@ -613,6 +715,8 @@ define void @test_v2f64_v2i64(<2 x i64>* %p, <2 x double>* %q) {
%1 = load <2 x i64>, <2 x i64>* %p
%2 = add <2 x i64> %1, %1
%3 = call <2 x double> @test_v2f64_v2i64_helper(<2 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x double> %3, %3
store <2 x double> %4, <2 x double>* %q
ret void
@@ -627,6 +731,8 @@ define void @test_v2f64_v4f32(<4 x float>* %p, <2 x double>* %q) {
%1 = load <4 x float>, <4 x float>* %p
%2 = fadd <4 x float> %1, %1
%3 = call <2 x double> @test_v2f64_v4f32_helper(<4 x float> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x double> %3, %3
store <2 x double> %4, <2 x double>* %q
ret void
@@ -641,6 +747,8 @@ define void @test_v2f64_v4i32(<4 x i32>* %p, <2 x double>* %q) {
%1 = load <4 x i32>, <4 x i32>* %p
%2 = add <4 x i32> %1, %1
%3 = call <2 x double> @test_v2f64_v4i32_helper(<4 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x double> %3, %3
store <2 x double> %4, <2 x double>* %q
ret void
@@ -655,6 +763,8 @@ define void @test_v2f64_v8i16(<8 x i16>* %p, <2 x double>* %q) {
%1 = load <8 x i16>, <8 x i16>* %p
%2 = add <8 x i16> %1, %1
%3 = call <2 x double> @test_v2f64_v8i16_helper(<8 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x double> %3, %3
store <2 x double> %4, <2 x double>* %q
ret void
@@ -669,6 +779,8 @@ define void @test_v2f64_v16i8(<16 x i8>* %p, <2 x double>* %q) {
%1 = load <16 x i8>, <16 x i8>* %p
%2 = add <16 x i8> %1, %1
%3 = call <2 x double> @test_v2f64_v16i8_helper(<16 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <2 x double> %3, %3
store <2 x double> %4, <2 x double>* %q
ret void
@@ -681,6 +793,8 @@ define void @test_v2i64_f128(fp128* %p, <2 x i64>* %q) {
%1 = load fp128, fp128* %p
%2 = fadd fp128 %1, %1
%3 = call <2 x i64> @test_v2i64_f128_helper(fp128 %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i64> %3, %3
store <2 x i64> %4, <2 x i64>* %q
ret void
@@ -694,6 +808,8 @@ define void @test_v2i64_v2f64(<2 x double>* %p, <2 x i64>* %q) {
%1 = load <2 x double>, <2 x double>* %p
%2 = fadd <2 x double> %1, %1
%3 = call <2 x i64> @test_v2i64_v2f64_helper(<2 x double> %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i64> %3, %3
store <2 x i64> %4, <2 x i64>* %q
ret void
@@ -708,6 +824,8 @@ define void @test_v2i64_v4f32(<4 x float>* %p, <2 x i64>* %q) {
%1 = load <4 x float>, <4 x float>* %p
%2 = fadd <4 x float> %1, %1
%3 = call <2 x i64> @test_v2i64_v4f32_helper(<4 x float> %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i64> %3, %3
store <2 x i64> %4, <2 x i64>* %q
ret void
@@ -722,6 +840,8 @@ define void @test_v2i64_v4i32(<4 x i32>* %p, <2 x i64>* %q) {
%1 = load <4 x i32>, <4 x i32>* %p
%2 = add <4 x i32> %1, %1
%3 = call <2 x i64> @test_v2i64_v4i32_helper(<4 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i64> %3, %3
store <2 x i64> %4, <2 x i64>* %q
ret void
@@ -736,6 +856,8 @@ define void @test_v2i64_v8i16(<8 x i16>* %p, <2 x i64>* %q) {
%1 = load <8 x i16>, <8 x i16>* %p
%2 = add <8 x i16> %1, %1
%3 = call <2 x i64> @test_v2i64_v8i16_helper(<8 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i64> %3, %3
store <2 x i64> %4, <2 x i64>* %q
ret void
@@ -750,6 +872,8 @@ define void @test_v2i64_v16i8(<16 x i8>* %p, <2 x i64>* %q) {
%1 = load <16 x i8>, <16 x i8>* %p
%2 = add <16 x i8> %1, %1
%3 = call <2 x i64> @test_v2i64_v16i8_helper(<16 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = add <2 x i64> %3, %3
store <2 x i64> %4, <2 x i64>* %q
ret void
@@ -763,6 +887,8 @@ define void @test_v4f32_f128(fp128* %p, <4 x float>* %q) {
%1 = load fp128, fp128* %p
%2 = fadd fp128 %1, %1
%3 = call <4 x float> @test_v4f32_f128_helper(fp128 %2)
+ br label %return_bb
+return_bb:
%4 = fadd <4 x float> %3, %3
store <4 x float> %4, <4 x float>* %q
ret void
@@ -777,6 +903,8 @@ define void @test_v4f32_v2f64(<2 x double>* %p, <4 x float>* %q) {
%1 = load <2 x double>, <2 x double>* %p
%2 = fadd <2 x double> %1, %1
%3 = call <4 x float> @test_v4f32_v2f64_helper(<2 x double> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <4 x float> %3, %3
store <4 x float> %4, <4 x float>* %q
ret void
@@ -791,6 +919,8 @@ define void @test_v4f32_v2i64(<2 x i64>* %p, <4 x float>* %q) {
%1 = load <2 x i64>, <2 x i64>* %p
%2 = add <2 x i64> %1, %1
%3 = call <4 x float> @test_v4f32_v2i64_helper(<2 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <4 x float> %3, %3
store <4 x float> %4, <4 x float>* %q
ret void
@@ -806,6 +936,8 @@ define void @test_v4f32_v4i32(<4 x i32>* %p, <4 x float>* %q) {
%1 = load <4 x i32>, <4 x i32>* %p
%2 = add <4 x i32> %1, %1
%3 = call <4 x float> @test_v4f32_v4i32_helper(<4 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <4 x float> %3, %3
store <4 x float> %4, <4 x float>* %q
ret void
@@ -821,6 +953,8 @@ define void @test_v4f32_v8i16(<8 x i16>* %p, <4 x float>* %q) {
%1 = load <8 x i16>, <8 x i16>* %p
%2 = add <8 x i16> %1, %1
%3 = call <4 x float> @test_v4f32_v8i16_helper(<8 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <4 x float> %3, %3
store <4 x float> %4, <4 x float>* %q
ret void
@@ -836,6 +970,8 @@ define void @test_v4f32_v16i8(<16 x i8>* %p, <4 x float>* %q) {
%1 = load <16 x i8>, <16 x i8>* %p
%2 = add <16 x i8> %1, %1
%3 = call <4 x float> @test_v4f32_v16i8_helper(<16 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = fadd <4 x float> %3, %3
store <4 x float> %4, <4 x float>* %q
ret void
@@ -849,6 +985,8 @@ define void @test_v4i32_f128(fp128* %p, <4 x i32>* %q) {
%1 = load fp128, fp128* %p
%2 = fadd fp128 %1, %1
%3 = call <4 x i32> @test_v4i32_f128_helper(fp128 %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i32> %3, %3
store <4 x i32> %4, <4 x i32>* %q
ret void
@@ -863,6 +1001,8 @@ define void @test_v4i32_v2f64(<2 x double>* %p, <4 x i32>* %q) {
%1 = load <2 x double>, <2 x double>* %p
%2 = fadd <2 x double> %1, %1
%3 = call <4 x i32> @test_v4i32_v2f64_helper(<2 x double> %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i32> %3, %3
store <4 x i32> %4, <4 x i32>* %q
ret void
@@ -877,6 +1017,8 @@ define void @test_v4i32_v2i64(<2 x i64>* %p, <4 x i32>* %q) {
%1 = load <2 x i64>, <2 x i64>* %p
%2 = add <2 x i64> %1, %1
%3 = call <4 x i32> @test_v4i32_v2i64_helper(<2 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i32> %3, %3
store <4 x i32> %4, <4 x i32>* %q
ret void
@@ -892,6 +1034,8 @@ define void @test_v4i32_v4f32(<4 x float>* %p, <4 x i32>* %q) {
%1 = load <4 x float>, <4 x float>* %p
%2 = fadd <4 x float> %1, %1
%3 = call <4 x i32> @test_v4i32_v4f32_helper(<4 x float> %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i32> %3, %3
store <4 x i32> %4, <4 x i32>* %q
ret void
@@ -907,6 +1051,8 @@ define void @test_v4i32_v8i16(<8 x i16>* %p, <4 x i32>* %q) {
%1 = load <8 x i16>, <8 x i16>* %p
%2 = add <8 x i16> %1, %1
%3 = call <4 x i32> @test_v4i32_v8i16_helper(<8 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i32> %3, %3
store <4 x i32> %4, <4 x i32>* %q
ret void
@@ -922,6 +1068,8 @@ define void @test_v4i32_v16i8(<16 x i8>* %p, <4 x i32>* %q) {
%1 = load <16 x i8>, <16 x i8>* %p
%2 = add <16 x i8> %1, %1
%3 = call <4 x i32> @test_v4i32_v16i8_helper(<16 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = add <4 x i32> %3, %3
store <4 x i32> %4, <4 x i32>* %q
ret void
@@ -935,6 +1083,8 @@ define void @test_v8i16_f128(fp128* %p, <8 x i16>* %q) {
%1 = load fp128, fp128* %p
%2 = fadd fp128 %1, %1
%3 = call <8 x i16> @test_v8i16_f128_helper(fp128 %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i16> %3, %3
store <8 x i16> %4, <8 x i16>* %q
ret void
@@ -949,6 +1099,8 @@ define void @test_v8i16_v2f64(<2 x double>* %p, <8 x i16>* %q) {
%1 = load <2 x double>, <2 x double>* %p
%2 = fadd <2 x double> %1, %1
%3 = call <8 x i16> @test_v8i16_v2f64_helper(<2 x double> %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i16> %3, %3
store <8 x i16> %4, <8 x i16>* %q
ret void
@@ -963,6 +1115,8 @@ define void @test_v8i16_v2i64(<2 x i64>* %p, <8 x i16>* %q) {
%1 = load <2 x i64>, <2 x i64>* %p
%2 = add <2 x i64> %1, %1
%3 = call <8 x i16> @test_v8i16_v2i64_helper(<2 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i16> %3, %3
store <8 x i16> %4, <8 x i16>* %q
ret void
@@ -978,6 +1132,8 @@ define void @test_v8i16_v4f32(<4 x float>* %p, <8 x i16>* %q) {
%1 = load <4 x float>, <4 x float>* %p
%2 = fadd <4 x float> %1, %1
%3 = call <8 x i16> @test_v8i16_v4f32_helper(<4 x float> %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i16> %3, %3
store <8 x i16> %4, <8 x i16>* %q
ret void
@@ -993,6 +1149,8 @@ define void @test_v8i16_v4i32(<4 x i32>* %p, <8 x i16>* %q) {
%1 = load <4 x i32>, <4 x i32>* %p
%2 = add <4 x i32> %1, %1
%3 = call <8 x i16> @test_v8i16_v4i32_helper(<4 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i16> %3, %3
store <8 x i16> %4, <8 x i16>* %q
ret void
@@ -1008,6 +1166,8 @@ define void @test_v8i16_v16i8(<16 x i8>* %p, <8 x i16>* %q) {
%1 = load <16 x i8>, <16 x i8>* %p
%2 = add <16 x i8> %1, %1
%3 = call <8 x i16> @test_v8i16_v16i8_helper(<16 x i8> %2)
+ br label %return_bb
+return_bb:
%4 = add <8 x i16> %3, %3
store <8 x i16> %4, <8 x i16>* %q
ret void
@@ -1021,6 +1181,8 @@ define void @test_v16i8_f128(fp128* %p, <16 x i8>* %q) {
%1 = load fp128, fp128* %p
%2 = fadd fp128 %1, %1
%3 = call <16 x i8> @test_v16i8_f128_helper(fp128 %2)
+ br label %return_bb
+return_bb:
%4 = add <16 x i8> %3, %3
store <16 x i8> %4, <16 x i8>* %q
ret void
@@ -1035,6 +1197,8 @@ define void @test_v16i8_v2f64(<2 x double>* %p, <16 x i8>* %q) {
%1 = load <2 x double>, <2 x double>* %p
%2 = fadd <2 x double> %1, %1
%3 = call <16 x i8> @test_v16i8_v2f64_helper(<2 x double> %2)
+ br label %return_bb
+return_bb:
%4 = add <16 x i8> %3, %3
store <16 x i8> %4, <16 x i8>* %q
ret void
@@ -1049,6 +1213,8 @@ define void @test_v16i8_v2i64(<2 x i64>* %p, <16 x i8>* %q) {
%1 = load <2 x i64>, <2 x i64>* %p
%2 = add <2 x i64> %1, %1
%3 = call <16 x i8> @test_v16i8_v2i64_helper(<2 x i64> %2)
+ br label %return_bb
+return_bb:
%4 = add <16 x i8> %3, %3
store <16 x i8> %4, <16 x i8>* %q
ret void
@@ -1064,6 +1230,8 @@ define void @test_v16i8_v4f32(<4 x float>* %p, <16 x i8>* %q) {
%1 = load <4 x float>, <4 x float>* %p
%2 = fadd <4 x float> %1, %1
%3 = call <16 x i8> @test_v16i8_v4f32_helper(<4 x float> %2)
+ br label %return_bb
+return_bb:
%4 = add <16 x i8> %3, %3
store <16 x i8> %4, <16 x i8>* %q
ret void
@@ -1079,6 +1247,8 @@ define void @test_v16i8_v4i32(<4 x i32>* %p, <16 x i8>* %q) {
%1 = load <4 x i32>, <4 x i32>* %p
%2 = add <4 x i32> %1, %1
%3 = call <16 x i8> @test_v16i8_v4i32_helper(<4 x i32> %2)
+ br label %return_bb
+return_bb:
%4 = add <16 x i8> %3, %3
store <16 x i8> %4, <16 x i8>* %q
ret void
@@ -1094,6 +1264,8 @@ define void @test_v16i8_v8i16(<8 x i16>* %p, <16 x i8>* %q) {
%1 = load <8 x i16>, <8 x i16>* %p
%2 = add <8 x i16> %1, %1
%3 = call <16 x i8> @test_v16i8_v8i16_helper(<8 x i16> %2)
+ br label %return_bb
+return_bb:
%4 = add <16 x i8> %3, %3
store <16 x i8> %4, <16 x i8>* %q
ret void
diff --git a/test/CodeGen/AArch64/arm64-call-tailcalls.ll b/test/CodeGen/AArch64/arm64-call-tailcalls.ll
index 71d9327..6621db2 100644
--- a/test/CodeGen/AArch64/arm64-call-tailcalls.ll
+++ b/test/CodeGen/AArch64/arm64-call-tailcalls.ll
@@ -53,9 +53,9 @@ bb: ; preds = %entry
define i32 @t8(i32 %x) nounwind ssp {
; CHECK-LABEL: t8:
+; CHECK: b _c
; CHECK: b _a
; CHECK: b _b
-; CHECK: b _c
%and = and i32 %x, 1
%tobool = icmp eq i32 %and, 0
br i1 %tobool, label %if.end, label %if.then
diff --git a/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll b/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll
new file mode 100644
index 0000000..f0b8299
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll
@@ -0,0 +1,638 @@
+; RUN: opt -codegenprepare < %s -mtriple=aarch64-apple-ios -S | FileCheck %s --check-prefix=OPTALL --check-prefix=OPT --check-prefix=NONSTRESS
+; RUN: opt -codegenprepare < %s -mtriple=aarch64-apple-ios -S -stress-cgp-ext-ld-promotion | FileCheck %s --check-prefix=OPTALL --check-prefix=OPT --check-prefix=STRESS
+; RUN: opt -codegenprepare < %s -mtriple=aarch64-apple-ios -S -disable-cgp-ext-ld-promotion | FileCheck %s --check-prefix=OPTALL --check-prefix=DISABLE
+
+; CodeGenPrepare should move the zext into the block with the load
+; so that SelectionDAG can select it with the load.
+;
+; OPTALL-LABEL: @foo
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+; OPTALL-NEXT: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; OPTALL: store i32 [[ZEXT]], i32* %q
+; OPTALL: ret
+define void @foo(i8* %p, i32* %q) {
+entry:
+ %t = load i8, i8* %p
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = zext i8 %t to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
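+
+; For illustration, a sketch of the IR expected after CodeGenPrepare runs on
+; @foo (the %promoted name is hypothetical): the zext is moved next to the
+; load so SelectionDAG can fold the pair into a zextload.
+;   entry:
+;     %t = load i8, i8* %p
+;     %promoted = zext i8 %t to i32
+;     %a = icmp slt i8 %t, 20
+;     br i1 %a, label %true, label %false
+;   true:
+;     store i32 %promoted, i32* %q
+;     ret void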
+
+; Check that we manage to form a zextload when an operation with only one
+; argument to explicitly extend is in the way.
+; OPTALL-LABEL: @promoteOneArg
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+; OPT-NEXT: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT]], 2
+; Make sure the operation is not promoted when the promotion pass is disabled.
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], 2
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteOneArg(i8* %p, i32* %q) {
+entry:
+ %t = load i8, i8* %p
+ %add = add nuw i8 %t, 2
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = zext i8 %add to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
+
+; Check that we manage to form a sextload when an operation with only one
+; argument to explicitly extend is in the way.
+; Version with sext.
+; OPTALL-LABEL: @promoteOneArgSExt
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+; OPT-NEXT: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i8 [[LD]] to i32
+; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SEXT]], 2
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[LD]], 2
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i32
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteOneArgSExt(i8* %p, i32* %q) {
+entry:
+ %t = load i8, i8* %p
+ %add = add nsw i8 %t, 2
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = sext i8 %add to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
+
+; Check that we manage to form a zextload when an operation with two
+; arguments to explicitly extend is in the way.
+; Extending %add will create two extensions:
+; 1. One for %b.
+; 2. One for %t.
+; #1 will not be removed as we do not know anything about %b.
+; #2 may not be merged with the load because %t is used in a comparison.
+; Since two extensions may be emitted in the end instead of one before the
+; transformation, the regular heuristic does not apply the optimization.
+;
+; OPTALL-LABEL: @promoteTwoArgZext
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i8 %b to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXTLD]], [[ZEXTB]]
+;
+; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], %b
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+;
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], %b
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+;
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteTwoArgZext(i8* %p, i32* %q, i8 %b) {
+entry:
+ %t = load i8, i8* %p
+ %add = add nuw i8 %t, %b
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = zext i8 %add to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
+
+; Check that we manage to form a sextload when an operation with two
+; arguments to explicitly extend is in the way.
+; Version with sext.
+; OPTALL-LABEL: @promoteTwoArgSExt
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[SEXTLD:%[a-zA-Z_0-9-]+]] = sext i8 [[LD]] to i32
+; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i8 %b to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SEXTLD]], [[SEXTB]]
+;
+; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[LD]], %b
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i32
+;
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[LD]], %b
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i32
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteTwoArgSExt(i8* %p, i32* %q, i8 %b) {
+entry:
+ %t = load i8, i8* %p
+ %add = add nsw i8 %t, %b
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = sext i8 %add to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
+
+; Check that we do not form a zextload if we need to introduce more than
+; one additional extension.
+; OPTALL-LABEL: @promoteThreeArgZext
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i8 %b to i32
+; STRESS-NEXT: [[TMP:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXTLD]], [[ZEXTB]]
+; STRESS-NEXT: [[ZEXTC:%[a-zA-Z_0-9-]+]] = zext i8 %c to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[TMP]], [[ZEXTC]]
+;
+; NONSTRESS-NEXT: [[TMP:%[a-zA-Z_0-9-]+]] = add nuw i8 [[LD]], %b
+; NONSTRESS-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8 [[TMP]], %c
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+;
+; DISABLE: add nuw i8
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i8
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i8 [[ADD]] to i32
+;
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteThreeArgZext(i8* %p, i32* %q, i8 %b, i8 %c) {
+entry:
+ %t = load i8, i8* %p
+ %tmp = add nuw i8 %t, %b
+ %add = add nuw i8 %tmp, %c
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = zext i8 %add to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
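+
+; Cost tally for the heuristic above: promoting %add through the load would
+; remove the final zext but introduce two new ones (for %b and %c), a net
+; loss, which is why only stress mode performs it.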
+
+; Check that we manage to form a zextload after promoting and merging
+; two extensions.
+; OPTALL-LABEL: @promoteMergeExtArgZExt
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i16 %b to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXTLD]], [[ZEXTB]]
+;
+; NONSTRESS: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
+; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i16 [[ZEXTLD]], %b
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = zext i16 [[ADD]] to i32
+;
+; DISABLE: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw i16 [[ZEXTLD]], %b
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = zext i16 [[ADD]] to i32
+;
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteMergeExtArgZExt(i8* %p, i32* %q, i16 %b) {
+entry:
+ %t = load i8, i8* %p
+ %ext = zext i8 %t to i16
+ %add = add nuw i16 %ext, %b
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = zext i16 %add to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
+
+; Check that we manage to form a sextload after promoting and merging
+; two extensions.
+; Version with sext.
+; OPTALL-LABEL: @promoteMergeExtArgSExt
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; STRESS-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = sext i16 %b to i32
+; STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXTLD]], [[ZEXTB]]
+;
+; NONSTRESS: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
+; NONSTRESS: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i16 [[ZEXTLD]], %b
+; NONSTRESS: [[RES:%[a-zA-Z_0-9-]+]] = sext i16 [[ADD]] to i32
+;
+; DISABLE: [[ZEXTLD:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i16
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i16 [[ZEXTLD]], %b
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i16 [[ADD]] to i32
+; OPTALL: store i32 [[RES]], i32* %q
+; OPTALL: ret
+define void @promoteMergeExtArgSExt(i8* %p, i32* %q, i16 %b) {
+entry:
+ %t = load i8, i8* %p
+ %ext = zext i8 %t to i16
+ %add = add nsw i16 %ext, %b
+ %a = icmp slt i8 %t, 20
+ br i1 %a, label %true, label %false
+true:
+ %s = sext i16 %add to i32
+ store i32 %s, i32* %q
+ ret void
+false:
+ ret void
+}
+
+; Check that we manage to catch all the extload opportunities that are exposed
+; by the different iterations of codegen prepare.
+; Moreover, check that we do not promote more than we need to.
+; Here is what is happening in this test (not necessarily in this order):
+; 1. We try to promote the operand of %sextadd.
+; a. This creates one sext of %ld2 and one of %zextld
+; b. The sext of %ld2 can be combined with %ld2, so we remove one sext but
+; introduce one. This is fine with the current heuristic: neutral.
+; => We have one zext of %zextld left and we created one sext of %ld2.
+; 2. We try to promote the operand of %sextaddza.
+; a. This creates one sext of %zexta and one of %zextld
+; b. The sext of %zexta does not lead to any load; it stays here, even if it
+; could have been combined with the zext of %a.
+; c. The sext of %zextld leads to %ld and can be combined with it. This is
+; done by promoting %zextld. This is fine with the current heuristic:
+; neutral.
+; => We have created a new zext of %ld and we created one sext of %zexta.
+; 3. We try to promote the operand of %sextaddb.
+; a. This creates one sext of %b and one of %zextld
+; b. The sext of %b is a dead-end, nothing to be done.
+; c. Same thing as 2.c. happens.
+; => We have created a new zext of %ld and we created one sext of %b.
+; 4. We try to promote the operand of the zext of %zextld introduced in #1.
+; a. Same thing as 2.c. happens.
+; b. %zextld does not have any other uses. It is dead-code eliminated.
+; => We have created a new zext of %ld and we removed a zext of %zextld and
+; a zext of %ld.
+; Currently we do not try to reuse existing extensions, so in the end we have
+; three identical zexts of %ld; SelectionDAG will CSE them.
+;
+; OPTALL-LABEL: @severalPromotions
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %addr1
+; OPT-NEXT: [[ZEXTLD1_1:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[ZEXTLD1_2:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[ZEXTLD1_3:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[LD2:%[a-zA-Z_0-9-]+]] = load i32, i32* %addr2
+; OPT-NEXT: [[SEXTLD2:%[a-zA-Z_0-9-]+]] = sext i32 [[LD2]] to i64
+; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTLD2]], [[ZEXTLD1_1]]
+; We do not combine this one: see 2.b.
+; OPT-NEXT: [[ZEXTA:%[a-zA-Z_0-9-]+]] = zext i8 %a to i32
+; OPT-NEXT: [[SEXTZEXTA:%[a-zA-Z_0-9-]+]] = sext i32 [[ZEXTA]] to i64
+; OPT-NEXT: [[RESZA:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTZEXTA]], [[ZEXTLD1_3]]
+; OPT-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; OPT-NEXT: [[RESB:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTB]], [[ZEXTLD1_2]]
+;
+; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32
+; DISABLE: [[RES:%[a-zA-Z_0-9-]+]] = sext i32 [[ADD]] to i64
+; DISABLE: [[ADDZA:%[a-zA-Z_0-9-]+]] = add nsw i32
+; DISABLE: [[RESZA:%[a-zA-Z_0-9-]+]] = sext i32 [[ADDZA]] to i64
+; DISABLE: [[ADDB:%[a-zA-Z_0-9-]+]] = add nsw i32
+; DISABLE: [[RESB:%[a-zA-Z_0-9-]+]] = sext i32 [[ADDB]] to i64
+;
+; OPTALL: call void @dummy(i64 [[RES]], i64 [[RESZA]], i64 [[RESB]])
+; OPTALL: ret
+define void @severalPromotions(i8* %addr1, i32* %addr2, i8 %a, i32 %b) {
+ %ld = load i8, i8* %addr1
+ %zextld = zext i8 %ld to i32
+ %ld2 = load i32, i32* %addr2
+ %add = add nsw i32 %ld2, %zextld
+ %sextadd = sext i32 %add to i64
+ %zexta = zext i8 %a to i32
+ %addza = add nsw i32 %zexta, %zextld
+ %sextaddza = sext i32 %addza to i64
+ %addb = add nsw i32 %b, %zextld
+ %sextaddb = sext i32 %addb to i64
+ call void @dummy(i64 %sextadd, i64 %sextaddza, i64 %sextaddb)
+ ret void
+}
+
+declare void @dummy(i64, i64, i64)
+
+; Make sure we do not try to promote vector types since the type promotion
+; helper does not support them for now.
+; OPTALL-LABEL: @vectorPromotion
+; OPTALL: [[SHL:%[a-zA-Z_0-9-]+]] = shl nuw nsw <2 x i32> zeroinitializer, <i32 8, i32 8>
+; OPTALL: [[ZEXT:%[a-zA-Z_0-9-]+]] = zext <2 x i32> [[SHL]] to <2 x i64>
+; OPTALL: ret
+define void @vectorPromotion() {
+entry:
+ %a = shl nuw nsw <2 x i32> zeroinitializer, <i32 8, i32 8>
+ %b = zext <2 x i32> %a to <2 x i64>
+ ret void
+}
+
+@a = common global i32 0, align 4
+@c = common global [2 x i32] zeroinitializer, align 4
+
+; Make sure we support promotion of operands that produce a Value as opposed
+; to an Instruction.
+; This used to cause a crash.
+; OPTALL-LABEL: @promotionOfArgEndsUpInValue
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i16, i16* %addr
+;
+; OPT-NEXT: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i16 [[LD]] to i32
+; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nuw nsw i32 [[SEXT]], zext (i1 icmp ne (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @c, i64 0, i64 1), i32* @a) to i32)
+;
+; DISABLE-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = add nuw nsw i16 [[LD]], zext (i1 icmp ne (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @c, i64 0, i64 1), i32* @a) to i16)
+; DISABLE-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = sext i16 [[ADD]] to i32
+;
+; OPTALL-NEXT: ret i32 [[RES]]
+define i32 @promotionOfArgEndsUpInValue(i16* %addr) {
+entry:
+ %val = load i16, i16* %addr
+ %add = add nuw nsw i16 %val, zext (i1 icmp ne (i32* getelementptr inbounds ([2 x i32], [2 x i32]* @c, i64 0, i64 1), i32* @a) to i16)
+ %conv3 = sext i16 %add to i32
+ ret i32 %conv3
+}
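+
+; Note: the second operand of the add above is a constant expression (a zext
+; of an icmp over global addresses). A ConstantExpr is a Value but not an
+; Instruction, which is the case the promotion helper used to crash on.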
+
+; Check that we see that one zext can be derived from the other for free.
+; OPTALL-LABEL: @promoteTwoArgZextWithSourceExtendedTwice
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; OPT-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; OPT-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], %b
+; OPT-NEXT: [[RES64:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ZEXT64]], 12
+; OPT-NEXT: store i32 [[RES32]], i32* %addr
+; OPT-NEXT: store i64 [[RES64]], i64* %q
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[RES2_32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], 12
+; DISABLE-NEXT: store i32 [[RES32]], i32* %addr
+; DISABLE-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i32 [[RES2_32]] to i64
+; DISABLE-NEXT: store i64 [[ZEXT64]], i64* %q
+;
+; OPTALL-NEXT: ret void
+define void @promoteTwoArgZextWithSourceExtendedTwice(i8* %p, i64* %q, i32 %b, i32* %addr) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nuw i32 %zextt, %b
+ %add2 = add nuw i32 %zextt, 12
+ store i32 %add, i32 *%addr
+ %s = zext i32 %add2 to i64
+ store i64 %s, i64* %q
+ ret void
+}
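+
+; On AArch64 both extensions above end up free once the load is a zextload:
+; ldrb zeroes the upper bits of its destination register, so (a sketch,
+; register names hypothetical):
+;   ldrb w8, [x0]    ; w8 is the i32 zext, x8 is already the i64 zext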
+
+; Check that we do not increase the cost of the code.
+; The input has one free zext and one free sext. If we would have promoted
+; all the way through the load we would end up with a free zext and a
+; non-free sext (of %b).
+; OPTALL-LABEL: @doNotPromoteFreeSExtFromAddrMode
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; STRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
+; STRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = trunc i64 [[IDX64]] to i32
+;
+; NONSTRESS-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; NONSTRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; NONSTRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i32, i32* %addr, i64 [[IDX64]]
+; OPTALL-NEXT: store i32 [[RES32]], i32* [[GEP]]
+; OPTALL-NEXT: ret void
+define void @doNotPromoteFreeSExtFromAddrMode(i8* %p, i32 %b, i32* %addr) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nsw i32 %zextt, %b
+ %idx64 = sext i32 %add to i64
+ %staddr = getelementptr inbounds i32, i32* %addr, i64 %idx64
+ store i32 %add, i32 *%staddr
+ ret void
+}
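+
+; The sext is free in the non-promoted form because it folds into the
+; store's addressing mode (a sketch, register allocation may differ):
+;   ldrb w8, [x0]
+;   add  w8, w8, w1
+;   str  w8, [x2, w8, sxtw #2]   ; the i32 -> i64 sext is absorbed here
+; The same reasoning applies to the i64 and i128 variants that follow.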
+
+; Check that we do not increase the cost of the code.
+; The input has one free zext and one free sext. If we would have promoted
+; all the way through the load we would end up with a free zext and a
+; non-free sext (of %b).
+; OPTALL-LABEL: @doNotPromoteFreeSExtFromAddrMode64
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; STRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
+;
+; NONSTRESS-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; NONSTRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; NONSTRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i64, i64* %addr, i64 [[IDX64]]
+; OPTALL-NEXT: store i64 %stuff, i64* [[GEP]]
+; OPTALL-NEXT: ret void
+define void @doNotPromoteFreeSExtFromAddrMode64(i8* %p, i32 %b, i64* %addr, i64 %stuff) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nsw i32 %zextt, %b
+ %idx64 = sext i32 %add to i64
+ %staddr = getelementptr inbounds i64, i64* %addr, i64 %idx64
+ store i64 %stuff, i64 *%staddr
+ ret void
+}
+
+; Check that we do not increase the cost of the code.
+; The input has one free zext and one free sext. If we would have promoted
+; all the way through the load we would end up with a free zext and a
+; non-free sext (of %b).
+; OPTALL-LABEL: @doNotPromoteFreeSExtFromAddrMode128
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; STRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
+;
+; NONSTRESS-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; NONSTRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; NONSTRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i128, i128* %addr, i64 [[IDX64]]
+; OPTALL-NEXT: store i128 %stuff, i128* [[GEP]]
+; OPTALL-NEXT: ret void
+define void @doNotPromoteFreeSExtFromAddrMode128(i8* %p, i32 %b, i128* %addr, i128 %stuff) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nsw i32 %zextt, %b
+ %idx64 = sext i32 %add to i64
+ %staddr = getelementptr inbounds i128, i128* %addr, i64 %idx64
+ store i128 %stuff, i128 *%staddr
+ ret void
+}
+
+
+; Check that the promotion does happen here, unlike in the variants above:
+; no AArch64 addressing mode can absorb a sext for an i256 access, so the
+; sext was never free to begin with and promoting all the way through the
+; load does not increase the cost.
+; OPTALL-LABEL: @promoteSExtFromAddrMode256
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; OPT-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; OPT-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i256, i256* %addr, i64 [[IDX64]]
+; OPTALL-NEXT: store i256 %stuff, i256* [[GEP]]
+; OPTALL-NEXT: ret void
+define void @promoteSExtFromAddrMode256(i8* %p, i32 %b, i256* %addr, i256 %stuff) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nsw i32 %zextt, %b
+ %idx64 = sext i32 %add to i64
+ %staddr = getelementptr inbounds i256, i256* %addr, i64 %idx64
+ store i256 %stuff, i256 *%staddr
+ ret void
+}
+
+; Check that we do not increase the cost of the code.
+; The input has two free zexts.
+; When we promote all the way through the load, we end up with
+; a free zext and a non-free zext (of %b).
+; However, the current target lowering says a zext from i32 to i64 is free,
+; so the promotion happens: the cost does not change and it may
+; expose more opportunities.
+; This would need to be fixed at some point.
+; OPTALL-LABEL: @doNotPromoteFreeZExtFromAddrMode
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; This transformation should really happen only in stress mode.
+; OPT-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i32 %b to i64
+; OPT-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ZEXT64]], [[ZEXTB]]
+; OPT-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = trunc i64 [[IDX64]] to i32
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = zext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i32, i32* %addr, i64 [[IDX64]]
+; OPTALL-NEXT: store i32 [[RES32]], i32* [[GEP]]
+; OPTALL-NEXT: ret void
+define void @doNotPromoteFreeZExtFromAddrMode(i8* %p, i32 %b, i32* %addr) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nuw i32 %zextt, %b
+ %idx64 = zext i32 %add to i64
+ %staddr = getelementptr inbounds i32, i32* %addr, i64 %idx64
+ store i32 %add, i32 *%staddr
+ ret void
+}
+
+; OPTALL-LABEL: @doNotPromoteFreeSExtFromShift
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; STRESS-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; STRESS-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
+; STRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXT64]], [[SEXTB]]
+;
+; NONSTRESS-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; NONSTRESS-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; NONSTRESS-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = sext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[RES64:%[a-zA-Z_0-9-]+]] = shl i64 [[IDX64]], 12
+; OPTALL-NEXT: ret i64 %staddr
+define i64 @doNotPromoteFreeSExtFromShift(i8* %p, i32 %b) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nsw i32 %zextt, %b
+ %idx64 = sext i32 %add to i64
+ %staddr = shl i64 %idx64, 12
+ ret i64 %staddr
+}
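+
+; The sext is free here because a sign extension followed by a left shift
+; folds into a single bitfield insert (a sketch, register names
+; hypothetical):
+;   sbfiz x0, x8, #12, #32    ; == shl i64 (sext i32 w8 to i64), 12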
+
+; Same comment as doNotPromoteFreeZExtFromAddrMode.
+; OPTALL-LABEL: @doNotPromoteFreeZExtFromShift
+; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %p
+;
+; This transformation should really happen only in stress mode.
+; OPT-NEXT: [[ZEXT64:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[ZEXTB:%[a-zA-Z_0-9-]+]] = zext i32 %b to i64
+; OPT-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = add nuw i64 [[ZEXT64]], [[ZEXTB]]
+;
+; DISABLE-NEXT: [[ZEXT32:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i32
+; DISABLE-NEXT: [[RES32:%[a-zA-Z_0-9-]+]] = add nuw i32 [[ZEXT32]], %b
+; DISABLE-NEXT: [[IDX64:%[a-zA-Z_0-9-]+]] = zext i32 [[RES32]] to i64
+;
+; OPTALL-NEXT: [[RES64:%[a-zA-Z_0-9-]+]] = shl i64 [[IDX64]], 12
+; OPTALL-NEXT: ret i64 %staddr
+define i64 @doNotPromoteFreeZExtFromShift(i8* %p, i32 %b) {
+entry:
+ %t = load i8, i8* %p
+ %zextt = zext i8 %t to i32
+ %add = add nuw i32 %zextt, %b
+ %idx64 = zext i32 %add to i64
+ %staddr = shl i64 %idx64, 12
+ ret i64 %staddr
+}
+
+; The input has one free zext and one non-free sext.
+; When we promote all the way through to the load, we end up with
+; a free zext, a free sext (%ld1), and a non-free sext (of %cst).
+; However, we then generate a load pair and the free sext (%ld1) becomes
+; non-free. So technically, we would trade one non-free sext for two
+; non-free sexts.
+; This would need to be fixed at some point.
+; OPTALL-LABEL: @doNotPromoteBecauseOfPairedLoad
+; OPTALL: [[LD0:%[a-zA-Z_0-9-]+]] = load i32, i32* %p
+; OPTALL: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i32, i32* %p, i64 1
+; OPTALL: [[LD1:%[a-zA-Z_0-9-]+]] = load i32, i32* [[GEP]]
+;
+; This transformation should really happen only in stress mode.
+; OPT-NEXT: [[SEXTLD1:%[a-zA-Z_0-9-]+]] = sext i32 [[LD1]] to i64
+; OPT-NEXT: [[SEXTCST:%[a-zA-Z_0-9-]+]] = sext i32 %cst to i64
+; OPT-NEXT: [[SEXTRES:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTLD1]], [[SEXTCST]]
+;
+; DISABLE-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i32 [[LD1]], %cst
+; DISABLE-NEXT: [[SEXTRES:%[a-zA-Z_0-9-]+]] = sext i32 [[RES]] to i64
+;
+; OPTALL-NEXT: [[ZEXTLD0:%[a-zA-Z_0-9-]+]] = zext i32 [[LD0]] to i64
+; OPTALL-NEXT: [[FINAL:%[a-zA-Z_0-9-]+]] = add i64 [[SEXTRES]], [[ZEXTLD0]]
+; OPTALL-NEXT: ret i64 [[FINAL]]
+define i64 @doNotPromoteBecauseOfPairedLoad(i32* %p, i32 %cst) {
+ %ld0 = load i32, i32* %p
+ %idxLd1 = getelementptr inbounds i32, i32* %p, i64 1
+ %ld1 = load i32, i32* %idxLd1
+ %res = add nsw i32 %ld1, %cst
+ %sextres = sext i32 %res to i64
+ %zextLd0 = zext i32 %ld0 to i64
+ %final = add i64 %sextres, %zextLd0
+ ret i64 %final
+}
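+
+; Why the paired load hurts: once %ld0 and %ld1 merge into a single ldp, the
+; sign extension of %ld1 can no longer be folded into its load (a standalone
+; load could have used ldrsw), so it costs a separate instruction. A sketch,
+; register names hypothetical:
+;   ldp  w8, w9, [x0]    ; %ld0 and %ld1 loaded together
+;   sxtw x9, w9          ; the formerly free sext of %ld1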
diff --git a/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/test/CodeGen/AArch64/arm64-convert-v4f64.ll
index c6b7d83..c4e3e4e 100644
--- a/test/CodeGen/AArch64/arm64-convert-v4f64.ll
+++ b/test/CodeGen/AArch64/arm64-convert-v4f64.ll
@@ -3,11 +3,11 @@
define <4 x i16> @fptosi_v4f64_to_v4i16(<4 x double>* %ptr) {
; CHECK: fptosi_v4f64_to_v4i16
-; CHECK-DAG: fcvtzs v[[LHS:[0-9]+]].2d, v1.2d
-; CHECK-DAG: fcvtzs v[[RHS:[0-9]+]].2d, v0.2d
-; CHECK-DAG: xtn v[[LHS_NA:[0-9]+]].2s, v[[LHS]].2d
-; CHECK-DAG: xtn v[[RHS_NA:[0-9]+]].2s, v[[RHS]].2d
-; CHECK: uzp1 v0.4h, v[[RHS_NA]].4h, v[[LHS_NA]].4h
+; CHECK-DAG: fcvtzs v[[LHS:[0-9]+]].2d, v0.2d
+; CHECK-DAG: fcvtzs v[[RHS:[0-9]+]].2d, v1.2d
+; CHECK-DAG: xtn v[[MID:[0-9]+]].2s, v[[LHS]].2d
+; CHECK-DAG: xtn2 v[[MID]].4s, v[[RHS]].2d
+; CHECK: xtn v0.4h, v[[MID]].4s
%tmp1 = load <4 x double>, <4 x double>* %ptr
%tmp2 = fptosi <4 x double> %tmp1 to <4 x i16>
ret <4 x i16> %tmp2
@@ -15,17 +15,17 @@ define <4 x i16> @fptosi_v4f64_to_v4i16(<4 x double>* %ptr) {
define <8 x i8> @fptosi_v4f64_to_v4i8(<8 x double>* %ptr) {
; CHECK: fptosi_v4f64_to_v4i8
-; CHECK-DAG: fcvtzs v[[CONV3:[0-9]+]].2d, v3.2d
-; CHECK-DAG: fcvtzs v[[CONV2:[0-9]+]].2d, v2.2d
-; CHECK-DAG: fcvtzs v[[CONV1:[0-9]+]].2d, v1.2d
; CHECK-DAG: fcvtzs v[[CONV0:[0-9]+]].2d, v0.2d
-; CHECK-DAG: xtn v[[NA3:[0-9]+]].2s, v[[CONV3]].2d
+; CHECK-DAG: fcvtzs v[[CONV1:[0-9]+]].2d, v1.2d
+; CHECK-DAG: fcvtzs v[[CONV2:[0-9]+]].2d, v2.2d
+; CHECK-DAG: fcvtzs v[[CONV3:[0-9]+]].2d, v3.2d
; CHECK-DAG: xtn v[[NA2:[0-9]+]].2s, v[[CONV2]].2d
-; CHECK-DAG: xtn v[[NA1:[0-9]+]].2s, v[[CONV1]].2d
+; CHECK-DAG: xtn2 v[[NA2]].4s, v[[CONV3]].2d
; CHECK-DAG: xtn v[[NA0:[0-9]+]].2s, v[[CONV0]].2d
-; CHECK-DAG: uzp1 v[[TMP1:[0-9]+]].4h, v[[CONV2]].4h, v[[CONV3]].4h
-; CHECK-DAG: uzp1 v[[TMP2:[0-9]+]].4h, v[[CONV0]].4h, v[[CONV1]].4h
-; CHECK: uzp1 v0.8b, v[[TMP2]].8b, v[[TMP1]].8b
+; CHECK-DAG: xtn2 v[[NA0]].4s, v[[CONV1]].2d
+; CHECK-DAG: xtn v[[TMP1:[0-9]+]].4h, v[[NA0]].4s
+; CHECK-DAG: xtn2 v[[TMP1]].8h, v[[NA2]].4s
+; CHECK: xtn v0.8b, v[[TMP1]].8h
%tmp1 = load <8 x double>, <8 x double>* %ptr
%tmp2 = fptosi <8 x double> %tmp1 to <8 x i8>
ret <8 x i8> %tmp2
diff --git a/test/CodeGen/AArch64/arm64-dup.ll b/test/CodeGen/AArch64/arm64-dup.ll
index 849e227..c6b7de3 100644
--- a/test/CodeGen/AArch64/arm64-dup.ll
+++ b/test/CodeGen/AArch64/arm64-dup.ll
@@ -321,3 +321,40 @@ entry:
%sub = sub <4 x i16> %a, %mul
ret <4 x i16> %sub
}
+
+; Also test the DUP path in the PerfectShuffle generator.
+
+; CHECK-LABEL: test_perfectshuffle_dupext_v4i16:
+; CHECK-NEXT: dup.4h v0, v0[0]
+; CHECK-NEXT: ext.8b v0, v0, v1, #4
+define <4 x i16> @test_perfectshuffle_dupext_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
+ %r = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
+ ret <4 x i16> %r
+}
+
+; CHECK-LABEL: test_perfectshuffle_dupext_v4f16:
+; CHECK-NEXT: dup.4h v0, v0[0]
+; CHECK-NEXT: ext.8b v0, v0, v1, #4
+; CHECK-NEXT: ret
+define <4 x half> @test_perfectshuffle_dupext_v4f16(<4 x half> %a, <4 x half> %b) nounwind {
+ %r = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
+ ret <4 x half> %r
+}
+
+; CHECK-LABEL: test_perfectshuffle_dupext_v4i32:
+; CHECK-NEXT: dup.4s v0, v0[0]
+; CHECK-NEXT: ext.16b v0, v0, v1, #8
+; CHECK-NEXT: ret
+define <4 x i32> @test_perfectshuffle_dupext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
+ %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
+ ret <4 x i32> %r
+}
+
+; CHECK-LABEL: test_perfectshuffle_dupext_v4f32:
+; CHECK-NEXT: dup.4s v0, v0[0]
+; CHECK-NEXT: ext.16b v0, v0, v1, #8
+; CHECK-NEXT: ret
+define <4 x float> @test_perfectshuffle_dupext_v4f32(<4 x float> %a, <4 x float> %b) nounwind {
+ %r = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
+ ret <4 x float> %r
+}
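+
+; Note on the expected lowering: the mask <0, 0, 4, 5> duplicates lane 0 of
+; %a and then appends the low half of %b, which is exactly a dup (splat of
+; lane 0) followed by an ext (concatenate and extract), matching the checks
+; above.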
diff --git a/test/CodeGen/AArch64/arm64-fcopysign.ll b/test/CodeGen/AArch64/arm64-fcopysign.ll
index 66241df..feffd41 100644
--- a/test/CodeGen/AArch64/arm64-fcopysign.ll
+++ b/test/CodeGen/AArch64/arm64-fcopysign.ll
@@ -39,7 +39,7 @@ entry:
; CHECK: fcvt s0, d0
; CHECK: movi.4s v[[CONST:[0-9]+]], #0x80, lsl #24
; CHECK: bit.16b v{{[0-9]+}}, v0, v[[CONST]]
- %0 = tail call double (...)* @bar() nounwind
+ %0 = tail call double (...) @bar() nounwind
%1 = fptrunc double %0 to float
%2 = tail call float @copysignf(float 5.000000e-01, float %1) nounwind readnone
%3 = fadd float %1, %2
diff --git a/test/CodeGen/AArch64/arm64-join-reserved.ll b/test/CodeGen/AArch64/arm64-join-reserved.ll
index e99168b..dee0344 100644
--- a/test/CodeGen/AArch64/arm64-join-reserved.ll
+++ b/test/CodeGen/AArch64/arm64-join-reserved.ll
@@ -10,7 +10,7 @@ target triple = "arm64-apple-macosx10"
; CHECK: ret
define void @g() nounwind ssp {
entry:
- tail call void (i32, ...)* @f(i32 0, i32 0) nounwind
+ tail call void (i32, ...) @f(i32 0, i32 0) nounwind
ret void
}
diff --git a/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll b/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll
new file mode 100644
index 0000000..5bc4d71
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll
@@ -0,0 +1,14 @@
+; RUN: llc -mtriple=arm64-apple-ios -aarch64-strict-align < %s | FileCheck %s
+
+; Small (16 bytes here) unaligned memcpys should stay memcpy calls if
+; strict-alignment is turned on.
+define void @t0(i8* %out, i8* %in) {
+; CHECK-LABEL: t0:
+; CHECK: orr w2, wzr, #0x10
+; CHECK-NEXT: bl _memcpy
+entry:
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out, i8* %in, i64 16, i32 1, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
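+
+; Without -aarch64-strict-align the same call would be expanded inline into
+; wide loads and stores, which strict alignment forbids here since the
+; pointers are only known to be 1-byte aligned; hence the libcall survives.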
diff --git a/test/CodeGen/AArch64/arm64-neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll
index 4a92c3d..b74a406 100644
--- a/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1086,7 +1086,7 @@ define <2 x i32> @test_concat_diff_v1i32_v1i32(i32 %a, i32 %b) {
; CHECK-LABEL: test_concat_diff_v1i32_v1i32:
; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
-; CHECK-NEXT: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: ins {{v[0-9]+}}.s[1], w{{[0-9]+}}
entry:
%c = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a)
%d = insertelement <2 x i32> undef, i32 %c, i32 0
diff --git a/test/CodeGen/AArch64/arm64-neon-v8.1a.ll b/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
new file mode 100644
index 0000000..51ed8a1
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
@@ -0,0 +1,456 @@
+; RUN: llc < %s -verify-machineinstrs -march=arm64 -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V8a
+; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=generic | FileCheck %s --check-prefix=CHECK-V81a
+; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=apple | FileCheck %s --check-prefix=CHECK-V81a-apple
+
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
+declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32)
+declare i16 @llvm.aarch64.neon.sqrdmulh.i16(i16, i16)
+
+declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
+declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
+declare i16 @llvm.aarch64.neon.sqadd.i16(i16, i16)
+
+declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
+declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
+declare i16 @llvm.aarch64.neon.sqsub.i16(i16, i16)
+
+;-----------------------------------------------------------------------------
+; RDMA Vector
+; test for SIMDThreeSameVectorSQRDMLxHTiedHS
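+;
+; The shape being matched throughout this file is a sqrdmulh intrinsic whose
+; result feeds a saturating add or sub, e.g. (from the first test below):
+;   %prod   = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
+;   %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
+; With +v8.1a the pair should fuse into a single sqrdmlah (sqrdmlsh for the
+; sqsub case); without it, the two instructions stay separate.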
+
+define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
+; CHECK-LABEL: test_sqrdmlah_v4i16:
+ %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
+ %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
+; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h
+; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.4h
+; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2
+ ret <4 x i16> %retval
+}
+
+define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
+; CHECK-LABEL: test_sqrdmlah_v8i16:
+ %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
+ %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h
+; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.8h
+; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2
+ ret <8 x i16> %retval
+}
+
+define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlah_v2i32:
+ %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
+ %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
+; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s
+; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.2s
+; CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2
+ ret <2 x i32> %retval
+}
+
+define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlah_v4i32:
+ %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
+ %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
+; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.4s
+; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.4s
+; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2
+ ret <4 x i32> %retval
+}
+
+define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_v4i16:
+ %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs)
+ %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
+; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h
+; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.4h
+; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2
+ ret <4 x i16> %retval
+}
+
+define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_v8i16:
+ %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
+ %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h
+; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.8h
+; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2
+ ret <8 x i16> %retval
+}
+
+define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_v2i32:
+ %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
+ %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
+; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s
+; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.2s
+; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2
+ ret <2 x i32> %retval
+}
+
+define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_v4i32:
+ %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
+ %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
+; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.4s
+; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.4s
+; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2
+ ret <4 x i32> %retval
+}
+
+;-----------------------------------------------------------------------------
+; RDMA Vector, by element
+; tests for vXiYY_indexed in SIMDIndexedSQRDMLxHSDTied
+
+define <4 x i16> @test_sqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlah_lane_s16:
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
+ %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
+; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.h[3]
+; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.h[3]
+; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2[3]
+ ret <4 x i16> %retval
+}
+
+define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlahq_lane_s16:
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
+ %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2]
+; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.h[2]
+; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2[2]
+ ret <8 x i16> %retval
+}
+
+define <2 x i32> @test_sqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlah_lane_s32:
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
+ %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
+; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1]
+; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.s[1]
+; CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2[1]
+ ret <2 x i32> %retval
+}
+
+define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlahq_lane_s32:
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+ %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
+ %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
+; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0]
+; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.s[0]
+; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2[0]
+ ret <4 x i32> %retval
+}
+
+define <4 x i16> @test_sqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlsh_lane_s16:
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+ %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
+ %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
+; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.h[3]
+; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.h[3]
+; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2[3]
+ ret <4 x i16> %retval
+}
+
+define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlshq_lane_s16:
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
+ %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2]
+; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.h[2]
+; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2[2]
+ ret <8 x i16> %retval
+}
+
+define <2 x i32> @test_sqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlsh_lane_s32:
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+ %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
+ %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
+; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1]
+; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.s[1]
+; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2[1]
+ ret <2 x i32> %retval
+}
+
+define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlshq_lane_s32:
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+ %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
+ %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
+; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0]
+; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.s[0]
+; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2[0]
+ ret <4 x i32> %retval
+}
+
+;-----------------------------------------------------------------------------
+; RDMA Vector, by element, extracted
+; i16 tests are for vXi16_indexed in SIMDIndexedSQRDMLxHSDTied, with IR in ACLE style
+; i32 tests are for "def : Pat" in SIMDIndexedSQRDMLxHSDTied
+
+define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlah_extracted_lane_s16:
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
+ %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
+ %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
+ %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
+ %retval = extractelement <4 x i16> %retval_vec, i64 0
+; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, v0.4h, v1.h[1]
+; CHECK-V81a: sqrdmlah {{v[2-9]+}}.4h, v0.4h, v1.h[1]
+; CHECK-V81a-apple: sqrdmlah.4h {{v[2-9]+}}, v0, v1[1]
+ ret i16 %retval
+}
+
+define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s16:
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
+ %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
+ %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
+ %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
+ %retval = extractelement <8 x i16> %retval_vec, i64 0
+; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, v0.8h, v1.h[1]
+; CHECK-V81a: sqrdmlah {{v[2-9]+}}.8h, v0.8h, v1.h[1]
+; CHECK-V81a-apple: sqrdmlah.8h {{v[2-9]+}}, v0, v1[1]
+ ret i16 %retval
+}
+
+define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32:
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
+ %extract = extractelement <2 x i32> %prod, i64 0
+ %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
+; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0]
+; CHECK-V81a: sqrdmlah v2.2s, v0.2s, v1.s[0]
+; CHECK-V81a-apple: sqrdmlah.2s v2, v0, v1[0]
+ ret i32 %retval
+}
+
+define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32:
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+ %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
+ %extract = extractelement <4 x i32> %prod, i64 0
+ %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
+; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0]
+; CHECK-V81a: sqrdmlah v2.4s, v0.4s, v1.s[0]
+; CHECK-V81a-apple: sqrdmlah.4s v2, v0, v1[0]
+ ret i32 %retval
+}
+
+define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s16:
+entry:
+ %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
+ %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
+ %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
+ %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
+ %retval = extractelement <4 x i16> %retval_vec, i64 0
+; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, v0.4h, v1.h[1]
+; CHECK-V81a: sqrdmlsh {{v[2-9]+}}.4h, v0.4h, v1.h[1]
+; CHECK-V81a-apple: sqrdmlsh.4h {{v[2-9]+}}, v0, v1[1]
+ ret i16 %retval
+}
+
+define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s16:
+entry:
+ %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1, i32 1,i32 1,i32 1,i32 1>
+ %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
+ %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
+ %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
+ %retval = extractelement <8 x i16> %retval_vec, i64 0
+; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, v0.8h, v1.h[1]
+; CHECK-V81a: sqrdmlsh {{v[2-9]+}}.8h, v0.8h, v1.h[1]
+; CHECK-V81a-apple: sqrdmlsh.8h {{v[2-9]+}}, v0, v1[1]
+ ret i16 %retval
+}
+
+define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32:
+entry:
+ %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+ %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
+ %extract = extractelement <2 x i32> %prod, i64 0
+ %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
+; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0]
+; CHECK-V81a: sqrdmlsh v2.2s, v0.2s, v1.s[0]
+; CHECK-V81a-apple: sqrdmlsh.2s v2, v0, v1[0]
+ ret i32 %retval
+}
+
+define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32:
+entry:
+ %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+ %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
+ %extract = extractelement <4 x i32> %prod, i64 0
+ %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
+; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0]
+; CHECK-V81a: sqrdmlsh v2.4s, v0.4s, v1.s[0]
+; CHECK-V81a-apple: sqrdmlsh.4s v2, v0, v1[0]
+ ret i32 %retval
+}
+
+;-----------------------------------------------------------------------------
+; RDMA Scalar
+; test for "def : Pat" near SIMDThreeScalarHSTied in AArch64InstInfo.td
+
+define i16 @test_sqrdmlah_v1i16(i16 %acc, i16 %x, i16 %y) {
+; CHECK-LABEL: test_sqrdmlah_v1i16:
+ %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
+ %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
+ %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %y_vec)
+ %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
+ %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec)
+ %retval = extractelement <4 x i16> %retval_vec, i64 0
+; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+ ret i16 %retval
+}
+
+define i32 @test_sqrdmlah_v1i32(i32 %acc, i32 %x, i32 %y) {
+; CHECK-LABEL: test_sqrdmlah_v1i32:
+ %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
+ %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
+ %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec, <4 x i32> %y_vec)
+ %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
+ %retval_vec = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec)
+ %retval = extractelement <4 x i32> %retval_vec, i64 0
+; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-V81a-apple: sqrdmlah.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+ ret i32 %retval
+}
+
+
+define i16 @test_sqrdmlsh_v1i16(i16 %acc, i16 %x, i16 %y) {
+; CHECK-LABEL: test_sqrdmlsh_v1i16:
+ %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
+ %y_vec = insertelement <4 x i16> undef, i16 %y, i64 0
+ %prod_vec = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %y_vec)
+ %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
+ %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod_vec)
+ %retval = extractelement <4 x i16> %retval_vec, i64 0
+; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK-V81a-apple: sqrdmlsh.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+ ret i16 %retval
+}
+
+define i32 @test_sqrdmlsh_v1i32(i32 %acc, i32 %x, i32 %y) {
+; CHECK-LABEL: test_sqrdmlsh_v1i32:
+ %x_vec = insertelement <4 x i32> undef, i32 %x, i64 0
+ %y_vec = insertelement <4 x i32> undef, i32 %y, i64 0
+ %prod_vec = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x_vec, <4 x i32> %y_vec)
+ %acc_vec = insertelement <4 x i32> undef, i32 %acc, i64 0
+ %retval_vec = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc_vec, <4 x i32> %prod_vec)
+ %retval = extractelement <4 x i32> %retval_vec, i64 0
+; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-V81a-apple: sqrdmlsh.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
+ ret i32 %retval
+}
+
+define i32 @test_sqrdmlah_i32(i32 %acc, i32 %mhs, i32 %rhs) {
+; CHECK-LABEL: test_sqrdmlah_i32:
+ %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
+ %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod)
+; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-V81a-apple: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+ ret i32 %retval
+}
+
+define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_i32:
+ %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs)
+ %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod)
+; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-V81a-apple: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+ ret i32 %retval
+}
+
+;-----------------------------------------------------------------------------
+; RDMA Scalar, by element
+; i16 tests are covered by the tests in the section above, with IR written in ACLE style
+; i32 tests exercise i32_indexed in SIMDIndexedSQRDMLxHSDTied
+
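+; In terms of the sketch above, the by-element forms take the multiplier from
+; one lane of a vector operand (hypothetical helper):
+;
+;   int16_t sqrdmlah_lane_s16_ref(int16_t acc, int16_t x,
+;                                 const int16_t v[4], int lane) {
+;     return sqrdmlah_s16_ref(acc, x, v[lane]);
+;   }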
+define i16 @test_sqrdmlah_extract_i16(i16 %acc, i16 %x, <4 x i16> %y_vec) {
+; CHECK-LABEL: test_sqrdmlah_extract_i16:
+ %shuffle = shufflevector <4 x i16> %y_vec, <4 x i16> undef, <4 x i32> <i32 1,i32 1,i32 1,i32 1>
+ %x_vec = insertelement <4 x i16> undef, i16 %x, i64 0
+ %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x_vec, <4 x i16> %shuffle)
+ %acc_vec = insertelement <4 x i16> undef, i16 %acc, i64 0
+ %retval_vec = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc_vec, <4 x i16> %prod)
+ %retval = extractelement <4 x i16> %retval_vec, i32 0
+; CHECK-V8a: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
+; CHECK-V81a: sqrdmlah {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, v0.h[1]
+; CHECK-V81a-apple: sqrdmlah.4h {{v[0-9]+}}, {{v[0-9]+}}, v0[1]
+ ret i16 %retval
+}
+
+define i32 @test_sqrdmlah_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlah_extract_i32:
+ %extract = extractelement <4 x i32> %rhs, i32 3
+ %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract)
+ %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod)
+; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
+; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
+; CHECK-V81a-apple: sqrdmlah.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
+ ret i32 %retval
+}
+
+define i16 @test_sqrdmlshq_extract_i16(i16 %acc, i16 %x, <8 x i16> %y_vec) {
+; CHECK-LABEL: test_sqrdmlshq_extract_i16:
+ %shuffle = shufflevector <8 x i16> %y_vec, <8 x i16> undef, <8 x i32> <i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1,i32 1>
+ %x_vec = insertelement <8 x i16> undef, i16 %x, i64 0
+ %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x_vec, <8 x i16> %shuffle)
+ %acc_vec = insertelement <8 x i16> undef, i16 %acc, i64 0
+ %retval_vec = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc_vec, <8 x i16> %prod)
+ %retval = extractelement <8 x i16> %retval_vec, i32 0
+; CHECK-V8a: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
+; CHECK-V81a: sqrdmlsh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, v0.h[1]
+; CHECK-V81a-apple: sqrdmlsh.8h {{v[0-9]+}}, {{v[0-9]+}}, v0[1]
+ ret i16 %retval
+}
+
+define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_extract_i32:
+ %extract = extractelement <4 x i32> %rhs, i32 3
+ %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract)
+ %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod)
+; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
+; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
+; CHECK-V81a-apple: sqrdmlsh.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
+ ret i32 %retval
+}
diff --git a/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll b/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll
index 5a740d8..2651f11 100644
--- a/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll
+++ b/test/CodeGen/AArch64/arm64-patchpoint-scratch-regs.ll
@@ -9,7 +9,7 @@
define void @clobberScratch(i32* %p) {
%v = load i32, i32* %p
tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind
- tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 5, i32 20, i8* null, i32 0, i32* %p, i32 %v)
+ tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 5, i32 20, i8* null, i32 0, i32* %p, i32 %v)
store i32 %v, i32* %p
ret void
}
diff --git a/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
index 8f79f80..b8236c5 100644
--- a/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
+++ b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
@@ -23,9 +23,9 @@ entry:
; FAST-NEXT: movk x16, #0xbeef
; FAST-NEXT: blr x16
%resolveCall2 = inttoptr i64 281474417671919 to i8*
- %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
+ %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
%resolveCall3 = inttoptr i64 244837814038255 to i8*
- tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
+ tail call webkit_jscc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
ret void
}
@@ -59,7 +59,7 @@ entry:
; FAST-NEXT: movk x16, #0xbeef
; FAST-NEXT: blr x16
%call = inttoptr i64 281474417671919 to i8*
- %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
+ %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
ret i64 %result
}
@@ -101,7 +101,7 @@ entry:
; FAST-NEXT: movk x16, #0xbeef
; FAST-NEXT: blr x16
%call = inttoptr i64 281474417671919 to i8*
- %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
+ %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 7, i32 20, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
ret i64 %result
}
diff --git a/test/CodeGen/AArch64/arm64-patchpoint.ll b/test/CodeGen/AArch64/arm64-patchpoint.ll
index cf06653..d9ec7e5 100644
--- a/test/CodeGen/AArch64/arm64-patchpoint.ll
+++ b/test/CodeGen/AArch64/arm64-patchpoint.ll
@@ -16,9 +16,9 @@ entry:
; CHECK-NEXT: blr x16
; CHECK: ret
%resolveCall2 = inttoptr i64 244837814094590 to i8*
- %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+ %result = tail call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
%resolveCall3 = inttoptr i64 244837814094591 to i8*
- tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 3, i32 20, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
+ tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 3, i32 20, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
ret i64 %result
}
@@ -38,7 +38,7 @@ entry:
store i64 11, i64* %metadata
store i64 12, i64* %metadata
store i64 13, i64* %metadata
- call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
+ call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
ret void
}
@@ -51,14 +51,14 @@ entry:
%tmp80 = add i64 %tmp79, -16
%tmp81 = inttoptr i64 %tmp80 to i64*
%tmp82 = load i64, i64* %tmp81, align 8
- tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 14, i32 8, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
- tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 15, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
+ tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 14, i32 8, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
+ tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 15, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
%tmp83 = load i64, i64* %tmp33, align 8
%tmp84 = add i64 %tmp83, -24
%tmp85 = inttoptr i64 %tmp84 to i64*
%tmp86 = load i64, i64* %tmp85, align 8
- tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 17, i32 8, i64 %arg, i64 %tmp10, i64 %tmp86)
- tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 18, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
+ tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 17, i32 8, i64 %arg, i64 %tmp10, i64 %tmp86)
+ tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 18, i32 32, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
ret i64 10
}
@@ -74,7 +74,7 @@ entry:
; CHECK-NEXT: nop
; CHECK-NEXT: ldp
; CHECK-NEXT: ret
- %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* null, i32 2, i64 %p1, i64 %p2)
+ %result = tail call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* null, i32 2, i64 %p1, i64 %p2)
ret void
}
diff --git a/test/CodeGen/AArch64/arm64-stackmap-nops.ll b/test/CodeGen/AArch64/arm64-stackmap-nops.ll
index 5915b64..2647ac4 100644
--- a/test/CodeGen/AArch64/arm64-stackmap-nops.ll
+++ b/test/CodeGen/AArch64/arm64-stackmap-nops.ll
@@ -8,7 +8,7 @@ entry:
; CHECK: nop
; CHECK-NEXT: nop
; CHECK-NOT: nop
- tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 16)
+ tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 0, i32 16)
ret void
}
diff --git a/test/CodeGen/AArch64/arm64-stackmap.ll b/test/CodeGen/AArch64/arm64-stackmap.ll
index 29e4484..1a4df7a 100644
--- a/test/CodeGen/AArch64/arm64-stackmap.ll
+++ b/test/CodeGen/AArch64/arm64-stackmap.ll
@@ -78,7 +78,7 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
define void @constantargs() {
entry:
%0 = inttoptr i64 244837814094590 to i8*
- tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 1, i32 20, i8* %0, i32 0, i64 65535, i64 65536, i64 4294967295, i64 4294967296)
+ tail call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 1, i32 20, i8* %0, i32 0, i64 65535, i64 65536, i64 4294967295, i64 4294967296)
ret void
}
@@ -100,7 +100,7 @@ entry:
; Runtime void->void call.
call void inttoptr (i64 244837814094590 to void ()*)()
; Followed by inline OSR patchpoint with 12-byte shadow and 2 live vars.
- call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 3, i32 12, i64 %a, i64 %b)
+ call void (i64, i32, ...) @llvm.experimental.stackmap(i64 3, i32 12, i64 %a, i64 %b)
ret void
}
@@ -126,7 +126,7 @@ entry:
cold:
; OSR patchpoint with 12-byte nop-slide and 2 live vars.
%thunk = inttoptr i64 244837814094590 to i8*
- call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 4, i32 20, i8* %thunk, i32 0, i64 %a, i64 %b)
+ call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 4, i32 20, i8* %thunk, i32 0, i64 %a, i64 %b)
unreachable
ret:
ret void
@@ -142,7 +142,7 @@ ret:
define i64 @propertyRead(i64* %obj) {
entry:
%resolveRead = inttoptr i64 244837814094590 to i8*
- %result = call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveRead, i32 1, i64* %obj)
+ %result = call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %resolveRead, i32 1, i64* %obj)
%add = add i64 %result, 3
ret i64 %add
}
@@ -162,7 +162,7 @@ entry:
define void @propertyWrite(i64 %dummy1, i64* %obj, i64 %dummy2, i64 %a) {
entry:
%resolveWrite = inttoptr i64 244837814094590 to i8*
- call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
+ call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 6, i32 20, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
ret void
}
@@ -184,7 +184,7 @@ entry:
define void @jsVoidCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
entry:
%resolveCall = inttoptr i64 244837814094590 to i8*
- call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 7, i32 20, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+ call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 7, i32 20, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
ret void
}
@@ -206,7 +206,7 @@ entry:
define i64 @jsIntCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
entry:
%resolveCall = inttoptr i64 244837814094590 to i8*
- %result = call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 8, i32 20, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+ %result = call i64 (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.i64(i64 8, i32 20, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
%add = add i64 %result, 3
ret i64 %add
}
@@ -226,7 +226,7 @@ entry:
; CHECK-NEXT: .short 29
define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27) {
entry:
- call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 11, i32 20, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27)
+ call void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 20, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27)
ret void
}
@@ -245,7 +245,7 @@ entry:
; CHECK-NEXT: .short 29
define webkit_jscc void @spilledStackMapValue(i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27, i64 %l28, i64 %l29) {
entry:
- call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 12, i32 16, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27, i64 %l28, i64 %l29)
+ call void (i64, i32, ...) @llvm.experimental.stackmap(i64 12, i32 16, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16, i64 %l17, i64 %l18, i64 %l19, i64 %l20, i64 %l21, i64 %l22, i64 %l23, i64 %l24, i64 %l25, i64 %l26, i64 %l27, i64 %l28, i64 %l29)
ret void
}
@@ -263,7 +263,7 @@ entry:
; CHECK-NEXT: .long 33
define void @liveConstant() {
- tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 15, i32 8, i32 33)
+ tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 15, i32 8, i32 33)
ret void
}
@@ -280,7 +280,7 @@ define void @liveConstant() {
; CHECK-NEXT: .long -{{[0-9]+}}
define void @clobberLR(i32 %a) {
tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x31}"() nounwind
- tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 8, i32 %a)
+ tail call void (i64, i32, ...) @llvm.experimental.stackmap(i64 16, i32 8, i32 %a)
ret void
}
diff --git a/test/CodeGen/AArch64/arm64-vshuffle.ll b/test/CodeGen/AArch64/arm64-vshuffle.ll
index 75e0d80..15ea21b 100644
--- a/test/CodeGen/AArch64/arm64-vshuffle.ll
+++ b/test/CodeGen/AArch64/arm64-vshuffle.ll
@@ -1,22 +1,8 @@
; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -mcpu=cyclone | FileCheck %s
-; The mask:
-; CHECK: lCPI0_0:
-; CHECK: .byte 2 ; 0x2
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 6 ; 0x6
-; CHECK: .byte 255 ; 0xff
-; The second vector is legalized to undef and the elements of the first vector
-; are used instead.
-; CHECK: .byte 2 ; 0x2
-; CHECK: .byte 4 ; 0x4
-; CHECK: .byte 6 ; 0x6
-; CHECK: .byte 0 ; 0x0
; CHECK: test1
-; CHECK: ldr d[[REG0:[0-9]+]], [{{.*}}, lCPI0_0
-; CHECK: movi.8h v[[REG1:[0-9]+]], #0x1, lsl #8
-; CHECK: tbl.8b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]]
+; CHECK: movi d[[REG0:[0-9]+]], #0000000000000000
define <8 x i1> @test1() {
entry:
%Shuff = shufflevector <8 x i1> <i1 0, i1 1, i1 2, i1 3, i1 4, i1 5, i1 6,
@@ -30,18 +16,16 @@ entry:
; CHECK: lCPI1_0:
; CHECK: .byte 0 ; 0x0
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 2 ; 0x2
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 10 ; 0xa
-; CHECK: .byte 12 ; 0xc
-; CHECK: .byte 14 ; 0xe
-; CHECK: .byte 7 ; 0x7
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 1 ; 0x1
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
; CHECK: test2
-; CHECK: ldr d[[REG0:[0-9]+]], [{{.*}}, lCPI1_0@PAGEOFF]
-; CHECK: adrp x[[REG2:[0-9]+]], lCPI1_1@PAGE
-; CHECK: ldr q[[REG1:[0-9]+]], [x[[REG2]], lCPI1_1@PAGEOFF]
-; CHECK: tbl.8b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]]
+; CHECK: adrp x[[REG2:[0-9]+]], lCPI1_0@PAGE
+; CHECK: ldr d[[REG1:[0-9]+]], [x[[REG2]], lCPI1_0@PAGEOFF]
define <8 x i1>@test2() {
bb:
%Shuff = shufflevector <8 x i1> zeroinitializer,
@@ -51,28 +35,8 @@ bb:
ret <8 x i1> %Shuff
}
-; CHECK: lCPI2_0:
-; CHECK: .byte 2 ; 0x2
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 6 ; 0x6
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 10 ; 0xa
-; CHECK: .byte 12 ; 0xc
-; CHECK: .byte 14 ; 0xe
-; CHECK: .byte 0 ; 0x0
-; CHECK: .byte 2 ; 0x2
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 6 ; 0x6
-; CHECK: .byte 255 ; 0xff
-; CHECK: .byte 10 ; 0xa
-; CHECK: .byte 12 ; 0xc
-; CHECK: .byte 14 ; 0xe
-; CHECK: .byte 0 ; 0x0
; CHECK: test3
-; CHECK: adrp x[[REG3:[0-9]+]], lCPI2_0@PAGE
-; CHECK: ldr q[[REG0:[0-9]+]], [x[[REG3]], lCPI2_0@PAGEOFF]
-; CHECK: ldr q[[REG1:[0-9]+]], [x[[REG3]], lCPI2_1@PAGEOFF]
-; CHECK: tbl.16b v{{[0-9]+}}, { v[[REG1]] }, v[[REG0]]
+; CHECK: movi.4s v{{[0-9]+}}, #0x1
define <16 x i1> @test3(i1* %ptr, i32 %v) {
bb:
%Shuff = shufflevector <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0>, <16 x i1> undef,
@@ -81,29 +45,26 @@ bb:
i32 14, i32 0>
ret <16 x i1> %Shuff
}
-; CHECK: lCPI3_1:
+; CHECK: lCPI3_0:
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
; CHECK: .byte 0 ; 0x0
; CHECK: .byte 1 ; 0x1
-; CHECK: .byte 2 ; 0x2
-; CHECK: .byte 18 ; 0x12
-; CHECK: .byte 4 ; 0x4
-; CHECK: .byte 5 ; 0x5
-; CHECK: .byte 6 ; 0x6
-; CHECK: .byte 7 ; 0x7
-; CHECK: .byte 8 ; 0x8
-; CHECK: .byte 31 ; 0x1f
-; CHECK: .byte 10 ; 0xa
-; CHECK: .byte 30 ; 0x1e
-; CHECK: .byte 12 ; 0xc
-; CHECK: .byte 13 ; 0xd
-; CHECK: .byte 14 ; 0xe
-; CHECK: .byte 15 ; 0xf
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
+; CHECK: .byte 0 ; 0x0
; CHECK: _test4:
-; CHECK: ldr q[[REG1:[0-9]+]]
-; CHECK: movi.2d v[[REG0:[0-9]+]], #0000000000000000
-; CHECK: adrp x[[REG3:[0-9]+]], lCPI3_1@PAGE
-; CHECK: ldr q[[REG2:[0-9]+]], [x[[REG3]], lCPI3_1@PAGEOFF]
-; CHECK: tbl.16b v{{[0-9]+}}, { v[[REG0]], v[[REG1]] }, v[[REG2]]
+; CHECK: adrp x[[REG3:[0-9]+]], lCPI3_0@PAGE
+; CHECK: ldr q[[REG2:[0-9]+]], [x[[REG3]], lCPI3_0@PAGEOFF]
define <16 x i1> @test4(i1* %ptr, i32 %v) {
bb:
%Shuff = shufflevector <16 x i1> zeroinitializer,
diff --git a/test/CodeGen/AArch64/bitcast.ll b/test/CodeGen/AArch64/bitcast.ll
new file mode 100644
index 0000000..e88ea9e
--- /dev/null
+++ b/test/CodeGen/AArch64/bitcast.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s
+
+; PR23065: SCALAR_TO_VECTOR implies the top elements 1 to N-1 of the N-element vector are undefined.
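+; In both tests the final shuffle reads lane 1 of the <4 x i16> bitcast, i.e.
+; the high half of a 32-bit constant below 2^16, so the correct result is an
+; all-zero vector (the movi below); treating the bitcast as scalar_to_vector
+; would wrongly leave that lane undefined.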
+
+define <4 x i16> @foo1(<2 x i32> %a) {
+; CHECK-LABEL: foo1:
+; CHECK: movi d0, #0000000000000000
+; CHECK-NEXT: ret
+
+ %1 = shufflevector <2 x i32> <i32 58712, i32 undef>, <2 x i32> %a, <2 x i32> <i32 0, i32 2>
+; Can't optimize the following bitcast to scalar_to_vector.
+ %2 = bitcast <2 x i32> %1 to <4 x i16>
+ %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ ret <4 x i16> %3
+}
+
+define <4 x i16> @foo2(<2 x i32> %a) {
+; CHECK-LABEL: foo2:
+; CHECK: movi d0, #0000000000000000
+; CHECK-NEXT: ret
+
+ %1 = shufflevector <2 x i32> <i32 712, i32 undef>, <2 x i32> %a, <2 x i32> <i32 0, i32 2>
+; Can't optimize the following bitcast to scalar_to_vector.
+ %2 = bitcast <2 x i32> %1 to <4 x i16>
+ %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ ret <4 x i16> %3
+}
diff --git a/test/CodeGen/AArch64/br-to-eh-lpad.ll b/test/CodeGen/AArch64/br-to-eh-lpad.ll
index e948b87..f304ba4 100644
--- a/test/CodeGen/AArch64/br-to-eh-lpad.ll
+++ b/test/CodeGen/AArch64/br-to-eh-lpad.ll
@@ -30,12 +30,12 @@ invoke.cont7:
unreachable
if.end50.thread:
- tail call void (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 125)
- tail call void (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 128)
+ tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 125)
+ tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 128)
unreachable
invoke.cont33:
- tail call void (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 119)
+ tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 119)
unreachable
invoke.cont41:
@@ -51,7 +51,7 @@ lpad40:
br label %finally.catchall
finally.catchall:
- tail call void (i8*, ...)* @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 125)
+ tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([17 x i8], [17 x i8]* @.str1, i64 0, i64 0), i32 125)
unreachable
}
diff --git a/test/CodeGen/AArch64/concat_vector-scalar-combine.ll b/test/CodeGen/AArch64/concat_vector-scalar-combine.ll
new file mode 100644
index 0000000..1c64af6
--- /dev/null
+++ b/test/CodeGen/AArch64/concat_vector-scalar-combine.ll
@@ -0,0 +1,125 @@
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; Test the (concat_vectors (bitcast (scalar)), ..) pattern.
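+; The expectation is that the vectors are built with dup/ins directly from
+; the source registers, with no round-trip through memory.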
+
+define <8 x i8> @test_concat_scalar_v2i8_to_v8i8_dup(i32 %x) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalar_v2i8_to_v8i8_dup:
+; CHECK-NEXT: dup.4h v0, w0
+; CHECK-NEXT: ret
+ %t = trunc i32 %x to i16
+ %0 = bitcast i16 %t to <2 x i8>
+ %1 = shufflevector <2 x i8> %0, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
+ ret <8 x i8> %1
+}
+
+define <8 x i8> @test_concat_scalar_v4i8_to_v8i8_dup(i32 %x) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalar_v4i8_to_v8i8_dup:
+; CHECK-NEXT: dup.2s v0, w0
+; CHECK-NEXT: ret
+ %0 = bitcast i32 %x to <4 x i8>
+ %1 = shufflevector <4 x i8> %0, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x i8> %1
+}
+
+define <8 x i16> @test_concat_scalar_v2i16_to_v8i16_dup(i32 %x) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalar_v2i16_to_v8i16_dup:
+; CHECK-NEXT: dup.4s v0, w0
+; CHECK-NEXT: ret
+ %0 = bitcast i32 %x to <2 x i16>
+ %1 = shufflevector <2 x i16> %0, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 0, i32 1, i32 0, i32 1>
+ ret <8 x i16> %1
+}
+
+define <8 x i8> @test_concat_scalars_2x_v2i8_to_v8i8(i32 %x, i32 %y) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalars_2x_v2i8_to_v8i8:
+; CHECK-NEXT: ins.h v0[0], w0
+; CHECK-NEXT: ins.h v0[1], w1
+; CHECK-NEXT: ins.h v0[3], w1
+; CHECK-NEXT: ret
+ %tx = trunc i32 %x to i16
+ %ty = trunc i32 %y to i16
+ %bx = bitcast i16 %tx to <2 x i8>
+ %by = bitcast i16 %ty to <2 x i8>
+ %r = shufflevector <2 x i8> %bx, <2 x i8> %by, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 2, i32 3>
+ ret <8 x i8> %r
+}
+
+define <8 x i8> @test_concat_scalars_2x_v4i8_to_v8i8_dup(i32 %x, i32 %y) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalars_2x_v4i8_to_v8i8_dup:
+; CHECK-NEXT: fmov s0, w1
+; CHECK-NEXT: ins.s v0[1], w0
+; CHECK-NEXT: ret
+ %bx = bitcast i32 %x to <4 x i8>
+ %by = bitcast i32 %y to <4 x i8>
+ %r = shufflevector <4 x i8> %bx, <4 x i8> %by, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x i8> %r
+}
+
+define <8 x i16> @test_concat_scalars_2x_v2i16_to_v8i16_dup(i32 %x, i32 %y) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalars_2x_v2i16_to_v8i16_dup:
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ins.s v0[1], w1
+; CHECK-NEXT: ins.s v0[2], w1
+; CHECK-NEXT: ins.s v0[3], w0
+; CHECK-NEXT: ret
+ %bx = bitcast i32 %x to <2 x i16>
+ %by = bitcast i32 %y to <2 x i16>
+ %r = shufflevector <2 x i16> %bx, <2 x i16> %by, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1>
+ ret <8 x i16> %r
+}
+
+; Also make sure we minimize bitcasts.
+
+; This is a pretty artificial testcase: make sure we bitcast to floating-point
+; if any of the scalars is floating-point.
+define <8 x i8> @test_concat_scalars_mixed_2x_v2i8_to_v8i8(float %dummy, i32 %x, half %y) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalars_mixed_2x_v2i8_to_v8i8:
+; CHECK-NEXT: fmov s[[X:[0-9]+]], w0
+; CHECK-NEXT: ins.h v0[0], v[[X]][0]
+; CHECK-NEXT: ins.h v0[1], v1[0]
+; CHECK-NEXT: ins.h v0[2], v[[X]][0]
+; CHECK-NEXT: ins.h v0[3], v1[0]
+; CHECK-NEXT: ret
+ %t = trunc i32 %x to i16
+ %0 = bitcast i16 %t to <2 x i8>
+ %y0 = bitcast half %y to <2 x i8>
+ %1 = shufflevector <2 x i8> %0, <2 x i8> %y0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x i8> %1
+}
+
+define <2 x float> @test_concat_scalars_fp_2x_v2i8_to_v8i8(float %dummy, half %x, half %y) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalars_fp_2x_v2i8_to_v8i8:
+; CHECK-NEXT: ins.h v0[0], v1[0]
+; CHECK-NEXT: ins.h v0[1], v2[0]
+; CHECK-NEXT: ins.h v0[2], v1[0]
+; CHECK-NEXT: ins.h v0[3], v2[0]
+; CHECK-NEXT: ret
+ %0 = bitcast half %x to <2 x i8>
+ %y0 = bitcast half %y to <2 x i8>
+ %1 = shufflevector <2 x i8> %0, <2 x i8> %y0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ %2 = bitcast <8 x i8> %1 to <2 x float>
+ ret <2 x float> %2
+}
+
+define <4 x float> @test_concat_scalar_fp_v2i16_to_v16i8_dup(float %x) #0 {
+entry:
+; CHECK-LABEL: test_concat_scalar_fp_v2i16_to_v16i8_dup:
+; CHECK-NEXT: dup.4s v0, v0[0]
+; CHECK-NEXT: ret
+ %0 = bitcast float %x to <2 x i16>
+ %1 = shufflevector <2 x i16> %0, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 0, i32 1, i32 0, i32 1>
+ %2 = bitcast <8 x i16> %1 to <4 x float>
+ ret <4 x float> %2
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/concat_vector-truncate-combine.ll b/test/CodeGen/AArch64/concat_vector-truncate-combine.ll
index c510e27..ee52786 100644
--- a/test/CodeGen/AArch64/concat_vector-truncate-combine.ll
+++ b/test/CodeGen/AArch64/concat_vector-truncate-combine.ll
@@ -2,6 +2,8 @@
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+; Test the (concat_vectors (trunc), (trunc)) pattern.
+
define <4 x i16> @test_concat_truncate_v2i64_to_v4i16(<2 x i64> %a, <2 x i64> %b) #0 {
entry:
; CHECK-LABEL: test_concat_truncate_v2i64_to_v4i16:
diff --git a/test/CodeGen/AArch64/concat_vector-truncated-scalar-combine.ll b/test/CodeGen/AArch64/concat_vector-truncated-scalar-combine.ll
new file mode 100644
index 0000000..eb6c80d
--- /dev/null
+++ b/test/CodeGen/AArch64/concat_vector-truncated-scalar-combine.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -asm-verbose=false | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; Test the (concat_vectors (bitcast (trunc (scalar))), undef..) pattern.
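+; Only the two low byte lanes of the result are defined, so moving the
+; scalar into lane 0 with a single fmov is sufficient.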
+
+define <8 x i8> @test_concat_from_truncated_scalar(i32 %x) #0 {
+entry:
+; CHECK-LABEL: test_concat_from_truncated_scalar:
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ret
+ %t = trunc i32 %x to i16
+ %0 = bitcast i16 %t to <2 x i8>
+ %1 = shufflevector <2 x i8> %0, <2 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+ ret <8 x i8> %1
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/dag-combine-invaraints.ll b/test/CodeGen/AArch64/dag-combine-invaraints.ll
index 3614133..ac2d057 100644
--- a/test/CodeGen/AArch64/dag-combine-invaraints.ll
+++ b/test/CodeGen/AArch64/dag-combine-invaraints.ll
@@ -20,7 +20,7 @@ main_:
%DHSelect = select i1 %tmp8, i32 %tmp9, i32 %tmp10
store i32 %DHSelect, i32* %i32X, align 4
%tmp15 = load i32, i32* %i32X, align 4
- %tmp17 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str2, i32 0, i32 0), i32 %tmp15)
+ %tmp17 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str2, i32 0, i32 0), i32 %tmp15)
ret i32 0
; CHECK: main:
diff --git a/test/CodeGen/AArch64/f16-instructions.ll b/test/CodeGen/AArch64/f16-instructions.ll
new file mode 100644
index 0000000..be5e2e5
--- /dev/null
+++ b/test/CodeGen/AArch64/f16-instructions.ll
@@ -0,0 +1,765 @@
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
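+; Scalar half has no native arithmetic here, so each operation is promoted:
+; convert to single precision, operate (or call the float libm routine for
+; frem and the libm-backed intrinsics), then convert back.
+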
+; CHECK-LABEL: test_fadd:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_fadd(half %a, half %b) #0 {
+ %r = fadd half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fsub:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fsub s0, s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_fsub(half %a, half %b) #0 {
+ %r = fsub half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fmul:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fmul s0, s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_fmul(half %a, half %b) #0 {
+ %r = fmul half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fdiv:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fdiv s0, s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_fdiv(half %a, half %b) #0 {
+ %r = fdiv half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_frem:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: bl {{_?}}fmodf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_frem(half %a, half %b) #0 {
+ %r = frem half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_store:
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+define void @test_store(half %a, half* %b) #0 {
+ store half %a, half* %b
+ ret void
+}
+
+; CHECK-LABEL: test_load:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: ret
+define half @test_load(half* %a) #0 {
+ %r = load half, half* %a
+ ret half %r
+}
+
+declare half @test_callee(half %a, half %b) #0
+
+; CHECK-LABEL: test_call:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: bl {{_?}}test_callee
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_call(half %a, half %b) #0 {
+ %r = call half @test_callee(half %a, half %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_call_flipped:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: mov.16b v2, v0
+; CHECK-NEXT: mov.16b v0, v1
+; CHECK-NEXT: mov.16b v1, v2
+; CHECK-NEXT: bl {{_?}}test_callee
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_call_flipped(half %a, half %b) #0 {
+ %r = call half @test_callee(half %b, half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_tailcall_flipped:
+; CHECK-NEXT: mov.16b v2, v0
+; CHECK-NEXT: mov.16b v0, v1
+; CHECK-NEXT: mov.16b v1, v2
+; CHECK-NEXT: b {{_?}}test_callee
+define half @test_tailcall_flipped(half %a, half %b) #0 {
+ %r = tail call half @test_callee(half %b, half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_select:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: fcsel s0, s0, s1, ne
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_select(half %a, half %b, i1 zeroext %c) #0 {
+ %r = select i1 %c, half %a, half %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_select_cc:
+; CHECK-DAG: fcvt s3, h3
+; CHECK-DAG: fcvt s2, h2
+; CHECK-DAG: fcvt s1, h1
+; CHECK-DAG: fcvt s0, h0
+; CHECK-DAG: fcmp s2, s3
+; CHECK-DAG: cset [[CC:w[0-9]+]], ne
+; CHECK-DAG: cmp [[CC]], #0
+; CHECK-NEXT: fcsel s0, s0, s1, ne
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_select_cc(half %a, half %b, half %c, half %d) #0 {
+ %cc = fcmp une half %c, %d
+ %r = select i1 %cc, half %a, half %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fcmp_une:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: ret
+define i1 @test_fcmp_une(half %a, half %b) #0 {
+ %r = fcmp une half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ueq:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: orr [[TRUE:w[0-9]+]], wzr, #0x1
+; CHECK-NEXT: csel [[CC:w[0-9]+]], [[TRUE]], wzr, eq
+; CHECK-NEXT: csel w0, [[TRUE]], [[CC]], vs
+; CHECK-NEXT: ret
+define i1 @test_fcmp_ueq(half %a, half %b) #0 {
+ %r = fcmp ueq half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ugt:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, hi
+; CHECK-NEXT: ret
+define i1 @test_fcmp_ugt(half %a, half %b) #0 {
+ %r = fcmp ugt half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_uge:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, pl
+; CHECK-NEXT: ret
+define i1 @test_fcmp_uge(half %a, half %b) #0 {
+ %r = fcmp uge half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ult:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, lt
+; CHECK-NEXT: ret
+define i1 @test_fcmp_ult(half %a, half %b) #0 {
+ %r = fcmp ult half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ule:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, le
+; CHECK-NEXT: ret
+define i1 @test_fcmp_ule(half %a, half %b) #0 {
+ %r = fcmp ule half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_uno:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, vs
+; CHECK-NEXT: ret
+define i1 @test_fcmp_uno(half %a, half %b) #0 {
+ %r = fcmp uno half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_one:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: orr [[TRUE:w[0-9]+]], wzr, #0x1
+; CHECK-NEXT: csel [[CC:w[0-9]+]], [[TRUE]], wzr, mi
+; CHECK-NEXT: csel w0, [[TRUE]], [[CC]], gt
+; CHECK-NEXT: ret
+define i1 @test_fcmp_one(half %a, half %b) #0 {
+ %r = fcmp one half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_oeq:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+define i1 @test_fcmp_oeq(half %a, half %b) #0 {
+ %r = fcmp oeq half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ogt:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, gt
+; CHECK-NEXT: ret
+define i1 @test_fcmp_ogt(half %a, half %b) #0 {
+ %r = fcmp ogt half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_oge:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, ge
+; CHECK-NEXT: ret
+define i1 @test_fcmp_oge(half %a, half %b) #0 {
+ %r = fcmp oge half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_olt:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, mi
+; CHECK-NEXT: ret
+define i1 @test_fcmp_olt(half %a, half %b) #0 {
+ %r = fcmp olt half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ole:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, ls
+; CHECK-NEXT: ret
+define i1 @test_fcmp_ole(half %a, half %b) #0 {
+ %r = fcmp ole half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ord:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, vc
+; CHECK-NEXT: ret
+define i1 @test_fcmp_ord(half %a, half %b) #0 {
+ %r = fcmp ord half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_br_cc:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: b.mi [[BRCC_ELSE:.?LBB[0-9_]+]]
+; CHECK-NEXT: str wzr, [x0]
+; CHECK-NEXT: ret
+; CHECK-NEXT: [[BRCC_ELSE]]:
+; CHECK-NEXT: str wzr, [x1]
+; CHECK-NEXT: ret
+define void @test_br_cc(half %a, half %b, i32* %p1, i32* %p2) #0 {
+ %c = fcmp uge half %a, %b
+ br i1 %c, label %then, label %else
+then:
+ store i32 0, i32* %p1
+ ret void
+else:
+ store i32 0, i32* %p2
+ ret void
+}
+
+; CHECK-LABEL: test_phi:
+; CHECK: mov x[[PTR:[0-9]+]], x0
+; CHECK: ldr h[[AB:[0-9]+]], [x[[PTR]]]
+; CHECK: [[LOOP:LBB[0-9_]+]]:
+; CHECK: mov.16b v[[R:[0-9]+]], v[[AB]]
+; CHECK: ldr h[[AB]], [x[[PTR]]]
+; CHECK: mov x0, x[[PTR]]
+; CHECK: bl {{_?}}test_dummy
+; CHECK: mov.16b v0, v[[R]]
+; CHECK: ret
+define half @test_phi(half* %p1) #0 {
+entry:
+ %a = load half, half* %p1
+ br label %loop
+loop:
+ %r = phi half [%a, %entry], [%b, %loop]
+ %b = load half, half* %p1
+ %c = call i1 @test_dummy(half* %p1)
+ br i1 %c, label %loop, label %return
+return:
+ ret half %r
+}
+declare i1 @test_dummy(half* %p1) #0
+
+; CHECK-LABEL: test_fptosi_i32:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvtzs w0, s0
+; CHECK-NEXT: ret
+define i32 @test_fptosi_i32(half %a) #0 {
+ %r = fptosi half %a to i32
+ ret i32 %r
+}
+
+; CHECK-LABEL: test_fptosi_i64:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvtzs x0, s0
+; CHECK-NEXT: ret
+define i64 @test_fptosi_i64(half %a) #0 {
+ %r = fptosi half %a to i64
+ ret i64 %r
+}
+
+; CHECK-LABEL: test_fptoui_i32:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvtzu w0, s0
+; CHECK-NEXT: ret
+define i32 @test_fptoui_i32(half %a) #0 {
+ %r = fptoui half %a to i32
+ ret i32 %r
+}
+
+; CHECK-LABEL: test_fptoui_i64:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvtzu x0, s0
+; CHECK-NEXT: ret
+define i64 @test_fptoui_i64(half %a) #0 {
+ %r = fptoui half %a to i64
+ ret i64 %r
+}
+
+; CHECK-LABEL: test_uitofp_i32:
+; CHECK-NEXT: ucvtf s0, w0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_uitofp_i32(i32 %a) #0 {
+ %r = uitofp i32 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_uitofp_i64:
+; CHECK-NEXT: ucvtf s0, x0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_uitofp_i64(i64 %a) #0 {
+ %r = uitofp i64 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_sitofp_i32:
+; CHECK-NEXT: scvtf s0, w0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_sitofp_i32(i32 %a) #0 {
+ %r = sitofp i32 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_sitofp_i64:
+; CHECK-NEXT: scvtf s0, x0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_sitofp_i64(i64 %a) #0 {
+ %r = sitofp i64 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_fptrunc_float:
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_fptrunc_float(float %a) #0 {
+ %r = fptrunc float %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_fptrunc_double:
+; CHECK-NEXT: fcvt h0, d0
+; CHECK-NEXT: ret
+define half @test_fptrunc_double(double %a) #0 {
+ %r = fptrunc double %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_fpext_float:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: ret
+define float @test_fpext_float(half %a) #0 {
+ %r = fpext half %a to float
+ ret float %r
+}
+
+; CHECK-LABEL: test_fpext_double:
+; CHECK-NEXT: fcvt d0, h0
+; CHECK-NEXT: ret
+define double @test_fpext_double(half %a) #0 {
+ %r = fpext half %a to double
+ ret double %r
+}
+
+; CHECK-LABEL: test_bitcast_halftoi16:
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+define i16 @test_bitcast_halftoi16(half %a) #0 {
+ %r = bitcast half %a to i16
+ ret i16 %r
+}
+
+; CHECK-LABEL: test_bitcast_i16tohalf:
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: ret
+define half @test_bitcast_i16tohalf(i16 %a) #0 {
+ %r = bitcast i16 %a to half
+ ret half %r
+}
+
+declare half @llvm.sqrt.f16(half %a) #0
+declare half @llvm.powi.f16(half %a, i32 %b) #0
+declare half @llvm.sin.f16(half %a) #0
+declare half @llvm.cos.f16(half %a) #0
+declare half @llvm.pow.f16(half %a, half %b) #0
+declare half @llvm.exp.f16(half %a) #0
+declare half @llvm.exp2.f16(half %a) #0
+declare half @llvm.log.f16(half %a) #0
+declare half @llvm.log10.f16(half %a) #0
+declare half @llvm.log2.f16(half %a) #0
+declare half @llvm.fma.f16(half %a, half %b, half %c) #0
+declare half @llvm.fabs.f16(half %a) #0
+declare half @llvm.minnum.f16(half %a, half %b) #0
+declare half @llvm.maxnum.f16(half %a, half %b) #0
+declare half @llvm.copysign.f16(half %a, half %b) #0
+declare half @llvm.floor.f16(half %a) #0
+declare half @llvm.ceil.f16(half %a) #0
+declare half @llvm.trunc.f16(half %a) #0
+declare half @llvm.rint.f16(half %a) #0
+declare half @llvm.nearbyint.f16(half %a) #0
+declare half @llvm.round.f16(half %a) #0
+declare half @llvm.fmuladd.f16(half %a, half %b, half %c) #0
+
+; CHECK-LABEL: test_sqrt:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fsqrt s0, s0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_sqrt(half %a) #0 {
+ %r = call half @llvm.sqrt.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_powi:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}__powisf2
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_powi(half %a, i32 %b) #0 {
+ %r = call half @llvm.powi.f16(half %a, i32 %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_sin:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}sinf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_sin(half %a) #0 {
+ %r = call half @llvm.sin.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_cos:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}cosf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_cos(half %a) #0 {
+ %r = call half @llvm.cos.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_pow:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: bl {{_?}}powf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_pow(half %a, half %b) #0 {
+ %r = call half @llvm.pow.f16(half %a, half %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_exp:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}expf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_exp(half %a) #0 {
+ %r = call half @llvm.exp.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_exp2:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}exp2f
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_exp2(half %a) #0 {
+ %r = call half @llvm.exp2.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_log:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}logf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_log(half %a) #0 {
+ %r = call half @llvm.log.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_log10:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}log10f
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_log10(half %a) #0 {
+ %r = call half @llvm.log10.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_log2:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: bl {{_?}}log2f
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_log2(half %a) #0 {
+ %r = call half @llvm.log2.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_fma:
+; CHECK-NEXT: fcvt s2, h2
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fmadd s0, s0, s1, s2
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_fma(half %a, half %b, half %c) #0 {
+ %r = call half @llvm.fma.f16(half %a, half %b, half %c)
+ ret half %r
+}
+
+; CHECK-LABEL: test_fabs:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fabs s0, s0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_fabs(half %a) #0 {
+ %r = call half @llvm.fabs.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_minnum:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: bl {{_?}}fminf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_minnum(half %a, half %b) #0 {
+ %r = call half @llvm.minnum.f16(half %a, half %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_maxnum:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]!
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: bl {{_?}}fmaxf
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ldp x29, x30, [sp], #16
+; CHECK-NEXT: ret
+define half @test_maxnum(half %a, half %b) #0 {
+ %r = call half @llvm.maxnum.f16(half %a, half %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_copysign:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: movi.4s v2, #0x80, lsl #24
+; CHECK-NEXT: bit.16b v0, v1, v2
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_copysign(half %a, half %b) #0 {
+ %r = call half @llvm.copysign.f16(half %a, half %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_floor:
+; CHECK-NEXT: fcvt s1, h0
+; CHECK-NEXT: frintm s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: frintx s1, s1
+; CHECK-NEXT: ret
+define half @test_floor(half %a) #0 {
+ %r = call half @llvm.floor.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_ceil:
+; CHECK-NEXT: fcvt s1, h0
+; CHECK-NEXT: frintp s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: frintx s1, s1
+; CHECK-NEXT: ret
+define half @test_ceil(half %a) #0 {
+ %r = call half @llvm.ceil.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_trunc:
+; CHECK-NEXT: fcvt s1, h0
+; CHECK-NEXT: frintz s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: frintx s1, s1
+; CHECK-NEXT: ret
+define half @test_trunc(half %a) #0 {
+ %r = call half @llvm.trunc.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_rint:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: frintx s0, s0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_rint(half %a) #0 {
+ %r = call half @llvm.rint.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_nearbyint:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: frinti s0, s0
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_nearbyint(half %a) #0 {
+ %r = call half @llvm.nearbyint.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_round:
+; CHECK-NEXT: fcvt s1, h0
+; CHECK-NEXT: frinta s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: frintx s1, s1
+; CHECK-NEXT: ret
+define half @test_round(half %a) #0 {
+ %r = call half @llvm.round.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_fmuladd:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fmul s0, s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fcvt s1, h2
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_fmuladd(half %a, half %b, half %c) #0 {
+ %r = call half @llvm.fmuladd.f16(half %a, half %b, half %c)
+ ret half %r
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/fast-isel-int-ext5.ll b/test/CodeGen/AArch64/fast-isel-int-ext5.ll
new file mode 100644
index 0000000..0f9ec62
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-int-ext5.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -O0 -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-LABEL: int_ext_opt
+define i64 @int_ext_opt(i8* %addr, i1 %c1, i1 %c2) {
+entry:
+ %0 = load i8, i8* %addr
+ br i1 %c1, label %bb1, label %bb2
+
+bb1:
+ %1 = zext i8 %0 to i64
+ br i1 %c2, label %bb2, label %exit
+
+bb2:
+ %2 = phi i64 [1, %entry], [%1, %bb1]
+ ret i64 %2
+
+exit:
+ ret i64 0
+}
diff --git a/test/CodeGen/AArch64/fold-constants.ll b/test/CodeGen/AArch64/fold-constants.ll
new file mode 100644
index 0000000..2dd0d12
--- /dev/null
+++ b/test/CodeGen/AArch64/fold-constants.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
+
+define i64 @dotests_616() {
+; CHECK-LABEL: dotests_616
+; CHECK: movi d0, #0000000000000000
+; CHECK-NEXT: umov w8, v0.b[2]
+; CHECK-NEXT: sbfx w8, w8, #0, #1
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov x0, d0
+; CHECK-NEXT: ret
+entry:
+ %0 = bitcast <2 x i64> zeroinitializer to <8 x i16>
+ %1 = and <8 x i16> zeroinitializer, %0
+ %2 = icmp ne <8 x i16> %1, zeroinitializer
+ %3 = extractelement <8 x i1> %2, i32 2
+ %vgetq_lane285 = sext i1 %3 to i16
+ %vset_lane = insertelement <4 x i16> undef, i16 %vgetq_lane285, i32 0
+ %4 = bitcast <4 x i16> %vset_lane to <1 x i64>
+ %vget_lane = extractelement <1 x i64> %4, i32 0
+ ret i64 %vget_lane
+}
diff --git a/test/CodeGen/AArch64/fp16-instructions.ll b/test/CodeGen/AArch64/fp16-instructions.ll
deleted file mode 100644
index ba96694..0000000
--- a/test/CodeGen/AArch64/fp16-instructions.ll
+++ /dev/null
@@ -1,109 +0,0 @@
-; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
-
-define half @add_h(half %a, half %b) {
-entry:
-; CHECK-LABEL: add_h:
-; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
-; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
-; CHECK: fadd [[RES:s[0-9]+]], [[OP1]], [[OP2]]
-; CHECK: fcvt h0, [[RES]]
- %0 = fadd half %a, %b
- ret half %0
-}
-
-
-define half @sub_h(half %a, half %b) {
-entry:
-; CHECK-LABEL: sub_h:
-; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
-; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
-; CHECK: fsub [[RES:s[0-9]+]], [[OP1]], [[OP2]]
-; CHECK: fcvt h0, [[RES]]
- %0 = fsub half %a, %b
- ret half %0
-}
-
-
-define half @mul_h(half %a, half %b) {
-entry:
-; CHECK-LABEL: mul_h:
-; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
-; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
-; CHECK: fmul [[RES:s[0-9]+]], [[OP1]], [[OP2]]
-; CHECK: fcvt h0, [[RES]]
- %0 = fmul half %a, %b
- ret half %0
-}
-
-
-define half @div_h(half %a, half %b) {
-entry:
-; CHECK-LABEL: div_h:
-; CHECK-DAG: fcvt [[OP1:s[0-9]+]], h0
-; CHECK-DAG: fcvt [[OP2:s[0-9]+]], h1
-; CHECK: fdiv [[RES:s[0-9]+]], [[OP1]], [[OP2]]
-; CHECK: fcvt h0, [[RES]]
- %0 = fdiv half %a, %b
- ret half %0
-}
-
-
-define half @load_h(half* %a) {
-entry:
-; CHECK-LABEL: load_h:
-; CHECK: ldr h0, [x0]
- %0 = load half, half* %a, align 4
- ret half %0
-}
-
-
-define void @store_h(half* %a, half %b) {
-entry:
-; CHECK-LABEL: store_h:
-; CHECK: str h0, [x0]
- store half %b, half* %a, align 4
- ret void
-}
-
-define half @s_to_h(float %a) {
-; CHECK-LABEL: s_to_h:
-; CHECK: fcvt h0, s0
- %1 = fptrunc float %a to half
- ret half %1
-}
-
-define half @d_to_h(double %a) {
-; CHECK-LABEL: d_to_h:
-; CHECK: fcvt h0, d0
- %1 = fptrunc double %a to half
- ret half %1
-}
-
-define float @h_to_s(half %a) {
-; CHECK-LABEL: h_to_s:
-; CHECK: fcvt s0, h0
- %1 = fpext half %a to float
- ret float %1
-}
-
-define double @h_to_d(half %a) {
-; CHECK-LABEL: h_to_d:
-; CHECK: fcvt d0, h0
- %1 = fpext half %a to double
- ret double %1
-}
-
-define half @bitcast_i_to_h(i16 %a) {
-; CHECK-LABEL: bitcast_i_to_h:
-; CHECK: fmov s0, w0
- %1 = bitcast i16 %a to half
- ret half %1
-}
-
-
-define i16 @bitcast_h_to_i(half %a) {
-; CHECK-LABEL: bitcast_h_to_i:
-; CHECK: fmov w0, s0
- %1 = bitcast half %a to i16
- ret i16 %1
-}
diff --git a/test/CodeGen/AArch64/global-merge-1.ll b/test/CodeGen/AArch64/global-merge-1.ll
index b404389..14b0430 100644
--- a/test/CodeGen/AArch64/global-merge-1.ll
+++ b/test/CodeGen/AArch64/global-merge-1.ll
@@ -1,11 +1,11 @@
-; RUN: llc %s -mtriple=aarch64-none-linux-gnu -O3 -enable-global-merge -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-none-linux-gnu -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-none-linux-gnu -aarch64-global-merge -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-none-linux-gnu -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-linux-gnuabi -O3 -enable-global-merge -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-linux-gnuabi -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-apple-ios -O3 -enable-global-merge -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
-; RUN: llc %s -mtriple=aarch64-apple-ios -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
+; RUN: llc %s -mtriple=aarch64-apple-ios -aarch64-global-merge -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
+; RUN: llc %s -mtriple=aarch64-apple-ios -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
@m = internal global i32 0, align 4
@n = internal global i32 0, align 4
diff --git a/test/CodeGen/AArch64/global-merge-2.ll b/test/CodeGen/AArch64/global-merge-2.ll
index d5967b9..af68403 100644
--- a/test/CodeGen/AArch64/global-merge-2.ll
+++ b/test/CodeGen/AArch64/global-merge-2.ll
@@ -1,6 +1,6 @@
-; RUN: llc %s -mtriple=aarch64-none-linux-gnu -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-linux-gnuabi -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-apple-ios -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
+; RUN: llc %s -mtriple=aarch64-none-linux-gnu -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-apple-ios -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS

@x = global i32 0, align 4
@y = global i32 0, align 4
diff --git a/test/CodeGen/AArch64/global-merge-3.ll b/test/CodeGen/AArch64/global-merge-3.ll
index 15035c0..9251083 100644
--- a/test/CodeGen/AArch64/global-merge-3.ll
+++ b/test/CodeGen/AArch64/global-merge-3.ll
@@ -1,6 +1,6 @@
-; RUN: llc %s -mtriple=aarch64-none-linux-gnu -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-linux-gnuabi -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-apple-ios -O3 -enable-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
+; RUN: llc %s -mtriple=aarch64-none-linux-gnu -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-apple-ios -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS

@x = global [1000 x i32] zeroinitializer, align 1
@y = global [1000 x i32] zeroinitializer, align 1
diff --git a/test/CodeGen/AArch64/global-merge-4.ll b/test/CodeGen/AArch64/global-merge-4.ll
index 8fb7747..bc6b68a 100644
--- a/test/CodeGen/AArch64/global-merge-4.ll
+++ b/test/CodeGen/AArch64/global-merge-4.ll
@@ -1,4 +1,4 @@
-; RUN: llc %s -mtriple=aarch64-linux-gnuabi -O3 -enable-global-merge -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -o - | FileCheck %s

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
target triple = "arm64-apple-ios7.0.0"
diff --git a/test/CodeGen/AArch64/merge-store.ll b/test/CodeGen/AArch64/merge-store.ll
new file mode 100644
index 0000000..18dbad4
--- /dev/null
+++ b/test/CodeGen/AArch64/merge-store.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=aarch64 %s -o - | FileCheck %s
+
+@g0 = external global <3 x float>, align 16
+@g1 = external global <3 x float>, align 4
+
+; CHECK: ldr s[[R0:[0-9]+]], {{\[}}[[R1:x[0-9]+]]{{\]}}, #4
+; CHECK: ld1{{\.?s?}} { v[[R0]]{{\.?s?}} }[1], {{\[}}[[R1]]{{\]}}
+; CHECK: str d[[R0]]
+
+define void @blam() {
+ %tmp4 = getelementptr inbounds <3 x float>, <3 x float>* @g1, i64 0, i64 0
+ %tmp5 = load <3 x float>, <3 x float>* @g0, align 16
+ %tmp6 = extractelement <3 x float> %tmp5, i64 0
+ store float %tmp6, float* %tmp4
+ %tmp7 = getelementptr inbounds float, float* %tmp4, i64 1
+ %tmp8 = load <3 x float>, <3 x float>* @g0, align 16
+ %tmp9 = extractelement <3 x float> %tmp8, i64 1
+ store float %tmp9, float* %tmp7
+ ret void
+}
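+
+; A sketch for illustration only (not part of the original test): the two
+; scalar stores in @blam amount to copying the low two lanes of @g0 to @g1
+; with a single 64-bit store, which is the form the `str d` check above
+; corresponds to.
+define void @blam_merged() {
+ %p = bitcast <3 x float>* @g1 to <2 x float>*
+ %v = load <3 x float>, <3 x float>* @g0, align 16
+ %lo = shufflevector <3 x float> %v, <3 x float> undef, <2 x i32> <i32 0, i32 1>
+ store <2 x float> %lo, <2 x float>* %p, align 4
+ ret void
+}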
diff --git a/test/CodeGen/AArch64/print-mrs-system-register.ll b/test/CodeGen/AArch64/print-mrs-system-register.ll
new file mode 100644
index 0000000..3411ed6
--- /dev/null
+++ b/test/CodeGen/AArch64/print-mrs-system-register.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple=arm64-apple-darwin %s -o - | FileCheck %s
+
+; CHECK: mrs x0, CPM_IOACC_CTL_EL3
+
+define void @foo1() #0 {
+entry:
+ tail call void asm sideeffect "mrs x0, cpm_ioacc_ctl_el3", ""()
+ ret void
+}
+
+attributes #0 = { "target-cpu"="cyclone" }
diff --git a/test/CodeGen/AArch64/sibling-call.ll b/test/CodeGen/AArch64/sibling-call.ll
index 34d45d8..a68fdec 100644
--- a/test/CodeGen/AArch64/sibling-call.ll
+++ b/test/CodeGen/AArch64/sibling-call.ll
@@ -75,8 +75,8 @@ define void @caller_to16_from16([8 x i32], i64 %a, i64 %b) {

; CHECK: ldr [[VAL0:x[0-9]+]],
; CHECK: ldr [[VAL1:x[0-9]+]],
-; CHECK: str [[VAL1]],
; CHECK: str [[VAL0]],
+; CHECK: str [[VAL1]],

; CHECK-NOT: add sp, sp,
; CHECK: b callee_stack16
diff --git a/test/CodeGen/AArch64/stackmap-liveness.ll b/test/CodeGen/AArch64/stackmap-liveness.ll
new file mode 100644
index 0000000..6b37aac
--- /dev/null
+++ b/test/CodeGen/AArch64/stackmap-liveness.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -mtriple=aarch64-apple-darwin | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps
+; CHECK-NEXT: __LLVM_StackMaps:
+; Header
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .short 0
+; Num Functions
+; CHECK-NEXT: .long 1
+; Num LargeConstants
+; CHECK-NEXT: .long 0
+; Num Callsites
+; CHECK-NEXT: .long 1
+
+; Functions and stack size
+; CHECK-NEXT: .quad _stackmap_liveness
+; CHECK-NEXT: .quad 16
+
+; Test that the return register is recognized as a live-out.
+define i64 @stackmap_liveness(i1 %c) {
+; CHECK-LABEL: .long L{{.*}}-_stackmap_liveness
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 0
+; Padding
+; CHECK-NEXT: .short 0
+; Num LiveOut Entries: 2
+; CHECK-NEXT: .short 2
+; LiveOut Entry 0: X0
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; LiveOut Entry 1: SP
+; CHECK-NEXT: .short 31
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .byte 8
+; Align
+; CHECK-NEXT: .align 3
+ %1 = select i1 %c, i64 1, i64 2
+ call anyregcc void (i64, i32, i8*, i32, ...) @llvm.experimental.patchpoint.void(i64 1, i32 32, i8* null, i32 0)
+ ret i64 %1
+}
+
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
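+
+; Decoding note, for illustration only: each live-out entry above is encoded
+; as <Dwarf regnum: .short, reserved: .byte, size in bytes: .byte>, so
+; (0, 0, 8) is the 8-byte return value in X0 and (31, 0, 8) is SP.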
+
diff --git a/test/CodeGen/AArch64/tailcall-explicit-sret.ll b/test/CodeGen/AArch64/tailcall-explicit-sret.ll
new file mode 100644
index 0000000..4d80f2a
--- /dev/null
+++ b/test/CodeGen/AArch64/tailcall-explicit-sret.ll
@@ -0,0 +1,106 @@
+; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -asm-verbose=false | FileCheck %s
+; Disable the load/store optimizer to avoid having LDP/STPs and simplify checks.
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; Check that we don't try to tail-call with a non-forwarded sret parameter.
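+; (On AArch64 the sret pointer is passed in x8, a caller-saved temporary, so
+; it cannot be assumed to survive until a tail call; only forwarding our own
+; incoming x8 unchanged is safe.)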
+declare void @test_explicit_sret(i1024* sret) #0
+
+; This is the only OK case, where we forward the explicit sret pointer.
+
+; CHECK-LABEL: _test_tailcall_explicit_sret:
+; CHECK-NEXT: b _test_explicit_sret
+define void @test_tailcall_explicit_sret(i1024* sret %arg) #0 {
+ tail call void @test_explicit_sret(i1024* %arg)
+ ret void
+}
+
+; CHECK-LABEL: _test_call_explicit_sret:
+; CHECK-NOT: mov x8
+; CHECK: bl _test_explicit_sret
+; CHECK: ret
+define void @test_call_explicit_sret(i1024* sret %arg) #0 {
+ call void @test_explicit_sret(i1024* %arg)
+ ret void
+}
+
+; CHECK-LABEL: _test_tailcall_explicit_sret_alloca_unused:
+; CHECK: mov x8, sp
+; CHECK-NEXT: bl _test_explicit_sret
+; CHECK: ret
+define void @test_tailcall_explicit_sret_alloca_unused() #0 {
+ %l = alloca i1024, align 8
+ tail call void @test_explicit_sret(i1024* %l)
+ ret void
+}
+
+; CHECK-LABEL: _test_tailcall_explicit_sret_alloca_dummyusers:
+; CHECK: ldr [[PTRLOAD1:x[0-9]+]], [x0]
+; CHECK: str [[PTRLOAD1]], [sp]
+; CHECK: mov x8, sp
+; CHECK-NEXT: bl _test_explicit_sret
+; CHECK: ret
+define void @test_tailcall_explicit_sret_alloca_dummyusers(i1024* %ptr) #0 {
+ %l = alloca i1024, align 8
+ %r = load i1024, i1024* %ptr, align 8
+ store i1024 %r, i1024* %l, align 8
+ tail call void @test_explicit_sret(i1024* %l)
+ ret void
+}
+
+; This is too conservative, but doesn't really happen in practice.
+
+; CHECK-LABEL: _test_tailcall_explicit_sret_gep:
+; CHECK: add x8, x0, #128
+; CHECK-NEXT: bl _test_explicit_sret
+; CHECK: ret
+define void @test_tailcall_explicit_sret_gep(i1024* %ptr) #0 {
+ %ptr2 = getelementptr i1024, i1024* %ptr, i32 1
+ tail call void @test_explicit_sret(i1024* %ptr2)
+ ret void
+}
+
+; CHECK-LABEL: _test_tailcall_explicit_sret_alloca_returned:
+; CHECK: mov x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK: mov x8, sp
+; CHECK-NEXT: bl _test_explicit_sret
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define i1024 @test_tailcall_explicit_sret_alloca_returned() #0 {
+ %l = alloca i1024, align 8
+ tail call void @test_explicit_sret(i1024* %l)
+ %r = load i1024, i1024* %l, align 8
+ ret i1024 %r
+}
+
+; CHECK-LABEL: _test_indirect_tailcall_explicit_sret_nosret_arg:
+; CHECK-DAG: mov x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK-DAG: mov [[FPTR:x[0-9]+]], x0
+; CHECK: mov x0, sp
+; CHECK-NEXT: blr [[FPTR]]
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define void @test_indirect_tailcall_explicit_sret_nosret_arg(i1024* sret %arg, void (i1024*)* %f) #0 {
+ %l = alloca i1024, align 8
+ tail call void %f(i1024* %l)
+ %r = load i1024, i1024* %l, align 8
+ store i1024 %r, i1024* %arg, align 8
+ ret void
+}
+
+; CHECK-LABEL: _test_indirect_tailcall_explicit_sret_:
+; CHECK: mov x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK: mov x8, sp
+; CHECK-NEXT: blr x0
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define void @test_indirect_tailcall_explicit_sret_(i1024* sret %arg, i1024 ()* %f) #0 {
+ %ret = tail call i1024 %f()
+ store i1024 %ret, i1024* %arg, align 8
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/tailcall-implicit-sret.ll b/test/CodeGen/AArch64/tailcall-implicit-sret.ll
new file mode 100644
index 0000000..5d68059
--- /dev/null
+++ b/test/CodeGen/AArch64/tailcall-implicit-sret.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -asm-verbose=false | FileCheck %s
+; Disable the load/store optimizer to avoid having LDP/STPs and simplify checks.
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; Check that we don't try to tail-call with an sret-demoted return.
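+; (The i1024 result is demoted to a hidden sret pointer that the caller
+; materializes in x8 and must copy out of after the call returns; a plain
+; tail-call branch would skip that copy-out, so `bl` is required.)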
+
+declare i1024 @test_sret() #0
+
+; CHECK-LABEL: _test_call_sret:
+; CHECK: mov x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK: mov x8, sp
+; CHECK-NEXT: bl _test_sret
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define i1024 @test_call_sret() #0 {
+ %a = call i1024 @test_sret()
+ ret i1024 %a
+}
+
+; CHECK-LABEL: _test_tailcall_sret:
+; CHECK: mov x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK: mov x8, sp
+; CHECK-NEXT: bl _test_sret
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define i1024 @test_tailcall_sret() #0 {
+ %a = tail call i1024 @test_sret()
+ ret i1024 %a
+}
+
+; CHECK-LABEL: _test_indirect_tailcall_sret:
+; CHECK: mov x[[CALLERX8NUM:[0-9]+]], x8
+; CHECK: mov x8, sp
+; CHECK-NEXT: blr x0
+; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: str [[CALLERSRET1]], [x[[CALLERX8NUM]]]
+; CHECK: ret
+define i1024 @test_indirect_tailcall_sret(i1024 ()* %f) #0 {
+ %a = tail call i1024 %f()
+ ret i1024 %a
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/tailcall-mem-intrinsics.ll b/test/CodeGen/AArch64/tailcall-mem-intrinsics.ll
new file mode 100644
index 0000000..b970fb1
--- /dev/null
+++ b/test/CodeGen/AArch64/tailcall-mem-intrinsics.ll
@@ -0,0 +1,31 @@
+; RUN: llc -mtriple=aarch64-unknown-unknown < %s | FileCheck %s
+
+; CHECK-LABEL: tail_memcpy:
+; CHECK: b memcpy
+define void @tail_memcpy(i8* nocapture %p, i8* nocapture readonly %q, i32 %n) #0 {
+entry:
+ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* %q, i32 %n, i32 1, i1 false)
+ ret void
+}
+
+; CHECK-LABEL: tail_memmove:
+; CHECK: b memmove
+define void @tail_memmove(i8* nocapture %p, i8* nocapture readonly %q, i32 %n) #0 {
+entry:
+ tail call void @llvm.memmove.p0i8.p0i8.i32(i8* %p, i8* %q, i32 %n, i32 1, i1 false)
+ ret void
+}
+
+; CHECK-LABEL: tail_memset:
+; CHECK: b memset
+define void @tail_memset(i8* nocapture %p, i8 %c, i32 %n) #0 {
+entry:
+ tail call void @llvm.memset.p0i8.i32(i8* %p, i8 %c, i32 %n, i32 1, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #0
+declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #0
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) #0
+
+attributes #0 = { nounwind }
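+
+; A contrast sketch, for illustration only (not part of the original test):
+; without the `tail` marker the call is not a candidate for tail-call
+; lowering, so this variant is expected to emit `bl memcpy` followed by `ret`
+; rather than the `b memcpy` branch checked above.
+define void @notail_memcpy(i8* nocapture %p, i8* nocapture readonly %q, i32 %n) #0 {
+entry:
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* %q, i32 %n, i32 1, i1 false)
+ ret void
+}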
diff --git a/test/CodeGen/AArch64/vcvt-oversize.ll b/test/CodeGen/AArch64/vcvt-oversize.ll
new file mode 100644
index 0000000..066a4b6
--- /dev/null
+++ b/test/CodeGen/AArch64/vcvt-oversize.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+define <8 x i8> @float_to_i8(<8 x float>* %in) {
+; CHECK-LABEL: float_to_i8:
+; CHECK-DAG: fadd v[[LSB:[0-9]+]].4s, v0.4s, v0.4s
+; CHECK-DAG: fadd v[[MSB:[0-9]+]].4s, v1.4s, v1.4s
+; CHECK-DAG: fcvtzu v[[LSB2:[0-9]+]].4s, v[[LSB]].4s
+; CHECK-DAG: fcvtzu v[[MSB2:[0-9]+]].4s, v[[MSB]].4s
+; CHECK-DAG: xtn v[[TMP:[0-9]+]].4h, v[[LSB2]].4s
+; CHECK-DAG: xtn2 v[[TMP]].8h, v[[MSB2]].4s
+; CHECK-DAG: xtn v0.8b, v[[TMP]].8h
+ %l = load <8 x float>, <8 x float>* %in
+ %scale = fmul <8 x float> %l, <float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0>
+ %conv = fptoui <8 x float> %scale to <8 x i8>
+ ret <8 x i8> %conv
+}
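+
+; A sibling sketch, for illustration only (assumption, not part of the
+; original test: the <8 x i16> case legalizes the same way but stops after
+; the first xtn/xtn2 pair, since one narrowing step already reaches the
+; 16-bit element type).
+define <8 x i16> @float_to_i16(<8 x float>* %in) {
+ %l = load <8 x float>, <8 x float>* %in
+ %conv = fptoui <8 x float> %l to <8 x i16>
+ ret <8 x i16> %conv
+}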